# 패키지 및 dependency 설치

In [None]:
!git clone https://github.com/e9t/nsmc

Cloning into 'nsmc'...
remote: Enumerating objects: 14763, done.[K
remote: Total 14763 (delta 0), reused 0 (delta 0), pack-reused 14763[K
Receiving objects: 100% (14763/14763), 56.19 MiB | 24.39 MiB/s, done.
Resolving deltas: 100% (1749/1749), done.
Updating files: 100% (14737/14737), done.


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m77.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

# Import

In [None]:
import pandas as pd
from transformers import BertTokenizer
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

# 하이퍼파라미터

In [None]:
# Hyperparameters
# The seed is applied right away: previously it was only declared, so the
# randomly initialised classifier head and the shuffled batch order changed
# on every run.
seed = 77777
torch.manual_seed(seed)

batch_size = 32        # samples per optimisation step
epochs = 5             # passes over the training set
learning_rate = 2e-5   # learning rate handed to the optimizer
warmup_steps = 100     # scheduler steps spent warming up
max_seq_length = 128   # every input is padded / truncated to this many tokens

# 데이터셋

In [None]:
class NSMCDataset(torch.utils.data.Dataset):
    """Map-style dataset over a tab-separated ratings file, tokenised on access.

    The file must provide a ``document`` column (text) and a ``label`` column.
    Rows in which either value is missing are removed when the dataset is
    built, so ``len()`` counts usable samples only and every index maps to its
    own row.

    Args:
        file_path: Path of the tab-separated file to read.
        tokenizer: Object exposing ``encode_plus`` that returns ``input_ids``
            and ``attention_mask`` tensors.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, file_path, tokenizer, max_length):
        frame = pd.read_csv(file_path, sep='\t')
        # Drop incomplete rows once, up front. Substituting the neighbouring
        # sample at access time duplicated examples (skewing loss and accuracy)
        # and recursed without end when no valid row existed.
        self.data = frame.dropna(subset=['document', 'label']).reset_index(drop=True)
        # Force text to str and labels to int so a numeric-looking document or
        # a float-typed label column cannot reach the tokenizer / loss.
        self.sentences = self.data['document'].astype(str).tolist()
        self.labels = self.data['label'].astype(int).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of usable (non-null) samples."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Tokenise sample ``idx`` and return its tensors together with the label."""
        encoded_input = self.tokenizer.encode_plus(
            text=self.sentences[idx],
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # squeeze(0) removes only the leading batch axis; a bare squeeze()
        # would also collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoded_input['input_ids'].squeeze(0),
            'attention_mask': encoded_input['attention_mask'].squeeze(0),
            'label': self.labels[idx]
        }

# Load the tokenizer, then wrap the NSMC train and test files (in that order)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
train_dataset, test_dataset = (
    NSMCDataset(ratings_path, tokenizer, max_length=max_seq_length)
    for ratings_path in ("/content/nsmc/ratings_train.txt", "/content/nsmc/ratings_test.txt")
)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
# Batch iterators: training batches are reshuffled, test batches keep file order
train_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=reshuffle)
    for split, reshuffle in ((train_dataset, True), (test_dataset, False))
)

# 학습

In [None]:
# Model setup
# Pick the device explicitly so the cell also runs (slowly) without a GPU,
# matching how the AI HUB experiment further down handles it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
model.to(device)

# Optimizer and scheduler
# NOTE(review): transformers' AdamW is flagged as deprecated upstream in favour
# of torch.optim.AdamW; left unchanged so results stay comparable - confirm before swapping.
optimizer = AdamW(model.parameters(), lr=learning_rate)
# scheduler.step() runs once per batch, so the schedule spans
# batches-per-epoch * epochs steps. Flooring len(dataset) * epochs / batch_size
# under-counts whenever the last batch of an epoch is partial.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)


# Training
# Deliberately no try/except around the loop: a bare `except:` used to swallow
# every failure (including KeyboardInterrupt), print the batch and move on to
# the next epoch, hiding the real error.
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Average Loss: {avg_loss:.4f}")

# Evaluation
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)

        total_correct += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
print(f"Test Accuracy: {accuracy:.4f}")


Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Epoch 1/5 - Average Loss: 0.3819
Epoch 2/5 - Average Loss: 0.2847
Epoch 3/5 - Average Loss: 0.2270
Epoch 4/5 - Average Loss: 0.1787
Epoch 5/5 - Average Loss: 0.1449
Test Accuracy: 0.8726


# AI HUB Dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Read the three cp949-encoded AI HUB CSV exports and stack them row-wise.
# NOTE(review): the files live under /content/drive, which presumably requires
# Google Drive to be mounted first - no mount cell is visible in this notebook.
aihub_dir = '/content/drive/MyDrive/감정데이터셋/감정 분류를 위한 대화 음성 데이터셋'
df = pd.read_csv(f'{aihub_dir}/4차년도.csv', encoding='cp949')
df2 = pd.read_csv(f'{aihub_dir}/5차년도.csv', encoding='cp949')
df3 = pd.read_csv(f'{aihub_dir}/5차년도_2차.csv', encoding='cp949')
# ignore_index gives the combined frame one unique 0..n-1 index instead of
# three overlapping per-file ranges, so label-based lookups stay unambiguous.
df = pd.concat([df, df2, df3], axis=0, ignore_index=True)

In [None]:
# Fold the two alternate label spellings into the form used by the other rows
for alias, canonical in (('sadness', 'sad'), ('anger', 'angry')):
    df['상황'] = np.where(df['상황'] == alias, canonical, df['상황'])

In [None]:
# Learn the label vocabulary first, then replace each label string by its integer code
le = LabelEncoder().fit(df['상황'])
df['상황'] = le.transform(df['상황'])

In [None]:
# Keep only the utterance text and its encoded label, under the column names the dataset class reads
new_df = df[['발화문', '상황']].rename(columns={'발화문': 'document', '상황': 'label'})

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% for testing; reuse the notebook-wide `seed` hyperparameter
# rather than repeating its literal value here.
train, test = train_test_split(new_df, test_size=0.1, random_state=seed)

In [None]:
class AIHUBDataset(torch.utils.data.Dataset):
    """Map-style dataset over an in-memory DataFrame, tokenised on access.

    ``data`` must provide a ``document`` column (text) and a ``label`` column.
    Rows in which either value is missing are removed when the dataset is
    built, so ``len()`` counts usable samples only and every index maps to its
    own row. The frame passed in is not modified.

    Args:
        data: DataFrame holding the ``document`` and ``label`` columns.
        tokenizer: Object exposing ``encode_plus`` that returns ``input_ids``
            and ``attention_mask`` tensors.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        # Drop incomplete rows once, up front. Substituting the neighbouring
        # sample at access time duplicated examples (skewing loss and metrics)
        # and recursed without end when no valid row existed.
        self.data = data.dropna(subset=['document', 'label']).reset_index(drop=True)
        # Force text to str and labels to int so a numeric-looking document or
        # a float-typed label column cannot reach the tokenizer / loss.
        self.sentences = self.data['document'].astype(str).tolist()
        self.labels = self.data['label'].astype(int).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of usable (non-null) samples."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Tokenise sample ``idx`` and return its tensors together with the label."""
        encoded_input = self.tokenizer.encode_plus(
            text=self.sentences[idx],
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # squeeze(0) removes only the leading batch axis; a bare squeeze()
        # would also collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoded_input['input_ids'].squeeze(0),
            'attention_mask': encoded_input['attention_mask'].squeeze(0),
            'label': self.labels[idx]
        }
# 데이터 로드 및 전처리
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
train_dataset = AIHUBDataset(train, tokenizer, max_length=max_seq_length)
test_dataset = AIHUBDataset(test, tokenizer, max_length=max_seq_length)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
# Batch iterators: shuffle only the training split
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Model setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Size the classification head from the fitted label encoder instead of a
# hard-coded 7, so it always matches the labels actually present in the data.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(le.classes_))
model.to(device)

# Optimizer and scheduler
# NOTE(review): transformers' AdamW is flagged as deprecated upstream in favour
# of torch.optim.AdamW; left unchanged so results stay comparable - confirm before swapping.
optimizer = AdamW(model.parameters(), lr=learning_rate)
# scheduler.step() runs once per batch, so the schedule spans
# batches-per-epoch * epochs steps. Flooring len(dataset) * epochs / batch_size
# under-counts whenever the last batch of an epoch is partial.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)


# Training
# Deliberately no try/except around the loop: a bare `except:` used to swallow
# every failure (including KeyboardInterrupt), print the batch and move on to
# the next epoch, hiding the real error.
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Average Loss: {avg_loss:.4f}")

# Evaluation
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)

        total_correct += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
print(f"Test Accuracy: {accuracy:.4f}")


Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual

Epoch 1/5 - Average Loss: 0.6577
Epoch 2/5 - Average Loss: 0.3025
Epoch 3/5 - Average Loss: 0.2185
Epoch 4/5 - Average Loss: 0.1626
Epoch 5/5 - Average Loss: 0.1231
Test Accuracy: 0.9057


In [None]:
from sklearn.metrics import f1_score

# Collect gold and predicted class ids over the whole test loader, then score them
model.eval()
true_labels, predicted_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        labels = batch['label'].to(device)
        predicted = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits.argmax(dim=1)

        true_labels += labels.tolist()
        predicted_labels += predicted.tolist()

# Macro averaging: per-class F1 scores are averaged with equal weight
f1 = f1_score(true_labels, predicted_labels, average='macro')
print(f"F1 Score: {f1:.4f}")

F1 Score: 0.8989


In [None]:
# Classify one hand-written sentence with the fine-tuned model
sent =  '깜짝 놀라서 뒤지는 줄 알았네'
d=tokenizer.encode_plus(sent, padding='max_length', max_length=max_seq_length, truncation=True, return_tensors='pt')
input_ids = d['input_ids'].to(device)
attention_mask = d['attention_mask'].to(device)

# Make the cell safe to run on its own: force eval mode (disables dropout) and
# skip autograd bookkeeping, which inference does not need.
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
logits = outputs.logits
# Apply softmax to the logits
probs = torch.softmax(logits, dim=1)
predicted_label = torch.argmax(probs, dim=1)

print(predicted_label.item())

1


In [None]:
# Map the predicted class id back to its label name. inverse_transform expects
# a 1-D array; the former (n, 1) column vector triggered scikit-learn's
# column_or_1d DataConversionWarning.
label_ids = np.array([predicted_label.item()])
print(label_ids)
le.inverse_transform(label_ids)

[[1]]


  y = column_or_1d(y, warn=True)


array(['disgust'], dtype=object)

In [None]:
# Persist the fine-tuned weights; test accuracy and F1 are embedded in the file name
checkpoint_path = f'/content/drive/MyDrive/AIHUB_finetuing_{accuracy:.4f}_{f1:.4f}.pth'
weights = model.state_dict()
torch.save(weights, checkpoint_path)