- Configuration

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"

import pandas as pd
import numpy as np
import torch
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, top_k_accuracy_score, accuracy_score, confusion_matrix

from datasets import Features, Value, ClassLabel, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import optuna
from optuna.samplers import CmaEsSampler,TPESampler

TEST_RATIO = 0.1 
MASTER_SEED = 42
MODELS = (
    'klue/roberta-base', # 0
    'klue/roberta-large', # 1
    'klue/bert-base', # 2
    'monologg/koelectra-base-v3-discriminator' # 3
    
)
MODEL_ID = 1
MODEL_NAME = MODELS[MODEL_ID]
SENTIMENT_CLASS = ['행복','분노','슬픔','중립']
NUM_LABELS = len(SENTIMENT_CLASS)

model = trainer = None
torch.cuda.is_available()

2024-06-19 17:17:49.418168: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


True

- Load Data, Tokenizer, Model

In [21]:
df = pd.read_csv('../data/sentiment/sentiment_label_df_v2.csv')
df = df[df['labels'].isin(SENTIMENT_CLASS)]

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
data_collator = DataCollatorWithPadding(tokenizer)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
df['labels'].value_counts()

분노    10417
슬픔    10128
중립     7421
행복     7339
Name: labels, dtype: int64

- Prepare data in BERT format

In [6]:
# Define features
emotion_features = Features({'Sentence' : Value(dtype='string'), 'labels' : ClassLabel(names=SENTIMENT_CLASS)})
# Split train and test set
train_df, eval_df = train_test_split(df, test_size = TEST_RATIO, random_state=MASTER_SEED)

# Tokenize dataset
def tokenize(text):
    return tokenizer(text["Sentence"])

train_dataset = Dataset.from_pandas(train_df, features=emotion_features, preserve_index=False).map(tokenize, batched=True)
eval_dataset = Dataset.from_pandas(eval_df, features=emotion_features, preserve_index=False).map(tokenize, batched=True)

Map:   0%|          | 0/31774 [00:00<?, ? examples/s]

Map:   0%|          | 0/3531 [00:00<?, ? examples/s]

- Define a function to compute metrics and declare a trainer instance

In [2]:
def compute_metrics(pred_n_label):
    out, label = pred_n_label
    pred = np.argmax(out, axis=-1)

    # Total Accuracy
    acc = accuracy_score(label, pred)
    # Top 2 Accuracy
    top_k_acc = top_k_accuracy_score(label, out)
    # F1 score (macro)
    f1 = f1_score(label, pred, average="macro")
    # Per-class Accuracy
    cm = confusion_matrix(label, pred)
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    cls_acc = []
    for cl, acc_per_cls in zip(SENTIMENT_CLASS, cm.diagonal()):
        cls_acc.append(acc_per_cls)

    return {
        'accuracy':acc, 'f1':f1,
        'top_k_accuracy':top_k_acc,
        **dict(zip(SENTIMENT_CLASS, cls_acc))
        } 

### Fine Tuning with Pre-training model

In [8]:
args = TrainingArguments(
    output_dir='../Sentiment_Analysis/output/' + MODEL_NAME.split('/')[1] + '_' + datetime.now().strftime("%Y%m%d_%H%M%S"),
    do_train=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=128,
    learning_rate=1e-5,
    weight_decay=0.1,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
    num_train_epochs=5,
    warmup_ratio=0.06,
    save_strategy='epoch',
    seed=MASTER_SEED,
    data_seed=MASTER_SEED,
    remove_unused_columns=True,
    )



- BERT-base
    - bert-base_20240619_115423 / checkpoint-9930

In [7]:
trainer = Trainer(model=model, 
                  tokenizer=tokenizer, 
                  args=args, 
                  data_collator=data_collator, 
                  train_dataset=train_dataset, 
                  eval_dataset=eval_dataset, 
                  compute_metrics=compute_metrics)
trainer.train()
trainer.evaluate()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Top K Accuracy,행복,분노,슬픔,중립
1,0.3137,0.283246,0.885868,0.898096,0.985273,0.931526,0.83112,0.832502,0.988357
2,0.2413,0.26907,0.898046,0.907358,0.985556,0.952924,0.8463,0.838485,0.996119
3,0.196,0.302093,0.897479,0.907378,0.989238,0.941512,0.843454,0.854437,0.987063
4,0.1425,0.335035,0.903144,0.912588,0.988105,0.948645,0.85389,0.851446,0.996119
5,0.1179,0.389441,0.90286,0.91219,0.986973,0.951498,0.842505,0.860419,0.996119




{'eval_loss': 0.3894410729408264,
 'eval_accuracy': 0.9028603794958935,
 'eval_f1': 0.9121904075828369,
 'eval_top_k_accuracy': 0.9869725290286038,
 'eval_행복': 0.9514978601997147,
 'eval_분노': 0.8425047438330171,
 'eval_슬픔': 0.8604187437686939,
 'eval_중립': 0.9961190168175937,
 'eval_runtime': 4.1276,
 'eval_samples_per_second': 855.458,
 'eval_steps_per_second': 1.696,
 'epoch': 5.0}

In [6]:
trainer = Trainer(model=model, 
                  tokenizer=tokenizer, 
                  args=args, 
                  data_collator=data_collator, 
                  train_dataset=train_dataset, 
                  eval_dataset=eval_dataset, 
                  compute_metrics=compute_metrics)
trainer.train()
trainer.evaluate()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Top K Accuracy,행복,분노,슬픔,중립
1,0.3341,0.320022,0.88785,0.899955,0.985556,0.927247,0.829222,0.841476,0.992238
2,0.261,0.302943,0.900312,0.909663,0.986973,0.944365,0.851992,0.847458,0.994825
3,0.2308,0.402959,0.901728,0.91124,0.988389,0.944365,0.837761,0.869392,0.992238
4,0.196,0.45751,0.904843,0.913506,0.986406,0.942939,0.849146,0.865404,0.997413
5,0.117,0.512838,0.904843,0.91372,0.986689,0.947218,0.854839,0.856431,0.997413




{'eval_loss': 0.512837827205658,
 'eval_accuracy': 0.9048428207306712,
 'eval_f1': 0.9137204108794539,
 'eval_top_k_accuracy': 0.9866893231379212,
 'eval_행복': 0.9472182596291013,
 'eval_분노': 0.8548387096774194,
 'eval_슬픔': 0.8564307078763709,
 'eval_중립': 0.9974126778783958,
 'eval_runtime': 6.0728,
 'eval_samples_per_second': 581.442,
 'eval_steps_per_second': 2.305,
 'epoch': 5.0}

- Koelectra
    - koelectra-base-v3-discriminator_20240618_141343/checkpoint-19860

In [20]:
trainer = Trainer(model=model, 
                  tokenizer=tokenizer, 
                  args=args, 
                  data_collator=data_collator, 
                  train_dataset=train_dataset, 
                  eval_dataset=eval_dataset, 
                  compute_metrics=compute_metrics)
trainer.train()
trainer.evaluate()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Top K Accuracy,행복,분노,슬픔,중립
1,0.3403,0.340684,0.881337,0.893789,0.983008,0.921541,0.8074,0.852443,0.983182
2,0.2778,0.301789,0.900595,0.910317,0.988389,0.954351,0.836812,0.858425,0.993532
3,0.2546,0.380444,0.901728,0.911647,0.987539,0.945792,0.875712,0.828514,0.992238
4,0.2326,0.428549,0.90371,0.912856,0.987822,0.951498,0.829222,0.878365,0.994825
5,0.1492,0.471694,0.902577,0.912333,0.987539,0.951498,0.839658,0.86341,0.994825




{'eval_loss': 0.4716937839984894,
 'eval_accuracy': 0.902577173605211,
 'eval_f1': 0.9123329793448183,
 'eval_top_k_accuracy': 0.9875389408099688,
 'eval_행복': 0.9514978601997147,
 'eval_분노': 0.8396584440227703,
 'eval_슬픔': 0.8634097706879362,
 'eval_중립': 0.9948253557567918,
 'eval_runtime': 6.3286,
 'eval_samples_per_second': 557.947,
 'eval_steps_per_second': 2.212,
 'epoch': 5.0}

- RoBERTa-base
    - roberta-base_20240619_103559 / checkpoint-9930

In [18]:
trainer = Trainer(model=model, 
                  tokenizer=tokenizer, 
                  args=args, 
                  data_collator=data_collator, 
                  train_dataset=train_dataset, 
                  eval_dataset=eval_dataset, 
                  compute_metrics=compute_metrics)
trainer.train()
trainer.evaluate()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Top K Accuracy,행복,분노,슬픔,중립
1,0.3158,0.294761,0.890116,0.900991,0.985273,0.928673,0.857685,0.831505,0.97542
2,0.2499,0.285452,0.895214,0.905874,0.990088,0.955777,0.781784,0.898305,0.990944
3,0.2003,0.306372,0.902577,0.911655,0.987539,0.928673,0.885199,0.832502,0.993532
4,0.1525,0.38618,0.901444,0.910688,0.988389,0.947218,0.86148,0.839482,0.994825
5,0.119,0.429234,0.900595,0.909847,0.988105,0.944365,0.834915,0.870389,0.989651




{'eval_loss': 0.4292335510253906,
 'eval_accuracy': 0.9005947323704333,
 'eval_f1': 0.9098473969503202,
 'eval_top_k_accuracy': 0.9881053525913339,
 'eval_행복': 0.9443651925820257,
 'eval_분노': 0.8349146110056926,
 'eval_슬픔': 0.8703888334995015,
 'eval_중립': 0.9896507115135834,
 'eval_runtime': 4.0881,
 'eval_samples_per_second': 863.73,
 'eval_steps_per_second': 1.712,
 'epoch': 5.0}

- RoBERTa-large
    - roberta-large_20240417_122910 / checkpoint-9930

In [28]:
trainer = Trainer(model=model, 
                  tokenizer=tokenizer, 
                  args=args, 
                  data_collator=data_collator, 
                  train_dataset=train_dataset, 
                  eval_dataset=eval_dataset, 
                  compute_metrics=compute_metrics)
trainer.train()
trainer.evaluate()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Top K Accuracy,행복,분노,슬픔,중립
1,0.3035,0.301868,0.888983,0.898662,0.986689,0.961484,0.910816,0.732802,0.996119
2,0.2386,0.272995,0.909657,0.918584,0.989238,0.951498,0.85389,0.874377,0.993532
3,0.1515,0.365211,0.908241,0.917381,0.989238,0.952924,0.835863,0.886341,0.994825
4,0.1003,0.429691,0.909941,0.918793,0.989805,0.952924,0.851044,0.877368,0.993532
5,0.0697,0.515261,0.909941,0.918737,0.987539,0.952924,0.858634,0.869392,0.993532




{'eval_loss': 0.5152614116668701,
 'eval_accuracy': 0.9099405267629567,
 'eval_f1': 0.9187373072319894,
 'eval_top_k_accuracy': 0.9875389408099688,
 'eval_행복': 0.9529243937232525,
 'eval_분노': 0.8586337760910816,
 'eval_슬픔': 0.8693918245264207,
 'eval_중립': 0.9935316946959897,
 'eval_runtime': 10.3687,
 'eval_samples_per_second': 340.545,
 'eval_steps_per_second': 0.675,
 'epoch': 5.0}

### Optuna parameter tuning

In [7]:
def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1e-5, step=1e-8)
    }
    args = TrainingArguments(
        output_dir='../Sentiment_Analysis/tuning_output/' + MODEL_NAME.split('/')[1] + '_' + datetime.now().strftime("%Y%m%d_%H%M%S"),
        do_train=True,
        evaluation_strategy='epoch',
        per_device_train_batch_size=4,
        per_device_eval_batch_size=128,
        num_train_epochs=5,
        learning_rate=params['learning_rate'],
        weight_decay=0.1,
        adam_beta1=0.9,
        adam_beta2=0.98,
        adam_epsilon=1e-6,
        warmup_ratio=0.06,
        save_strategy='epoch',
        seed=MASTER_SEED,
        data_seed=MASTER_SEED,
        remove_unused_columns=True,
    )
    trainer = Trainer(model=model, tokenizer=tokenizer, args=args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics)
    trainer.train()
    result = trainer.evaluate()
    return result['eval_accuracy'] 

- Parameter Tuning - TPES Sampler(범주형, 실수형 변수 적절)

In [None]:
# Running Optuna Optimization
num_trials = 10
sampler = TPESampler(seed=MASTER_SEED)
study = optuna.create_study(direction='maximize', sampler=sampler) 
study.optimize(objective, n_trials=num_trials)

# Get the best hyperparameters
best_params = study.best_params
print("Best Parameters : ", best_params)

# Save Best Parameters for text file
file_path = '../Sentiment_Analysis/tuning_output/' + MODEL_NAME.split('/')[1] + '_' + 'tpes_optuna_best_params_' + datetime.now().strftime("%Y%m%d_%H%M%S") + 'txt',
with open(file_path, 'w') as file:
    file.write("Best Parameters:\n")
    for key, value in best_params.items():
        file.write(f"{key}: {value}\n")

- Best Params으로 Train
    - finetune_roberta-large_20240417_173155/ checkpoint-9930

In [22]:
training_args = TrainingArguments(
    output_dir='../Sentiment_Analysis/tuning_output/finetune_' + MODEL_NAME.split('/')[1] + '_' + datetime.now().strftime("%Y%m%d_%H%M%S"),
    do_train=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=128,
    num_train_epochs=5,
    learning_rate=best_params['learning_rate'],
    weight_decay=0.1,
    adam_beta1 = 0.9,
    adam_beta2 = 0.98,
    adam_epsilon = 1e-6,
    warmup_ratio=0.06,
    save_strategy='epoch',
    seed=MASTER_SEED,
    data_seed=MASTER_SEED,
    remove_unused_columns=True,
)
trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics)
trainer.train()
trainer.evaluate()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Top K Accuracy,행복,분노,슬픔,중립
1,0.0,1.482681,0.90371,0.912995,0.964599,0.951498,0.844402,0.865404,0.990944
2,0.0,1.480979,0.903993,0.91337,0.964033,0.951498,0.845351,0.865404,0.990944
3,0.0,1.480698,0.90371,0.913126,0.964033,0.951498,0.845351,0.864407,0.990944
4,0.0,1.48052,0.903993,0.913298,0.964316,0.951498,0.848197,0.862413,0.990944
5,0.0,1.482241,0.90371,0.913059,0.96375,0.951498,0.845351,0.864407,0.990944




{'eval_loss': 1.4822412729263306,
 'eval_accuracy': 0.9037099971679411,
 'eval_f1': 0.9130585861213216,
 'eval_top_k_accuracy': 0.9637496459926367,
 'eval_행복': 0.9514978601997147,
 'eval_분노': 0.8453510436432637,
 'eval_슬픔': 0.864406779661017,
 'eval_중립': 0.9909443725743855,
 'eval_runtime': 3.715,
 'eval_samples_per_second': 950.481,
 'eval_steps_per_second': 1.884,
 'epoch': 5.0}

### Fine Tuning with Further Pre-training model

In [27]:
# Load Youtube Label Data
youtube_label_df = pd.read_csv('../data/youtube_process/label_youtube_df.csv')[['clean_text','clean_sentiment']][:5000]
youtube_label_df.rename(columns={'clean_text':'Sentence', 'clean_sentiment':'labels'},inplace=True)
aihub_youtube_df = pd.concat([df,youtube_label_df])
aihub_youtube_df['labels'].value_counts()

분노    12487
슬픔    12220
행복     7895
중립     7703
Name: labels, dtype: int64

In [28]:
# load best pretraining model(roberta-large-tpes_tuning)
pretrain_model_path = '../Sentiment_Analysis/tuning_output/finetune_roberta-large_20240417_173155/checkpoint-9930'
model = AutoModelForSequenceClassification.from_pretrained(pretrain_model_path,device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(pretrain_model_path)
data_collator = DataCollatorWithPadding(tokenizer)

In [29]:
# Define features
emotion_features = Features({'Sentence' : Value(dtype='string'), 'labels' : ClassLabel(names=SENTIMENT_CLASS)})
# Split train and test set
train_df, eval_df = train_test_split(aihub_youtube_df, test_size = TEST_RATIO, random_state=MASTER_SEED)
# Tokenize dataset
def tokenize(text):
    return tokenizer(text["Sentence"])
train_dataset = Dataset.from_pandas(train_df, features=emotion_features, preserve_index=False).map(tokenize, batched=True)
eval_dataset = Dataset.from_pandas(eval_df, features=emotion_features, preserve_index=False).map(tokenize, batched=True)

Map:   0%|          | 0/36274 [00:00<?, ? examples/s]

Map:   0%|          | 0/4031 [00:00<?, ? examples/s]

In [30]:
args = TrainingArguments(
    output_dir='../Sentiment_Analysis/output/further_pretrain_robert-large_' + datetime.now().strftime("%Y%m%d_%H%M%S"),
    do_train=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=128,
    learning_rate=1e-5,
    weight_decay=0.1,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
    num_train_epochs=5,
    warmup_ratio=0.06,
    save_strategy='epoch',
    seed=MASTER_SEED,
    data_seed=MASTER_SEED,
    remove_unused_columns=True,
    )



In [31]:
trainer = Trainer(model=model, 
                  tokenizer=tokenizer, 
                  args=args, 
                  data_collator=data_collator, 
                  train_dataset=train_dataset, 
                  eval_dataset=eval_dataset, 
                  compute_metrics=compute_metrics)
trainer.train()
trainer.evaluate()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Top K Accuracy,행복,분노,슬픔,중립
1,0.3808,0.515233,0.876209,0.89284,0.975936,0.908854,0.91875,0.756275,0.967914
2,0.3065,0.499468,0.903746,0.916485,0.978913,0.916667,0.923438,0.836437,0.967914
3,0.2048,0.55282,0.905234,0.916734,0.978417,0.919271,0.897656,0.861538,0.975936
4,0.1337,0.682484,0.909452,0.919912,0.976681,0.932292,0.895312,0.869636,0.975936
5,0.0348,0.78242,0.913173,0.922544,0.974944,0.928385,0.894531,0.882591,0.979947


{'eval_loss': 0.78242027759552,
 'eval_accuracy': 0.9131729099479038,
 'eval_f1': 0.9225442614054467,
 'eval_top_k_accuracy': 0.9749441825849665,
 'eval_행복': 0.9283854166666666,
 'eval_분노': 0.89453125,
 'eval_슬픔': 0.8825910931174089,
 'eval_중립': 0.9799465240641712,
 'eval_runtime': 45.8745,
 'eval_samples_per_second': 87.87,
 'eval_steps_per_second': 0.698,
 'epoch': 5.0}