In [None]:
!pip install transformers[sentencepiece]

In [None]:
!pip install datasets

In [None]:
import numpy as np
import pandas as pd 
import re

import torch
from torch.utils.data import DataLoader

from tqdm.auto import tqdm

from transformers import AdamW, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, get_scheduler, TrainingArguments, Trainer
from datasets import Dataset

from sklearn.model_selection import KFold

from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
# Read the two labelled CSV files (two-class and three-class, going by their
# file names) from the working directory into DataFrames.
data_2, data_3 = (
    pd.read_csv(csv_path) for csv_path in ('two_class.csv', 'three_class.csv')
)

In [None]:
# Re-encode the raw `emotion` codes as class ids starting at 0, then keep only
# the two columns used downstream, renamed to `text` and `labels`.
# Codes that are missing from a mapping become NaN (pandas `Series.map`).
column_names = {'emo': 'labels', 'context': 'text'}

data_2 = (
    data_2
    .assign(emo=data_2['emotion'].map({-1: 0, 1: 1}))
    .loc[:, ['context', 'emo']]
    .rename(columns=column_names)
)
data_3 = (
    data_3
    .assign(emo=data_3['emotion'].map({-1: 0, 0: 1, 1: 2}))
    .loc[:, ['context', 'emo']]
    .rename(columns=column_names)
)

In [None]:
# Preview the first rows of the binary frame after re-encoding
# (columns `text` and `labels`).
data_2.head()

Unnamed: 0,text,labels
0,肖战 期待 冬奥 赛场 抹 中国 红 加油 加油,1
1,北京 冬奥会 闭幕式 期待 下次 冬奥,1
2,冬奥 开幕式 骂 偷国 选手 想 美 疫情 思考 热带 国家 震撼 中国 魂 狠...,0
3,今年冬天 恨不能 国内 想 环球 影城 想 yyqx 电影 更想 冬奥 疫...,0
4,冬奥 黑 那下 一届 米兰 极有 牌 不到,0


# Binary Classification

In [None]:
# Build the 5 cross-validation folds for the binary task.
# The split is unshuffled, so folds are contiguous slices of `data_2`.
kf = KFold(shuffle=False, n_splits=5)

# fold number -> [train Dataset, test Dataset]
use_data = dict()

for num, (train_index, test_index) in enumerate(kf.split(data_2)):
    trainset, testset = (
        Dataset.from_pandas(data_2.iloc[rows]) for rows in (train_index, test_index)
    )
    use_data[num] = [trainset, testset]

# Leave `num` equal to the number of folds stored.
num = len(use_data)



In [None]:
# Inspection only: `train_index` still holds the training row positions from
# the last iteration of the most recently executed KFold split loop.
train_index

In [None]:
checkpoint = "bert-base-chinese"  # the model has no maximum length parameter to pad with

# The maximum length is therefore set explicitly to 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=512)


def tokenize_function(example):
    """Tokenize the ``text`` field of a batch, truncating over-long inputs."""
    texts = example["text"]
    return tokenizer(texts, truncation=True)


# Only fold 4 (the last one) is tokenized here; the index is hard-coded.
fold_train, fold_dev = use_data[4]

# The raw text and the pandas index column are dropped once the inputs are tokenized.
drop_cols = ["text", "__index_level_0__"]

tokenized_trainset = fold_train.map(tokenize_function, batched=True).remove_columns(drop_cols)
tokenized_devset = fold_dev.map(tokenize_function, batched=True).remove_columns(drop_cols)

In [None]:
# Show the tokenized training split's summary (feature names and row count).
tokenized_trainset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1859
})

In [None]:
# Seed PyTorch before loading: the classification head placed on top of the
# pretrained encoder is newly initialised, so without a fixed seed its starting
# weights (and therefore the fold's scores) change from run to run.
torch.manual_seed(42)

# Pretrained encoder plus a 2-way classification head for the binary task.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# Training configuration for the binary run: 3 epochs at a learning rate of
# 3e-6, evaluating and saving a checkpoint after every epoch, logging every
# 200 steps, with external experiment reporting disabled.
args = TrainingArguments(
    output_dir='bert_1',
    report_to="none",
    learning_rate=3e-6,
    num_train_epochs=3,
    logging_steps=200,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Wire the model, the configuration and the tokenized fold together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_trainset,
    eval_dataset=tokenized_devset,
    tokenizer=tokenizer,
)

In [None]:
# Fine-tune on the selected fold; evaluation and checkpointing run once per
# epoch, as configured in the TrainingArguments above.
trainer.train()

In [None]:
# Per-fold score collectors; the evaluation cell appends one value to each.
acc_lst, f1_m_lst = [], []

In [None]:
# Evaluation
#
# Score the fine-tuned model on the held-out fold, print the metrics, and
# record the fold's accuracy and macro-F1 in the running lists.

predictions = trainer.predict(tokenized_devset)
y_pred = np.argmax(predictions.predictions, axis=1)  # highest-scoring class per example
y_true = predictions.label_ids

# Compute each metric once and reuse it for both printing and recording.
accuracy = np.sum(y_true == y_pred) / len(y_true)
f1_binary = f1_score(y_true, y_pred)  # default average='binary': F1 of label 1 only
f1_macro = f1_score(y_true, y_pred, average='macro')  # unweighted mean over both classes

print('acc:', accuracy)
print("f1_score", f1_binary)
print("f1_score_macro", f1_macro)

acc_lst.append(accuracy)
# f1_m_lst tracks macro-F1 (matching the 3-class section), so append the macro
# score rather than the positive-class F1.
f1_m_lst.append(f1_macro)


***** Running Prediction *****
  Num examples = 464
  Batch size = 8


acc: 0.9202586206896551
f1_score 0.9467625899280576
f1_score_macro 0.8939821533331276


In [None]:
# Show the per-fold scores collected so far, one list per line.
print(acc_lst, f1_m_lst, sep="\n")

In [None]:
# Average each score list over the folds collected so far.
for fold_scores in (acc_lst, f1_m_lst):
    print(np.mean(fold_scores))

# 3-class Classification

In [None]:
# Build the 5 cross-validation folds for the 3-class task.
# The split is unshuffled, so folds are contiguous slices of `data_3`.
kf = KFold(shuffle=False, n_splits=5)

# fold number -> [train Dataset, test Dataset]
use_data = dict()

for num, (train_index, test_index) in enumerate(kf.split(data_3)):
    trainset, testset = (
        Dataset.from_pandas(data_3.iloc[rows]) for rows in (train_index, test_index)
    )
    use_data[num] = [trainset, testset]

# Leave `num` equal to the number of folds stored.
num = len(use_data)


In [None]:
checkpoint = "bert-base-chinese"  # the model has no maximum length parameter to pad with

# The maximum length is therefore set explicitly to 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=512)


def tokenize_function(example):
    """Tokenize the ``text`` field of a batch, truncating over-long inputs."""
    texts = example["text"]
    return tokenizer(texts, truncation=True)


# Only fold 4 (the last one) is tokenized here; the index is hard-coded.
fold_train, fold_dev = use_data[4]

# The raw text and the pandas index column are dropped once the inputs are tokenized.
drop_cols = ["text", "__index_level_0__"]

tokenized_trainset = fold_train.map(tokenize_function, batched=True).remove_columns(drop_cols)
tokenized_devset = fold_dev.map(tokenize_function, batched=True).remove_columns(drop_cols)

In [None]:
# Seed PyTorch before loading: the classification head placed on top of the
# pretrained encoder is newly initialised, so without a fixed seed its starting
# weights (and therefore the fold's scores) change from run to run.
torch.manual_seed(42)

# Pretrained encoder plus a 3-way classification head for the 3-class task.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [None]:
# Training configuration for the 3-class run: 3 epochs at a learning rate of
# 3e-6, evaluating and saving a checkpoint after every epoch, logging every
# 200 steps, with external experiment reporting disabled.
# Checkpoints go to their own directory so the 3-label checkpoints are not
# mixed with the binary run's 2-label checkpoints in 'bert_1'.
args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    learning_rate=3e-6,
    report_to="none",
    output_dir='bert_3class',
    logging_steps=200,
)

# Wire the model, the configuration and the tokenized fold together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_trainset,
    eval_dataset=tokenized_devset,
    tokenizer=tokenizer,
)

In [None]:
# Fine-tune on the selected fold; evaluation and checkpointing run once per
# epoch, as configured in the TrainingArguments above.
trainer.train()

In [None]:
# Per-fold score collectors; the evaluation cell appends one value to each.
acc_lst, f1_m_lst = [], []

In [None]:
# Evaluation
#
# Score the fine-tuned model on the held-out fold, print accuracy and
# macro-F1, and record both in the running per-fold lists.

predictions = trainer.predict(tokenized_devset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)  # highest-scoring class per example

# Each metric is computed once and reused for printing and recording.
accuracy = (y_true == y_pred).sum() / len(y_true)
macro_f1 = f1_score(y_true, y_pred, average='macro')

print('acc:', accuracy)
print("f1_score_macro", macro_f1)

acc_lst.append(accuracy)
f1_m_lst.append(macro_f1)


***** Running Prediction *****
  Num examples = 679
  Batch size = 8


acc: 0.6907216494845361
f1_score_macro 0.6716932295367939


In [None]:
# Show the per-fold scores collected so far, one list per line.
print(acc_lst, f1_m_lst, sep="\n")

[0.7073529411764706, 0.6539027982326951, 0.695139911634757, 0.6980854197349042, 0.6907216494845361]
[0.6588227666170581, 0.6095126131479902, 0.6583142382760717, 0.6428677549622415, 0.6716932295367939]


In [None]:
# Average each score list over the folds collected so far.
for fold_scores in (acc_lst, f1_m_lst):
    print(np.mean(fold_scores))

0.6890405440526728
0.6482421205080311
