# 使用中文的資料集做fine-tune

# In[ ]:

import pandas as pd
from simpletransformers.classification import MultiLabelClassificationModel , ClassificationModel
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split


# Load the Chinese training and test sets.
ch_train_df = pd.read_csv('ch_train.csv')
test_df = pd.read_csv('ch_test.csv')

# Assemble the fine-tuning set: every non-toxic row plus 1300 toxic rows drawn
# with replacement (n controls how many toxic rows are used for fine-tuning).
ch_pairs = ch_train_df[['comment_text', 'toxic']]
ch_clean = ch_pairs.query('toxic==0')
ch_toxic = ch_pairs.query('toxic==1').sample(n=1300, random_state=0, replace=True)
train = pd.concat([ch_clean, ch_toxic])


# Shape the test set for evaluation: each label is wrapped in a 1-tuple and
# newlines inside the comment text are replaced by spaces.
test_df['label'] = [(flag,) for flag in test_df['toxic'].tolist()]
test_df['text'] = test_df['comment_text'].apply(
    lambda comment: comment.replace('\n', ' ')
)
eval_df = test_df[['text', 'label']]
# In[ ]:

# Training hyper-parameters, handed to the model through its `args` argument.
train_arg = dict(
    learning_rate=3e-5,
    gradient_accumulation_steps=16,
    train_batch_size=2,
    num_train_epochs=3,
    max_seq_length=328,
    reprocess_input_data=True,
    overwrite_output_dir=True,
    evaluate_during_training=False,
)


# The model outputs are not exact 0/1 integers, so every output above 0.4 is
# counted as a positive prediction before the score is computed.

def f1_multiclass(labels, preds):
    """Return the micro-averaged F1 score of `preds` thresholded at 0.4.

    `preds` must support element-wise `>` comparison (e.g. a NumPy array);
    `labels` are the reference targets passed straight to `f1_score`.
    """
    return f1_score(labels, preds > 0.4, average='micro')


# Configure the XLM-RoBERTa multi-label model (a single output label) and
# fine-tune it.

TCDM = MultiLabelClassificationModel(
    'xlmroberta',
    'xlm-roberta-base',
    use_cuda=True,
    args=train_arg,
    num_labels=1,
)

# The training frame assembled earlier in this cell is named `train`; no
# `train_df` is defined here, so passing that name raised NameError.
TCDM.train_model(train)


# Evaluate on the prepared test frame. `result` holds the reported metrics
# (the `f1` keyword adds the score from f1_multiclass), `model_outputs` the raw
# model predictions, and `wrong_predictions` the misclassified examples.
result, model_outputs, wrong_predictions = TCDM.eval_model(eval_df, f1=f1_multiclass)
print(result)

# 使用中英文資料集做fine-tune

# In[ ]:


import pandas as pd
from simpletransformers.classification import MultiLabelClassificationModel
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split


# Load the English training set.
train1 = pd.read_csv("jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")


# Load the Chinese training set and the Chinese test set.
ch_train_df = pd.read_csv('ch_train.csv')
test_df = pd.read_csv('ch_test.csv')


# Merge the English and Chinese data; n controls how many rows each sampled
# part contributes. Toxic Chinese rows are drawn with replacement.
en_pairs = train1[['comment_text', 'toxic']]
ch_pairs = ch_train_df[['comment_text', 'toxic']]
train = pd.concat([
    en_pairs.query('toxic==1').sample(n=1331, random_state=0),
    en_pairs.query('toxic==0').sample(n=1331, random_state=0),
    ch_pairs.query('toxic==0'),
    ch_pairs.query('toxic==1').sample(n=1331, random_state=0, replace=True),
])


# Shape the test set: each label is wrapped in a 1-tuple, newlines in the text
# become spaces, and only the `text` / `label` columns are kept.
test_df['label'] = [(flag,) for flag in test_df['toxic'].tolist()]
test_df['text'] = test_df['comment_text'].apply(
    lambda comment: comment.replace('\n', ' ')
)
test_df = test_df[['text', 'label']]


# Training hyper-parameters, handed to the model through its `args` argument.
train_arg = dict(
    learning_rate=4e-5,
    gradient_accumulation_steps=16,
    train_batch_size=4,
    num_train_epochs=3,
    max_seq_length=328,
    reprocess_input_data=True,
    overwrite_output_dir=True,
)


# The model outputs are not exact 0/1 integers, so every output above 0.4 is
# counted as a positive prediction before the score is computed.

def f1_multiclass(labels, preds):
    """Score thresholded predictions with micro-averaged F1.

    Outputs strictly greater than 0.4 count as positive; `preds` must support
    element-wise comparison (e.g. a NumPy array).
    """
    is_positive = preds > 0.4
    return f1_score(labels, is_positive, average='micro')

# Configure the XLM-RoBERTa multi-label model (a single output label) and
# fine-tune it on the combined training frame.
model = MultiLabelClassificationModel(
    'xlmroberta',
    'xlm-roberta-base',
    num_labels=1,
    args=train_arg,
    use_cuda=True,
)
model.train_model(train)


# Evaluate on the test frame. `result` holds the reported metrics (the `f1`
# keyword adds the score from f1_multiclass), `model_outputs` the raw model
# predictions, and `wrong_predictions` the misclassified examples.
evaluation = model.eval_model(test_df, f1=f1_multiclass)
result, model_outputs, wrong_predictions = evaluation
print(result)




# 使用英文資料集做fine-tune

# In[ ]:

import pandas as pd
from simpletransformers.classification import MultiLabelClassificationModel
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split


# Load the English training data and the Chinese test data.
train1 = pd.read_csv("jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
test_df = pd.read_csv('ch_test.csv')

# Build the training set; the requested sizes below control how many rows of
# each class are used.
en_pairs = train1[['comment_text', 'toxic']]
en_toxic = en_pairs.query('toxic==1')
en_clean = en_pairs.query('toxic==0')
# DataFrame.sample without replacement raises ValueError when n exceeds the
# number of available rows, so each request is capped at the class size. When
# a class has enough rows the result is identical to the uncapped call.
train = pd.concat([
    en_toxic.sample(n=min(100000, len(en_toxic)), random_state=0),
    en_clean.sample(n=min(5000, len(en_clean)), random_state=0),
])


# Shape the Chinese test set: each label is wrapped in a 1-tuple, newlines in
# the text become spaces, and only the `text` / `label` columns are kept.
test_df['label'] = list(zip(test_df.toxic.tolist()))
test_df['text'] = test_df['comment_text'].apply(lambda x: x.replace('\n', ' '))
test_df = test_df[['text', 'label']]


# Training hyper-parameters, handed to the model through its `args` argument.
train_arg = dict(
    learning_rate=4e-5,
    gradient_accumulation_steps=16,
    train_batch_size=4,
    num_train_epochs=3,
    max_seq_length=328,
    reprocess_input_data=True,
    overwrite_output_dir=True,
)

# The model outputs are not exact 0/1 integers, so every output above 0.4 is
# counted as a positive prediction before the score is computed.

def f1_multiclass(labels, preds):
    """Binarise `preds` at 0.4 and return the micro-averaged F1 score.

    `preds` must support element-wise `>` comparison (e.g. a NumPy array);
    `labels` are forwarded unchanged to `f1_score`.
    """
    thresholded = (preds > 0.4)
    score = f1_score(labels, thresholded, average='micro')
    return score


# Configure the XLM-RoBERTa multi-label model (a single output label) and
# fine-tune it on the English training frame.
model = MultiLabelClassificationModel(
    'xlmroberta',
    'xlm-roberta-base',
    num_labels=1,
    args=train_arg,
    use_cuda=True,
)
model.train_model(train)


# Evaluate on the Chinese test frame. `result` holds the reported metrics (the
# `f1` keyword adds the score from f1_multiclass), `model_outputs` the raw
# model predictions, and `wrong_predictions` the misclassified examples.
evaluation = model.eval_model(test_df, f1=f1_multiclass)
result, model_outputs, wrong_predictions = evaluation
print(result)




# 可以把上面模型的result來劃出confusion matrix

# In[ ]:
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Ground truth: reload the test set and turn the `toxic` column into a column
# vector.
test_df = pd.read_csv('ch_test.csv')
test_df['label'] = test_df['toxic']
y = np.array(test_df['label'])
y_true = y.reshape(-1, 1)


# Predictions: binarise the raw outputs of the most recently evaluated model.
# NOTE(review): this cut-off (0.1) differs from the 0.4 used in f1_multiclass;
# confirm that is intentional.
model_output = model_outputs > 0.1

# Compute the confusion matrix (the boolean mask is converted to integers).
cf = confusion_matrix(y_true=y_true, y_pred=model_output.astype(int))

# Draw the matrix and write each cell's count at its centre.
fig, ax = plt.subplots(figsize=(7, 7))
ax.matshow(cf, cmap=plt.cm.Blues, alpha=0.3)
for (i, j), count in np.ndenumerate(cf):
    ax.text(x=j, y=i, s=count, va='center', ha='center', fontsize=30)
plt.xlabel('predicted label', fontsize=30)
plt.ylabel('true label', fontsize=30)
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
plt.show()