# Fine-tune BERT

This notebook fine-tunes a BERT model on 2000 human-labeled comments. We then use the fine-tuned BERT to predict labels for all comments.

In [None]:
import pandas as pd 
import numpy as np 
# sklearn machine learning and measurements packages
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef, confusion_matrix

# deep learning packages for bert
import torch
# !pip install -q transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertModel, TrainingArguments, Trainer, pipeline
# !pip install datasets
from datasets import load_metric, load_dataset

# visualization and standard library packages
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from tqdm import tqdm
from collections import Counter
import os

%matplotlib inline

RANDOM_STATE = 42

In [None]:
# change to your local directory if necessary

In [None]:
# !nvidia-smi

## Split train, eval, test set

The two data samples were first drawn from the comments and then labeled by two individual labelers.

Map:
  * 0: negative
  * 1: neutral
  * 2: positive

In [None]:
# Load both annotated samples. For each one the first two columns are dropped
# and 'human_label' is shifted up by one (presumably from -1/0/1 to the 0/1/2
# label map described above -- confirm against the spreadsheets).
data_df1 = (
    pd.read_excel(f"{os.getcwd()}/labeled_comments/data_anno1.xlsx")
    .pipe(lambda df: df.drop(columns=df.columns[[0, 1]]))
    .assign(human_label=lambda df: df['human_label'] + 1)
)

data_df2 = (
    pd.read_excel(f"{os.getcwd()}/labeled_comments/data_anno2.xlsx")
    .pipe(lambda df: df.drop(columns=df.columns[[0, 1]]))
    .assign(human_label=lambda df: df['human_label'] + 1)
)

In [None]:
# class distribution of the human labels in each annotated sample
tuple(Counter(df['human_label']) for df in (data_df1, data_df2))

**check comment length**

In [None]:
def check_length(comments):
  '''
  Count how many comments are longer than 128 and 256 characters.

  Lengths are measured in characters of the 'content' value, not in
  tokenizer tokens. Values without a length (e.g. NaN or numbers) are
  measured on their string representation.

  Inputs:
    comments: DataFrame with a 'content' column

  Outputs:
    len_128: number of comments with more than 128 characters
    len_256: number of comments with more than 256 characters
  '''
  def char_length(value):
    try:
      return len(value)
    except TypeError:
      # non-sized values such as NaN: fall back to their text form
      return len(str(value))

  # Only the 'content' column is needed, so iterate it directly instead of
  # materialising every row with iterrows().
  lengths = [char_length(value) for value in comments['content']]
  len_128 = sum(length > 128 for length in lengths)
  len_256 = sum(length > 256 for length in lengths)
  return len_128, len_256

In [None]:
# (count > 128 chars, count > 256 chars) for each annotated sample
tuple(check_length(df) for df in (data_df1, data_df2))

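Only 2 comments are longer than 256 characters (the check above counts characters, which is used here as a proxy for the token count), so we use a maximum length of 256 when tokenizing comments.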

In [None]:
def get_train_eval_test(example_df1, example_df2):
  '''
  Split both labeled samples into train / eval / test sets and save them as CSV.

  Each sample is split on its own with train : eval : test = 6 : 2 : 2; the
  matching parts of the two samples are then concatenated and written to
  train_data.csv, eval_data.csv and test_data.csv in the labeled_comments
  folder under the current working directory.

  Inputs:
    example_df1, example_df2: DataFrames with the columns 'comment_time',
      'movie_id', 'human_label' and 'content'

  Outputs:
    None; the three CSV files are the result.
  '''
  def split_data(example_df):
    '''
    split one sample into (train, eval, test) DataFrames
    '''
    # Split row positions rather than index labels, so the .iloc lookups
    # below stay correct even if example_df has a non-default index.
    train_eval_idx, test_idx = train_test_split(np.arange(len(example_df)),
                                                test_size=0.2,
                                                random_state=RANDOM_STATE)
    train_eval_comments = example_df.iloc[train_eval_idx, :].reset_index(drop=True)
    test_comments = example_df.iloc[test_idx, :].reset_index(drop=True)

    # 25% of the remaining 80% is 20% of the whole sample
    train_idx, eval_idx = train_test_split(np.arange(len(train_eval_comments)),
                                           test_size=0.25,
                                           random_state=RANDOM_STATE)
    train_comments = train_eval_comments.iloc[train_idx, :].reset_index(drop=True)
    eval_comments = train_eval_comments.iloc[eval_idx, :].reset_index(drop=True)

    return train_comments, eval_comments, test_comments

  # Rows are written with csv.writer rather than DataFrame.to_csv() because
  # plain to_csv() yielded wrongly formatted csv files.
  def write_csv(example_df, file_dir):
    '''
    write example DataFrame into the given file path
    '''
    columns = ('comment_time', 'movie_id', 'human_label', 'content')
    # newline='' is required by the csv module (otherwise blank rows appear on
    # Windows); utf-8 keeps the text readable regardless of the OS locale.
    with open(file_dir, 'w', newline='', encoding='utf-8') as f:
      writer = csv.writer(f, delimiter=',')
      # write column names
      writer.writerow(('index',) + columns)
      for idx, row in tqdm(example_df.iterrows()):
        writer.writerow((idx, *(row[column] for column in columns)))

  train_comments_1, eval_comments_1, test_comments_1 = split_data(example_df1)
  train_comments_2, eval_comments_2, test_comments_2 = split_data(example_df2)

  train_set = pd.concat([train_comments_1, train_comments_2]).reset_index(drop=True)
  eval_set = pd.concat([eval_comments_1, eval_comments_2]).reset_index(drop=True)
  test_set = pd.concat([test_comments_1, test_comments_2]).reset_index(drop=True)
  file_train = f'{os.getcwd()}/labeled_comments/train_data.csv'
  file_eval = f'{os.getcwd()}/labeled_comments/eval_data.csv'
  file_test = f'{os.getcwd()}/labeled_comments/test_data.csv'
  write_csv(train_set, file_train)
  write_csv(eval_set, file_eval)
  write_csv(test_set, file_test)

In [None]:
# Split both labeled samples and write the train / eval / test CSV files
# into the labeled_comments folder.
get_train_eval_test(data_df1, data_df2)

## Fine-tune

In [None]:
# Load the pretrained Chinese BERT tokenizer and a sequence-classification
# model with three output labels (negative / neutral / positive).
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=3)

In [None]:
# max length is decided according to previous examination
def tokenize_function(examples):
    '''Tokenize the "text" field of the examples, padded/truncated to 256 tokens.'''
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=256,
        padding="max_length",
    )
    return encoded

In [None]:
# Load the CSV splits written earlier. Note that the split named 'test' is
# read from eval_data.csv, i.e. it is the evaluation set used during training.
dataset = load_dataset('csv', data_files={'train': f'{os.getcwd()}/labeled_comments/train_data.csv',
                                          'test': f'{os.getcwd()}/labeled_comments/eval_data.csv',})
# rename the columns to "text" (read by tokenize_function) and "label"
dataset = dataset.rename_column("content", "text")
dataset = dataset.rename_column("human_label", "label")
# drop the columns that are not model inputs
dataset = dataset.remove_columns(['index', 'comment_time','movie_id'])
dataset

In [None]:
# Tokenize every split; batched=True hands tokenize_function batches of examples.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Shuffle both splits with the shared seed so the order is reproducible.
# The "test" split holds the evaluation data (it was loaded from eval_data.csv).
train_dataset = tokenized_datasets["train"].shuffle(seed=RANDOM_STATE)
eval_dataset = tokenized_datasets["test"].shuffle(seed=RANDOM_STATE)

In [None]:
# Save a checkpoint and evaluate once per epoch. The seed is passed explicitly
# so training uses the same RANDOM_STATE as the data split and the shuffling.
training_args = TrainingArguments(output_dir=f"{os.getcwd()}/output_bert",
                                  num_train_epochs=4,
                                  per_device_train_batch_size=8,
                                  per_device_eval_batch_size=8,
                                  save_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  seed=RANDOM_STATE)
# accuracy metric used by compute_metrics
metric = load_metric("accuracy")

In [None]:
def compute_metrics(eval_pred):
    '''Compute the accuracy metric from an evaluation (logits, labels) pair.'''
    class_scores, references = eval_pred
    # the predicted class is the index of the largest logit
    predicted_classes = np.argmax(class_scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

In [None]:
# Fine-tune the model on the training split; the evaluation split and
# compute_metrics are used for the per-epoch evaluation configured in training_args.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Save the model that was passed to the Trainer. Only the model is saved
# here; the tokenizer is not written to model_dir.
model_dir = f"{os.getcwd()}/output_bert/model"
model.save_pretrained(model_dir)

## Evaluation

In [None]:
# Reload the saved fine-tuned model from disk; the tokenizer comes from the
# original "bert-base-chinese" checkpoint.
model_dir = f"{os.getcwd()}/output_bert/model"
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

In [None]:
# Move the model to the CPU for evaluation, then wrap the model and tokenizer
# in a pipeline created with the 'sentiment-analysis' task name.
model = model.cpu()
text_classification = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

In [None]:
def sentiment_analysis(example_df):
  '''
  Use the fine-tuned BERT pipeline to predict a sentiment label
  for every comment.

  Inputs:
    example_df: DataFrame with a 'content' column holding the comment text

  Outputs:
    sentiment_label: list of int labels (0: negative, 1: neutral, 2: positive)
    scores: list with the pipeline score of each predicted label

  Raises:
    ValueError: if the pipeline returns a label other than LABEL_0/1/2
  '''
  label_to_id = {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}
  sentiment_label = []
  scores = []
  # This operation would take a very long time,
  # so we use tqdm to reduce our anxiety :)
  for comment in tqdm(example_df['content']):
      # truncation=True cuts inputs at the tokenizer's maximum length, so a
      # very long comment is shortened instead of overflowing the model.
      sent = text_classification(comment, truncation=True)[0]
      if sent['label'] not in label_to_id:
        # fail loudly rather than silently counting an unknown label as positive
        raise ValueError(f"unexpected pipeline label: {sent['label']!r}")
      sentiment_label.append(label_to_id[sent['label']])
      scores.append(sent['score'])
  return sentiment_label, scores

In [None]:
# Load the held-out test split and predict a label and score for each comment.
test_comments = pd.read_csv(f'{os.getcwd()}/labeled_comments/test_data.csv')
y_label, scores = sentiment_analysis(test_comments)

In [None]:
# keep the predictions next to the human labels
test_comments = test_comments.assign(pred_label=y_label, pred_score=scores)

In [None]:
# Fix the class order explicitly so the matrix is always 3x3 and its rows and
# columns line up with the tick labels, even if a class never occurs.
con_matrix_bert = confusion_matrix(test_comments['human_label'], y_label,
                                   labels=[0, 1, 2])
fig, ax = plt.subplots()
# draw on the axes created above instead of relying on the current axes
sns.heatmap(con_matrix_bert, annot=True, fmt='d',
            xticklabels=['negative', 'neutral', 'positive'],
            yticklabels=['negative', 'neutral', 'positive'],
            ax=ax)
ax.set_xlabel('predicted label')
ax.set_ylabel('human label')
ax.set_title("Fine-tuned BERT Confusion Matrix")
# savefig does not create missing folders
os.makedirs(f'{os.getcwd()}/figs', exist_ok=True)
fig.savefig(f'{os.getcwd()}/figs/confusion_matrix.png', dpi=600)

**Matthews correlation**

According to [sklearn's introduction](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html), Matthews correlation coefficient could be used as a measure of the quality of classification even if the dataset is unbalanced. It ranges from -1 to 1, where 1 means perfect prediction.

Equation (binary case; for our three classes scikit-learn applies the multiclass generalization):
$$ 
    MCC = \frac{TP*TN -FP*FN}{\sqrt{(TP + FP)(TP + FN)(TN + FP)(TN + FN)}}
$$

In [None]:
# MCC between the human labels and the predicted labels on the test set
matthews_corrcoef(y_true=test_comments['human_label'], y_pred=y_label)

## Label all comments

In [None]:
# Load the full comment table; CONTENT is forced to str so the text is not
# parsed as another type.
comments = pd.read_csv(f'{os.getcwd()}/comments.csv', dtype={'CONTENT': str})

In [None]:
# Keep only the columns used below and switch their names to lower case
# (every target name is the lower-cased source name).
comments = comments[
    ['COMMENT_TIME', 'COMMENT_ID', 'MOVIE_ID', 'USER_MD5', 'CONTENT', 'VOTES', 'RATING']
].rename(columns=str.lower)

In [None]:
# check empty content
# Strip the whole column at once instead of looping over rows with iterrows();
# content that is not a string (e.g. NaN) comes out as NaN.
content_stripped = comments['content'].str.strip()
# rows whose content is not a string
error_lst = comments.index[content_stripped.isna()].tolist()
# rows whose content is empty or whitespace-only
empty_lst = comments.index[content_stripped.eq('')].tolist()

comments = comments.drop(index=empty_lst + error_lst)

In [None]:
# drop super long comments
# Character length of the stripped text, computed for the whole column at once.
content_length = comments['content'].str.strip().str.len()
# NOTE(review): the limit is in characters; the tokenizer may add special
# tokens on top, so confirm a 511-character comment still fits the model.
long_lst = comments.index[content_length >= 512].tolist()
comments = comments.drop(index=long_lst)

In [None]:
# Renumber the rows after the drops above, then predict a label for every
# remaining comment; the prediction scores are discarded.
comments = comments.reset_index(drop=True)
y_label_all, _ = sentiment_analysis(comments)
comments['pred_label'] = y_label_all

In [None]:
# We write the rows with csv.writer instead of DataFrame.to_csv() because
# plain to_csv() yielded wrongly formatted csv files.
def write_csv_2(example_df, file_dir):
  '''
  Write the labeled comments DataFrame to the given file path as CSV.

  Inputs:
    example_df: DataFrame with the columns 'comment_time', 'comment_id',
      'movie_id', 'user_md5', 'rating', 'votes', 'content' and 'pred_label'
    file_dir: path of the CSV file to create (overwritten if it exists)

  Outputs:
    None; the file is the result. Its first column, 'index', holds the
    DataFrame index of each row.
  '''
  columns = ('comment_time',
             'comment_id',
             'movie_id',
             'user_md5',
             'rating',
             'votes',
             'content',
             'pred_label')
  # newline='' is required by the csv module (otherwise blank rows appear on
  # Windows); utf-8 keeps the text readable regardless of the OS locale.
  with open(file_dir, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    # write column names
    writer.writerow(('index',) + columns)
    for idx, row in tqdm(example_df.iterrows()):
      writer.writerow((idx, *(row[column] for column in columns)))

In [None]:
# Save the cleaned, labeled comments next to the notebook's working directory.
file_dir = f'{os.getcwd()}/comments_cleaned.csv'
write_csv_2(comments, file_dir)