# Transformer Based Models and Prediction

This notebook contains code to fine-tune transformer-based models and make predictions with them.

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, pipeline
import evaluate
import math
import time
import sklearn
from sklearn.model_selection import train_test_split
from model_training_and_evaluation import *
from model_testing import *

## Create datasets

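Because of time constraints, the models will only be tested with one set of parameters, and the final results of the BERT models will be compared to each other. However, no evaluation will be done to tune the parameters. Therefore, a separate validation dataset for parameter tuning will not be created; only a small evaluation split is taken from the test data and passed to the training function as `eval_data`.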

In [None]:
# Load the dataset and split it into train and test dataframes.
# pred_400 is the final labeled dataset from the AL process.
full_df = pd.read_csv('./temp_pred_during_al/pred_400.csv')

# Stratified 80/20 split so both parts keep the label distribution of the full dataset
train_df, test_df = train_test_split(full_df, test_size=0.20, random_state=1, stratify=full_df[['label']])

# Create a balanced train dataset by downsampling the larger class.
# Asking .sample() for more rows than a class contains raises ValueError, so the
# per-class size is capped at the smaller class instead of assuming LABEL_1 is larger.
train_neg = train_df[train_df['label'] == 'LABEL_0']
train_pos = train_df[train_df['label'] == 'LABEL_1']
n_per_class = min(len(train_neg), len(train_pos))
train_pos = train_pos.sample(n=n_per_class, random_state=0)
if len(train_neg) > n_per_class:
    train_neg = train_neg.sample(n=n_per_class, random_state=0)
train = pd.concat([train_neg, train_pos])

# Replace the string labels with the integers 0 and 1
train.loc[train['label'] == 'LABEL_0', 'label'] = 0
train.loc[train['label'] == 'LABEL_1', 'label'] = 1

# Create eval dataframe: 20% of the test rows are set aside as `evaluate`, the rest stays in `test`.
# NOTE: the name `evaluate` shadows the `evaluate` module imported at the top of the notebook;
# it is kept because later cells refer to this dataframe by that name.
test, evaluate = train_test_split(test_df, test_size=0.20, random_state=1, stratify=test_df[['label']])

# Work on explicit copies: both frames get their labels rewritten with .loc (here and in
# later cells), which can trigger SettingWithCopyWarning on the split outputs.
test = test.copy()
evaluate = evaluate.copy()

evaluate.loc[evaluate['label'] == 'LABEL_0', 'label'] = 0
evaluate.loc[evaluate['label'] == 'LABEL_1', 'label'] = 1

# For huggingface transformers, the pandas dataframe must be turned into a huggingface dataset
train_ds = Dataset.from_pandas(train)
print(train_ds)

eval_ds = Dataset.from_pandas(evaluate)
print(eval_ds)

In [None]:
# Number of rows per label in the held-out test split
test.value_counts('label')

In [None]:
# Number of rows per label in the evaluation split
evaluate.value_counts('label')

In [None]:
# Number of rows per label in the balanced training set
train.value_counts('label')

In [None]:
# Persist the three splits so the exact rows used in this run can be inspected later.
# to_csv is called with its defaults, so the dataframe index is written as the first column.
for split_df, csv_path in (
    (train, './emails_train_balanced.csv'),
    (test, './emails_test.csv'),
    (evaluate, './emails_eval.csv'),
):
    split_df.to_csv(csv_path)

## Train models and make predictions

Now 4 different BERT models will be trained and then used to make predictions on the test dataset. I will use the same parameters that gave the best results on the NoReC dataset, because I don't have time to fine-tune the parameters on this computer's CPU. I will run all models for 5 epochs and use the best one for testing.

In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Uses the notebook-level `tokenizer`, which each model section rebinds to the
    tokenizer of the checkpoint being fine-tuned before calling `Dataset.map`.

    Args:
        examples: Batch passed in by `Dataset.map(..., batched=True)`; must
            provide the texts under the key 'text'.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens and
        padded (padding=True).
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding=True, max_length=512)
    return encoded

## distilmBERT

In [None]:
# tokenize_function reads the notebook-level `tokenizer`, so it is rebound to the
# distilmBERT tokenizer before the datasets are mapped.
tokenizer = AutoTokenizer.from_pretrained('./models/distilmBERT-original')
distilmbert_train_encoding = train_ds.map(tokenize_function, batched=True)
distilmbert_eval_encoding = eval_ds.map(tokenize_function, batched=True)

# Fine-tune for 5 epochs. The result is unpacked into the directory of the fine-tuned
# model (used for prediction below) and the info collected during training.
distilmbert_training_kwargs = dict(
    model_path='./models/distilmBERT-original',
    model_name='distilmbert',
    training_data=distilmbert_train_encoding,
    eval_data=distilmbert_eval_encoding,
    epochs=5,
    learning_rate=3e-05,
    warmup_ratio=0.01,
    optimizer='adamw_hf',
    weight_decay=0.01,
)
fine_tuned_distilmbert_dir, info_from_distilmbert_training = create_and_train_model(**distilmbert_training_kwargs)

print(f'info_from_distilmbert_training: {info_from_distilmbert_training}')

# Predict on the test texts and report the wall-clock time spent
start_time = time.time()
distilmbert_results = predict_from_fine_tuned_model(fine_tuned_distilmbert_dir, list(test['text']))
distilmbert_prediction_seconds = time.time() - start_time
print(f"Time used for prediction was: {distilmbert_prediction_seconds}")

# report_evaluation expects the actual labels as a list of 0 and 1, so the string
# labels in `test` are replaced in place (a no-op when they are already converted).
for raw_label, numeric_label in (('LABEL_0', 0), ('LABEL_1', 1)):
    test.loc[test['label'] == raw_label, 'label'] = numeric_label

# Get scores/results
report_evaluation(distilmbert_results['label'], list(test['label']), 'distilmbert')

## NB-BERT

In [None]:
# Tokenize dataset. tokenize_function reads the notebook-level `tokenizer`, so it has to
# be rebound to the NB-BERT tokenizer before the datasets are mapped.
tokenizer = AutoTokenizer.from_pretrained('./models/NB-BERT-original')
nb_bert_train_encoding = train_ds.map(
    tokenize_function,
    batched=True
)
nb_bert_eval_encoding = eval_ds.map(
    tokenize_function,
    batched=True
)

# Train model for 5 epochs. The result is unpacked into the directory of the fine-tuned
# model (used for prediction below) and the info collected during training.
fine_tuned_nb_bert_dir, info_from_nb_bert_training = create_and_train_model(
    model_path='./models/NB-BERT-original',
    model_name='nb-bert',
    training_data=nb_bert_train_encoding,
    eval_data=nb_bert_eval_encoding,
    epochs=5,
    learning_rate=5e-05,
    warmup_ratio=0.1,
    optimizer='adamw_hf',
    weight_decay=0,
)

# The label used to say "distilmbert" (copy-paste slip); it now names the model whose info is printed.
print(f'info_from_nb_bert_training: {info_from_nb_bert_training}')

# Make predictions and check prediction time
start_time = time.time()
nb_bert_results = predict_from_fine_tuned_model(fine_tuned_nb_bert_dir, list(test['text']))
print(f"Time used for prediction was: {time.time() - start_time}")

# Because report_evaluation expects the actual labels to be a list of 0 and 1, this is necessary
# (a no-op when an earlier model cell has already converted the labels):
test.loc[test['label'] == 'LABEL_0', 'label'] = 0
test.loc[test['label'] == 'LABEL_1', 'label'] = 1

# Get scores/results
report_evaluation(nb_bert_results['label'], list(test['label']), 'nb-bert')

## NorBERT

In [None]:
# tokenize_function reads the notebook-level `tokenizer`, so it is rebound to the
# NorBERT tokenizer before the datasets are mapped.
tokenizer = AutoTokenizer.from_pretrained('./models/NorBERT-original')
norbert_train_encoding = train_ds.map(tokenize_function, batched=True)
norbert_eval_encoding = eval_ds.map(tokenize_function, batched=True)

# Fine-tune for 5 epochs. The result is unpacked into the directory of the fine-tuned
# model (used for prediction below) and the info collected during training.
norbert_training_kwargs = dict(
    model_path='./models/NorBERT-original',
    model_name='norbert',
    training_data=norbert_train_encoding,
    eval_data=norbert_eval_encoding,
    epochs=5,
    learning_rate=5e-05,
    warmup_ratio=0,
    optimizer='adamw_hf',
    weight_decay=0.01,
)
fine_tuned_norbert_dir, info_from_norbert_training = create_and_train_model(**norbert_training_kwargs)

print(f'info_from_norbert_training: {info_from_norbert_training}')

# Predict on the test texts and report the wall-clock time spent
start_time = time.time()
norbert_results = predict_from_fine_tuned_model(fine_tuned_norbert_dir, list(test['text']))
norbert_prediction_seconds = time.time() - start_time
print(f"Time used for prediction was: {norbert_prediction_seconds}")

# report_evaluation expects the actual labels as a list of 0 and 1, so the string
# labels in `test` are replaced in place (a no-op when they are already converted).
for raw_label, numeric_label in (('LABEL_0', 0), ('LABEL_1', 1)):
    test.loc[test['label'] == raw_label, 'label'] = numeric_label

# Get scores/results
report_evaluation(norbert_results['label'], list(test['label']), 'norbert')

## mBERT

In [None]:
# tokenize_function reads the notebook-level `tokenizer`, so it is rebound to the
# mBERT tokenizer before the datasets are mapped.
tokenizer = AutoTokenizer.from_pretrained('./models/mBERT-original')
mbert_train_encoding = train_ds.map(tokenize_function, batched=True)
mbert_eval_encoding = eval_ds.map(tokenize_function, batched=True)

# Fine-tune for 5 epochs. The result is unpacked into the directory of the fine-tuned
# model (used for prediction below) and the info collected during training.
mbert_training_kwargs = dict(
    model_path='./models/mBERT-original',
    model_name='mbert',
    training_data=mbert_train_encoding,
    eval_data=mbert_eval_encoding,
    epochs=5,
    learning_rate=3e-05,
    warmup_ratio=0.1,
    optimizer='adamw_hf',
    weight_decay=0.1,
)
fine_tuned_mbert_dir, info_from_mbert_training = create_and_train_model(**mbert_training_kwargs)

print(f'info_from_mbert_training: {info_from_mbert_training}')

# Predict on the test texts and report the wall-clock time spent
start_time = time.time()
mbert_results = predict_from_fine_tuned_model(fine_tuned_mbert_dir, list(test['text']))
mbert_prediction_seconds = time.time() - start_time
print(f"Time used for prediction was: {mbert_prediction_seconds}")

# report_evaluation expects the actual labels as a list of 0 and 1, so the string
# labels in `test` are replaced in place (a no-op when they are already converted).
for raw_label, numeric_label in (('LABEL_0', 0), ('LABEL_1', 1)):
    test.loc[test['label'] == raw_label, 'label'] = numeric_label

# Get scores/results
report_evaluation(mbert_results['label'], list(test['label']), 'mbert')

## Save the predictions

In [None]:
# Write the prediction table of every model to its own CSV file (without the index column)
for model_results, csv_path in (
    (distilmbert_results, './scores/model_predictions/distilmbert_results.csv'),
    (nb_bert_results, './scores/model_predictions/nb_bert_results.csv'),
    (norbert_results, './scores/model_predictions/norbert_results.csv'),
    (mbert_results, './scores/model_predictions/mbert_results.csv'),
):
    model_results.to_csv(csv_path, index=False)

# Test the best model on a new dataset

Now the best model (NorBERT) will be tested on a new dataset, and the results from that dataset will be handed to employees at SMN. They will give qualitative feedback on whether or not the results from a language model can be useful for prioritizing incoming emails.

In [None]:
# Load the new email dataset; the text to classify is in its 'description' column
small_df = pd.read_csv('./preprocessed_emails_with_all_columns.csv')

# Predict with the fine-tuned NorBERT model stored at a hard-coded path.
# NOTE(review): presumably the same directory that the NorBERT training cell returned - confirm.
norbert_results_on_small_df = predict_from_fine_tuned_model(
    './models/norbert_LR5e-05_WR0_OPTIMadamw_hf_WD0.01',
    list(small_df['description']),
)

In [None]:
# Copy the predicted label and score into the email dataframe. Values are assigned by
# position (via list), so the prediction rows must be in the same order as small_df.
for prediction_column in ('label', 'score'):
    small_df[prediction_column] = list(norbert_results_on_small_df[prediction_column])

# Save the emails together with their predictions (index not written)
small_df.to_csv('./new_small_email_dataset_with_predictions_all_columns.csv', index=False)

In [None]:
# Remove the leftover 'Unnamed: 0' column. errors='ignore' keeps the cell re-runnable:
# the previous in-place drop raised KeyError when the cell was executed a second time.
small_df = small_df.drop(columns='Unnamed: 0', errors='ignore')

# Preview only the first rows instead of rendering the whole frame of email data
small_df.head()

In [None]:
# Column names, dtypes and non-null counts of the new dataset with predictions
small_df.info()

In [None]:
# Number of emails per predicted label in the new dataset
norbert_results_on_small_df.value_counts('label')