In [None]:
import pandas as pd
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, pipeline
import numpy as np
import evaluate
import sklearn
from sklearn.model_selection import train_test_split
import math
import random
import os
from model_training_and_evaluation import *
from model_testing import *

# Active Learning

This notebook performs active learning to create labels for the dataset. If you ever want to reuse this code, you might want to make some of the following changes:
- Update the corrected file after each label you set manually, so that what you have labeled is not lost if the code fails along the way. The code should then also be changed so that, when you start running it again, you can continue where you stopped.
- Specifically, save the not-yet-corrected examples at least after every round of manual labeling, or more frequently, as in the point above.

## Help functions

In [None]:
def correct_most_uncertain_labels(full_df: pd.DataFrame, num_to_correct: int) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Interactively correct the labels of the least certain predictions in a dataframe.

    The rows are sorted by ascending 'score'. The first `num_to_correct` rows are shown
    one by one and the user is asked to type the correct label (0 or 1) for each of
    them. The corrected rows are then appended to ./corrected.csv (the file is created
    if it does not exist yet).

    Args:
        full_df: a dataframe with the columns 'text', 'label' and 'score'. It is not modified.
        num_to_correct: the number of labels that should be corrected. Values larger than
            the number of rows mean "correct every row".
    Returns:
        A tuple (corrected, most_certain). `corrected` holds every row stored in
        ./corrected.csv after this call (earlier rounds plus this one). `most_certain`
        holds the rows of `full_df` that were not corrected, so that future prediction
        is not done on the corrected examples.
    """
    corrected_path = './corrected.csv'
    num_to_correct = max(num_to_correct, 0)

    # Sort and split by position. Slicing with iloc keeps the two parts disjoint even
    # when num_to_correct exceeds the number of rows (tail() with a negative count
    # would return rows that are also part of the uncertain set).
    full_df_sorted = full_df.sort_values('score')
    # Work on a copy so that writing the new labels does not touch a slice of the
    # sorted frame (avoids pandas' SettingWithCopyWarning).
    n_most_uncertain = full_df_sorted.iloc[:num_to_correct].copy()
    most_certain = full_df_sorted.iloc[num_to_correct:]

    # Ask for the correct label of each uncertain example
    new_labels = []
    for _, row in n_most_uncertain.iterrows():
        print(f'label er: {row["label"]} og text er: \n{row["text"]}')
        answer = input('correct label: ').strip()
        # Ask again instead of silently turning a typo into label 1
        while answer not in ('0', '1'):
            answer = input('invalid label, please type 0 or 1: ').strip()
        new_labels.append(int(answer))
        print('\n')
    n_most_uncertain['label'] = new_labels

    # Add the corrected labels to the corrected.csv file
    if os.path.isfile(corrected_path):
        # Append the new rows to the rows saved in earlier rounds
        previously_corrected = pd.read_csv(corrected_path)
        corrected = pd.concat([previously_corrected, n_most_uncertain], ignore_index=True)
    else:
        # First round of corrections: no file exists yet with this name
        corrected = n_most_uncertain
    corrected.to_csv(corrected_path, index=False)

    return corrected, most_certain

In [None]:
def tokenize_function(examples):
    """
    Tokenize the 'text' field of a batch of examples.

    Uses the notebook-level `tokenizer`, which must be assigned before this function
    is called. Inputs are truncated to at most 512 tokens and padded.
    """
    texts = examples['text']
    return tokenizer(texts, max_length=512, padding=True, truncation=True)

## Prediction 1 with 0 examples

Transfer learning will be used to make initial predictions on the dataset. I will use an NB-BERT model trained on the NoReC dataset for this purpose. This model can be loaded from Hugging Face; however, the VM does not allow downloading models through the code, so it must first be downloaded manually and saved to a folder, and then loaded from that folder in the code.

Note: the numbers in e.g. results_0 or corrected_50 indicate how many examples have been used for training so far. The count starts at 0 and increases after each training round by the number of additional examples used for training in that round.

In [None]:
# Predict a label for every preprocessed email with the initial model. The predictions
# are also written to pred_0.csv in case the kernel closes.
initial_model_dir = './models/nb-bert_LR5e-05_WR0.1_OPTIMadamw_hf_WD0'
preprocessed_emails = pd.read_csv('preprocessed_emails_10k_2023_rensket')
email_texts = preprocessed_emails['email'].tolist()
results_0 = predict_from_fine_tuned_model(initial_model_dir, email_texts)
results_0.to_csv('./temp_pred_during_al/pred_0.csv', index=False)

# Manually correct the 50 most uncertain predictions. They become the first manually
# labeled examples; the remaining predictions are kept in not_corrected_0.
corrected_0, not_corrected_0 = correct_most_uncertain_labels(results_0, num_to_correct=50)

## Prediction 2 with new dataset containing 50 examples

Now that a small dataset has been manually labeled, the model can be further trained on that data and predictions can be tried again.

In [None]:
# Tokenize the manually labeled examples collected so far (./corrected.csv), using
# the tokenizer stored with the model that is about to be trained further.
previous_model_dir = './models/nb-bert_LR5e-05_WR0.1_OPTIMadamw_hf_WD0'
tokenizer = AutoTokenizer.from_pretrained(previous_model_dir)
corrected_0_dataset = load_dataset('csv', data_files='./corrected.csv')
init_corrected_0_encoding = corrected_0_dataset.map(tokenize_function, batched=True)
corrected_0_encoded = init_corrected_0_encoding['train']

# Train the model. nb-bert_LR5e-05_WR0.1_OPTIMadamw_hf_WD0 is the model trained on this VM.
# NOTE: after the AL process was performed, this model was overwritten by a model
# actually trained on the final dataset from this process, meaning that this model is
# no longer the one trained on the NoReC dataset.
model_path_50 = create_and_train_model(
    previous_model_dir,
    'nb-bert',
    corrected_0_encoded,
    al=True,
    num_al_examples=50,
)

# Predict on the examples that have not been corrected yet with the model trained on
# 50 examples, and save the predictions as pred_50
remaining_texts = list(not_corrected_0['text'])
results_50 = predict_from_fine_tuned_model(model_path_50, remaining_texts)
results_50.to_csv('./temp_pred_during_al/pred_50.csv', index=False)

# Manually correct the 50 most uncertain predictions and add them to the manually
# labeled dataset; the rest is kept in not_corrected_50
corrected_50, not_corrected_50 = correct_most_uncertain_labels(results_50, num_to_correct=50)

## Prediction 3 with 100 examples

In [None]:
# Tokenize the manually labeled examples collected so far (./corrected.csv), using
# the tokenizer stored with the model from the previous round.
previous_model_dir = './models/models_al/nb-bert_ex50'
tokenizer = AutoTokenizer.from_pretrained(previous_model_dir)
corrected_50_dataset = load_dataset('csv', data_files='./corrected.csv')
init_corrected_50_encoding = corrected_50_dataset.map(tokenize_function, batched=True)
corrected_50_encoded = init_corrected_50_encoding['train']

# Continue training from the previous round's model
model_path_100 = create_and_train_model(
    previous_model_dir,
    'nb-bert',
    corrected_50_encoded,
    al=True,
    num_al_examples=100,
)

# Predict on the examples that have not been corrected yet with the model trained on
# 100 examples, and save the predictions as pred_100
remaining_texts = list(not_corrected_50['text'])
results_100 = predict_from_fine_tuned_model(model_path_100, remaining_texts)
results_100.to_csv('./temp_pred_during_al/pred_100.csv', index=False)

# Manually correct the 50 most uncertain predictions and add them to the manually
# labeled dataset; the rest is kept in not_corrected_100
corrected_100, not_corrected_100 = correct_most_uncertain_labels(results_100, num_to_correct=50)

## Prediction 4 with 150 examples

In [None]:
# Tokenize the manually labeled examples collected so far (./corrected.csv), using
# the tokenizer stored with the model from the previous round.
previous_model_dir = './models/models_al/nb-bert_ex100'
tokenizer = AutoTokenizer.from_pretrained(previous_model_dir)
corrected_100_dataset = load_dataset('csv', data_files='./corrected.csv')
init_corrected_100_encoding = corrected_100_dataset.map(tokenize_function, batched=True)
corrected_100_encoded = init_corrected_100_encoding['train']

# Continue training from the previous round's model
model_path_150 = create_and_train_model(
    previous_model_dir,
    'nb-bert',
    corrected_100_encoded,
    al=True,
    num_al_examples=150,
)

# Predict on the examples that have not been corrected yet with the model trained on
# 150 examples, and save the predictions as pred_150
remaining_texts = list(not_corrected_100['text'])
results_150 = predict_from_fine_tuned_model(model_path_150, remaining_texts)
results_150.to_csv('./temp_pred_during_al/pred_150.csv', index=False)

# Manually correct the 50 most uncertain predictions and add them to the manually
# labeled dataset; the rest is kept in not_corrected_150
corrected_150, not_corrected_150 = correct_most_uncertain_labels(results_150, num_to_correct=50)

## Prediction 5 with 200 examples

In [None]:
# Tokenize the manually labeled examples collected so far (./corrected.csv), using
# the tokenizer stored with the model from the previous round.
previous_model_dir = './models/models_al/nb-bert_ex150'
tokenizer = AutoTokenizer.from_pretrained(previous_model_dir)
corrected_150_dataset = load_dataset('csv', data_files='./corrected.csv')
init_corrected_150_encoding = corrected_150_dataset.map(tokenize_function, batched=True)
corrected_150_encoded = init_corrected_150_encoding['train']

# Continue training from the previous round's model
model_path_200 = create_and_train_model(
    previous_model_dir,
    'nb-bert',
    corrected_150_encoded,
    al=True,
    num_al_examples=200,
)

# Predict on the examples that have not been corrected yet with the model trained on
# 200 examples, and save the predictions as pred_200
remaining_texts = list(not_corrected_150['text'])
results_200 = predict_from_fine_tuned_model(model_path_200, remaining_texts)
results_200.to_csv('./temp_pred_during_al/pred_200.csv', index=False)

# Manually correct the 50 most uncertain predictions and add them to the manually
# labeled dataset; the rest is kept in not_corrected_200
corrected_200, not_corrected_200 = correct_most_uncertain_labels(results_200, num_to_correct=50)

## Prediction 6 with 250 examples

In [None]:
# Tokenize the manually labeled examples collected so far (./corrected.csv), using
# the tokenizer stored with the model from the previous round.
previous_model_dir = './models/models_al/nb-bert_ex200'
tokenizer = AutoTokenizer.from_pretrained(previous_model_dir)
corrected_200_dataset = load_dataset('csv', data_files='./corrected.csv')
init_corrected_200_encoding = corrected_200_dataset.map(tokenize_function, batched=True)
corrected_200_encoded = init_corrected_200_encoding['train']

# Continue training from the previous round's model
model_path_250 = create_and_train_model(
    previous_model_dir,
    'nb-bert',
    corrected_200_encoded,
    al=True,
    num_al_examples=250,
)

# Predict on the examples that have not been corrected yet with the model trained on
# 250 examples, and save the predictions as pred_250
remaining_texts = list(not_corrected_200['text'])
results_250 = predict_from_fine_tuned_model(model_path_250, remaining_texts)
results_250.to_csv('./temp_pred_during_al/pred_250.csv', index=False)

# Manually correct the 50 most uncertain predictions and add them to the manually
# labeled dataset; the rest is kept in not_corrected_250
corrected_250, not_corrected_250 = correct_most_uncertain_labels(results_250, num_to_correct=50)

## Prediction 7 with 300 examples

In [None]:
# Tokenize the manually labeled examples collected so far (./corrected.csv), using
# the tokenizer stored with the model from the previous round.
previous_model_dir = './models/models_al/nb-bert_ex250'
tokenizer = AutoTokenizer.from_pretrained(previous_model_dir)
corrected_250_dataset = load_dataset('csv', data_files='./corrected.csv')
init_corrected_250_encoding = corrected_250_dataset.map(tokenize_function, batched=True)
corrected_250_encoded = init_corrected_250_encoding['train']

# Continue training from the previous round's model
model_path_300 = create_and_train_model(
    previous_model_dir,
    'nb-bert',
    corrected_250_encoded,
    al=True,
    num_al_examples=300,
)

# Predict on the examples that have not been corrected yet with the model trained on
# 300 examples, and save the predictions as pred_300
remaining_texts = list(not_corrected_250['text'])
results_300 = predict_from_fine_tuned_model(model_path_300, remaining_texts)
results_300.to_csv('./temp_pred_during_al/pred_300.csv', index=False)

In [None]:
# Extract the 100 most uncertain examples from the pred_300 predictions (the rest is
# kept in not_corrected_300).
# Correct those examples and add them to the manually labeled dataset.
corrected_300, not_corrected_300 = correct_most_uncertain_labels(results_300, 100)

## Prediction 8 with 400 examples

In [None]:
# Tokenize the manually labeled examples collected so far (./corrected.csv), using
# the tokenizer stored with the model from the previous round.
previous_model_dir = './models/models_al/nb-bert_ex300'
tokenizer = AutoTokenizer.from_pretrained(previous_model_dir)
corrected_300_dataset = load_dataset('csv', data_files='./corrected.csv')
init_corrected_300_encoding = corrected_300_dataset.map(tokenize_function, batched=True)
corrected_300_encoded = init_corrected_300_encoding['train']

# Continue training from the previous round's model
model_path_400 = create_and_train_model(
    previous_model_dir,
    'nb-bert',
    corrected_300_encoded,
    al=True,
    num_al_examples=400,
)

# Predict on the examples that have not been corrected yet with the model trained on
# 400 examples, and save the predictions as pred_400
remaining_texts = list(not_corrected_300['text'])
results_400 = predict_from_fine_tuned_model(model_path_400, remaining_texts)
results_400.to_csv('./temp_pred_during_al/pred_400.csv', index=False)

# Manually correct the 100 most uncertain predictions and add them to the manually
# labeled dataset; the rest is kept in not_corrected_400
corrected_400, not_corrected_400 = correct_most_uncertain_labels(results_400, num_to_correct=100)

## Prediction 9 with 500 examples

In [None]:
# Tokenize the manually labeled examples collected so far (./corrected.csv), using
# the tokenizer stored with the model from the previous round.
previous_model_dir = './models/models_al/nb-bert_ex400'
tokenizer = AutoTokenizer.from_pretrained(previous_model_dir)
corrected_400_dataset = load_dataset('csv', data_files='./corrected.csv')
init_corrected_400_encoding = corrected_400_dataset.map(tokenize_function, batched=True)
corrected_400_encoded = init_corrected_400_encoding['train']

# Continue training from the previous round's model
model_path_500 = create_and_train_model(
    previous_model_dir,
    'nb-bert',
    corrected_400_encoded,
    al=True,
    num_al_examples=500,
)

# Predict on the examples that have not been corrected yet with the model trained on
# 500 examples, and save the predictions as pred_500
remaining_texts = list(not_corrected_400['text'])
results_500 = predict_from_fine_tuned_model(model_path_500, remaining_texts)
results_500.to_csv('./temp_pred_during_al/pred_500.csv', index=False)

## Sort data and save final dataset

After looking at the datasets created by the models trained on 400 and 500 examples, I thought the one from the model trained on 400 examples looked a bit better, so I decided to go with that. Most likely, the model trained on 500 examples overfitted to the data, so it was not able to see that emails about topics other than insurance/interest could also be negative.

In [None]:
# Order the predictions from least to most certain and show the first rows
results_400 = results_400.sort_values('score')
results_400.head()

In [None]:
# Show how many examples were predicted for each label
results_400.value_counts('label')

In [None]:
# Save results_400 as pred_400_sorted.csv, without the index column
results_400.to_csv('./temp_pred_during_al/pred_400_sorted.csv', index=False)

## Create dataset to be corrected manually by employees at SMN

The people at SMN will only correct the 100 most and 100 least certain examples, so these will be extracted. Since there seems to be a pattern in which examples the model is most certain of, the examples are shuffled. Also, I noticed that among the most certain examples, pretty much all were marked as positive. Therefore, the dataset to correct is balanced: it contains the 50 most certain and 50 least certain positive examples, and the 50 most certain and 50 least certain negative examples.

In [None]:
# Fixed seed for the shuffle, so that re-running the cell writes the same file
SHUFFLE_SEED = 42

# Sort explicitly (least certain first) so that head()/tail() below do not depend on
# results_400 having been sorted in place by an earlier cell. A stable sort leaves an
# already sorted frame in exactly the same order.
results_400_by_score = results_400.sort_values('score', kind='stable')
pos_examples = results_400_by_score[results_400_by_score['label'] == 'LABEL_1']
neg_examples = results_400_by_score[results_400_by_score['label'] == 'LABEL_0']

# 50 least and 50 most certain examples of each predicted label
pos_50_least_certain = pos_examples.head(50)
pos_50_most_certain = pos_examples.tail(50)
neg_50_least_certain = neg_examples.head(50)
neg_50_most_certain = neg_examples.tail(50)

# Combine, shuffle and remove the score column before saving the file for manual correction
df_for_correction = (
    pd.concat([pos_50_least_certain, pos_50_most_certain, neg_50_least_certain, neg_50_most_certain])
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .drop(columns='score')
)
df_for_correction.to_csv('./temp_pred_during_al/predictions_to_be_corrected.csv', index=False)
df_for_correction

## Use the final manually labeled dataset for training as well

The dataset that was manually labeled by employees will now be added to the dataset of manually labeled examples and one more round of training and predictions will be done. 

#### Add the dataset labeled by SMN to previous corrected, and fix format

In [None]:
# load dataset labeled by SMN
neg_dataset_corrected_by_smn = pd.read_csv('./100_negative_examples_labeled_by_smn_employees.csv', usecols=[0,1,2], sep=';')
pos_dataset_corrected_by_smn = pd.read_csv('./100_positive_examples_labeled_by_smn_employees.csv', usecols=[0,1,2], sep=';')

# Change label if it was wrong
neg_dataset_corrected_by_smn.loc[neg_dataset_corrected_by_smn['Vurdering'] == 'Feil', 'Kategori'] = 'LABEL_1'
pos_dataset_corrected_by_smn.loc[pos_dataset_corrected_by_smn['Vurdering'] == 'Feil', 'Kategori'] = 'LABEL_0'

# Rename columns
neg_dataset_corrected_by_smn.rename(columns={'Tekst':'text', 'Kategori':'label'}, inplace=True)
pos_dataset_corrected_by_smn.rename(columns={'Tekst':'text', 'Kategori':'label'}, inplace=True)

# Remove the 'Vurdering' column
neg_dataset_corrected_by_smn.drop('Vurdering', inplace=True, axis=1)
pos_dataset_corrected_by_smn.drop('Vurdering', inplace=True, axis=1)

# Add to previously labeled dataset and save the final dataset
# NB: If you run this again it will technically be wrong since the corrected.csv file has been updated after this
# line of code was ran (see further down).
corrected = pd.read_csv('./corrected.csv')
corrected_final =  pd.concat([corrected, neg_dataset_corrected_by_smn, pos_dataset_corrected_by_smn], ignore_index=True)

# Turn LABEL_1 to 1 and LABEL_0 to 0 to match the format of previous corrected emails
corrected_final.loc[corrected_final['label'] == 'LABEL_0', 'label'] = 0
corrected_final.loc[corrected_final['label'] == 'LABEL_1', 'label'] = 1

corrected_final.to_csv('./corrected.csv', index=False)

corrected_final

#### Create dataset for predictions later

Unfortunately, the final dataset where all examples in corrected and the ones corrected by Sparebank 1 SMN are present is not saved, so I will have to start with pred_400 and from there remove all examples that also happen to be in corrected, as predictions should not be done on examples seen during training. 

In [None]:
pred_400 = pd.read_csv('temp_pred_during_al/pred_400.csv')

# Keep only the predictions whose text is not part of the manually labeled data, since
# predictions should not be done on examples seen during training. The vectorized isin
# check replaces a row-by-row loop that rebuilt the list of labeled texts for every row.
seen_during_training = pred_400['text'].isin(corrected_final['text'])
df_for_final_prediction = pred_400[~seen_during_training]
df_for_final_prediction

#### Final training and prediction

In [None]:
# Turn dataframe to huggingface dataset
corrected_final_dataset = Dataset.from_pandas(corrected_final)
print(corrected_final_dataset)


# tokenize current training dataset
tokenizer = AutoTokenizer.from_pretrained('./models/models_al/nb-bert_ex400')
corrected_final_encoded = corrected_final_dataset.map(
    tokenize_function,
    batched=True
)

# train model. nb-bert_ex400 is the previously trained model that created the labels for the dataset that was corrected by 
# SMN employees
model_path_600 = create_and_train_model('./models/models_al/nb-bert_ex400',
                      'nb-bert',
                      corrected_final_encoded,
                      al=True,
                      num_al_examples=600)

# Load model trained on 600 examples to do predictions on uncorrected examples using the model and save them as pred_600
results_600 = predict_from_fine_tuned_model(model_path_600, list(df_for_final_prediction['text']))
results_600.to_csv('./temp_pred_during_al/pred_600.csv', index=False)

#### Examine the final dataset

In [None]:
# Reload the saved predictions of the final model and count the examples per label
pred_600_path = './temp_pred_during_al/pred_600.csv'
results_600 = pd.read_csv(pred_600_path)
results_600.value_counts('label')

Something has obviously gone wrong here (probably overfitting). There are definitely too few negative emails; this dataset has so few of them that it would be completely useless for training a BERT model. The best solution seems to be to use pred_400 instead, as it is better.