# Set Up

## Imports 

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

!pip install transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

!pip install datasets
from datasets import Dataset, DatasetDict

import csv

## Load our Dataset

In [None]:
# Load our two datasets
# 1. IceErrorCorpus (IEC): keep only the rows labelled 'correct' and discard
#    the 'Error' column.
# NOTE(review): this file is decoded as latin-1 while the synthetic dataset is
# decoded as UTF-8 -- confirm that latin-1 really is the file's encoding.
correct_data = (
    pd.read_csv("./generated_datasets/labeled_data.csv", encoding='latin-1')
    .loc[lambda frame: frame['label'] == 'correct']
    .drop(columns=['Error'])
)

In [None]:
# 2. Our synthetic dataset generated from IEC and BIN: drop the bookkeeping
#    columns ('type', 'error position') and then remove duplicate rows.
error_data = (
    pd.read_csv("./generated_datasets/synthetic_data.csv", encoding='UTF-8')
    .drop(columns=['type', 'error position'])
    .drop_duplicates()
)

In [None]:
# Fixed seed so the train/test split -- and therefore every metric reported
# further down -- is reproducible across kernel restarts.
RANDOM_SEED = 42

# Combine the correct and synthetic datasets
data = pd.concat([correct_data, error_data])

# Replace the labels with numeric values. The mapping is validated first so
# that an unexpected label value fails here with a clear message instead of
# silently surviving into training.
label_to_int = {'correct': 0, 'incorrect': 1}
unexpected_labels = set(data['label'].unique()) - set(label_to_int)
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values: {sorted(unexpected_labels, key=str)}"
    )
data['label'] = data['label'].map(label_to_int).astype(int)

# Split the data into training and testing sets. Stratifying on the label
# keeps the correct/incorrect ratio the same in both splits.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=data['label'],
)

### Explore our Dataset

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Show pandas' summary statistics for the training split.
train.describe()

### Create a Dataset dictionary

In [None]:
# Convert both pandas splits into Dataset objects (the names `train` and
# `test` are rebound from DataFrames to Datasets here).
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)

# Bundle the splits and drop the "__index_level_0__" column from each of them
# (presumably the pandas index carried over by from_pandas -- confirm).
dataset = DatasetDict({"train": train, "test": test}).remove_columns("__index_level_0__")
dataset

## Configure Model

In [None]:
# Label mappings. They must be exact inverses of each other and agree with the
# numeric encoding applied to the training data (correct -> 0, incorrect -> 1);
# deriving label2id from id2label guarantees the two can never drift apart.
id2label = {0: "correct", 1: "incorrect"}
label2id = {label: idx for idx, label in id2label.items()}

# Model Architecture: pretrained IceBERT with a 2-class classification head.
myTokenizer = AutoTokenizer.from_pretrained("mideind/IceBERT")
myModel = AutoModelForSequenceClassification.from_pretrained(
    "mideind/IceBERT",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)
# Pipeline around the (not yet fine-tuned) model.
classifier = pipeline("text-classification", model=myModel, tokenizer=myTokenizer)

# Tokenizer
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples with truncation enabled."""
    tokenized = myTokenizer(examples["text"], truncation=True)
    return tokenized

# Tokenize every split in batches.
dataset = dataset.map(preprocess_function, batched=True)
# Collator built from the same tokenizer; handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=myTokenizer)

# Training Args
# NOTE(review): newer transformers releases reportedly rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed
# version, since the install in this notebook is not pinned.
training_args = TrainingArguments(
    output_dir="trainingArguments",
    # Optimisation schedule
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes (per device)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer
trainer = Trainer(
    # What to train and how
    model=myModel,
    args=training_args,
    # Data: the "train" split is fitted, the "test" split is evaluated
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # Batch assembly and tokenizer
    data_collator=data_collator,
    tokenizer=myTokenizer,
)

# Train our model

In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train()

# Save the trainer's model to a local directory.
# NOTE(review): with load_best_model_at_end=True in the training arguments this
# is presumably the best checkpoint rather than the last one -- confirm.
trainer.save_model("refactored_model_v2_non_nouns")

# Sanity check our model

In [None]:
# Rebuild the pipeline now that training has finished.
classifier = pipeline("text-classification", model=myModel, tokenizer=myTokenizer)

# Hand-written minimal pairs: in each tuple the first sentence is meant to
# contain a grammatical error and the second is its corrected form, so a
# well-trained model should label them "incorrect" and "correct" respectively.
sanity_pairs = [
    # I went to "Torfi's" last night and had a beer.
    ("Ég fór til Torfi í gær og fékk mér bjór.",
     "Ég fór til Torfa í gær og fékk mér bjór."),
    # The stable is painted blue.
    ("Hestahúsið er málaður blátt.",
     "Hestahúsið er málað blátt."),
    # My captain can't drive very well, but she owns a red dog.
    ("Skipstjórinn minn kann ekki að keyra mjög vel en hún á rauður hund.",
     "Skipstjórinn minn kann ekki að keyra mjög vel en hún á rauðan hund."),
    # I drive my car every day. (error on the noun)
    ("Ég keyri bíllinn minn á hverjum einasta degi.",
     "Ég keyri bílinn minn á hverjum einasta degi."),
    # I drive my car every day. (error on the possessive)
    ("Ég keyri bílinn mínum á hverjum einasta degi.",
     "Ég keyri bílinn minn á hverjum einasta degi."),
]

for pair_index, (erroneous_sentence, corrected_sentence) in enumerate(sanity_pairs):
    if pair_index:
        # Separator between pairs (not before the first, not after the last).
        print("---")
    print(classifier(erroneous_sentence))
    print(classifier(corrected_sentence))

# Evaluate our model

In [None]:
# Confusion-matrix buckets; each holds the raw pipeline predictions.
# NOTE: "correct" is treated as the positive class here, so the precision,
# recall and F1 printed below describe how well the model recognises
# *correct* sentences.
truePos = []
trueNeg = []
falsePos = []
falseNeg = []


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Open the csv file, and read each sentence.
# The encoding is given explicitly because the sentences contain non-ASCII
# Icelandic characters and the platform default encoding is not guaranteed to
# be UTF-8 (assumes the file is UTF-8 like the synthetic training set -- TODO
# confirm). newline='' is what the csv module asks for on files it reads.
with open('./generated_datasets/April/synthetic_validation_set.csv',
          encoding='utf-8', newline='') as csv_file:
    my_file = csv.reader(csv_file)
    next(my_file, None)  # skip the headers
    for line in my_file:
        if not line:
            # csv.reader yields an empty list for blank lines; skip them
            # instead of failing on line[0].
            continue
        prediction = classifier(line[0])
        predicted_label = prediction[0]['label']
        expected_label = line[1]
        if predicted_label == 'correct':
            if expected_label == 'correct':
                truePos.append(prediction)
            elif expected_label == 'incorrect':
                falsePos.append(prediction)
            else:
                print("Unexpected Error code 001")
        elif predicted_label == 'incorrect':
            if expected_label == 'incorrect':
                trueNeg.append(prediction)
            elif expected_label == 'correct':
                falseNeg.append(prediction)
            else:
                print("Unexpected Error code 002")
        else:
            print("Unexpected Error code 003")

print("True Positives:  ",len(truePos))
print("True Negatives:  ",len(trueNeg))
print("False Positives: ",len(falsePos))
print("False Negatives: ",len(falseNeg))

# Every ratio goes through _safe_ratio so an empty validation file, or a model
# that never predicts one of the classes, reports 0.0 instead of raising
# ZeroDivisionError.
accuracy = _safe_ratio(
    len(truePos)+len(trueNeg),
    len(truePos)+len(trueNeg)+len(falsePos)+len(falseNeg),
)
precision = _safe_ratio(len(truePos), len(truePos)+len(falsePos))
recall = _safe_ratio(len(truePos), len(truePos)+len(falseNeg))
f1 = 2 * _safe_ratio(precision*recall, precision+recall)

print("Accuracy: ",accuracy)
print("Precision: ",precision)
print("Recall: ",recall)
print("F1-score: ",f1)