In [None]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [None]:
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer,BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from collections import defaultdict

from sklearn.metrics import confusion_matrix, classification_report,precision_score,accuracy_score,f1_score
from sklearn.model_selection import StratifiedKFold


import pandas as pd
import spacy
import os
import sys

import logging
logging.basicConfig(level = logging.ERROR)


from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def load_vikiwiki_dataset(path: str, label: str|int, difficulty: str, lang: str = "en") -> pd.DataFrame:
    cols = ["content", "labels", "difficulty"]
    result = []
    load_path = path
    if os.path.isdir(path):
        dir_file_list = os.listdir(path)
        dir_file_list.sort()
        if lang in dir_file_list:
            load_path = os.path.join(load_path, lang)
            dir_file_list = os.listdir(load_path)
            dir_file_list.sort()
        for file in dir_file_list:
            with open(os.path.join(load_path, file), "r") as f:
                data = "".join(f.readlines())
            if len(data) > 0:
                result.append([data, label, difficulty])
    return pd.DataFrame(
        data=result,
        columns=cols
    )

In [None]:
# Each language frame stacks the files found under .../vikidia (label 0,
# "beginner") on top of those under .../wikipedia (label 1, "advanced").
def _load_language_frame(lang):
    """Return the concatenated vikidia + wikipedia frame for one language code."""
    sources = [
        ("/content/drive/MyDrive/data/vikidia", 0, "beginner"),
        ("/content/drive/MyDrive/data/wikipedia", 1, "advanced"),
    ]
    frames = [
        load_vikiwiki_dataset(src_path, src_label, src_level, lang)
        for src_path, src_label, src_level in sources
    ]
    return pd.concat(frames, axis=0, ignore_index=True)


df_ca = _load_language_frame("ca")
df_en = _load_language_frame("en")
df_es = _load_language_frame("es")
df_eu = _load_language_frame("eu")
df_fr = _load_language_frame("fr")
df_it = _load_language_frame("it")


In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(device)

cpu


In [None]:
# Difficulty names and the numeric codes found in df_en. The pairing below is
# positional, so it relies on unique() returning the codes in the same order
# as the names are listed here.
diff = ['beginner', 'advanced']
num_diff = df_en.labels.unique()

# numeric code -> difficulty name
label = {code: name for code, name in zip(num_diff, diff)}


Data Preprocessing

In [None]:
import re

In [None]:
# Lookup table from English contractions to their expanded forms.
# In the code shown here it is referenced only by a commented-out line in clean_text.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}

In [None]:

def clean_text(text):
    """Reduce a document to space-separated, purely alphabetic tokens.

    Steps, in order:
      1. delete every ``'s`` substring;
      2. drop a leading ``Intermediate`` token if present;
      3. keep only whitespace-separated tokens made entirely of letters
         (a token with attached punctuation or digits, e.g. ``"world,"``,
         is dropped whole);
      4. replace every remaining character outside ``a-zA-Z`` with a space.

    Contraction expansion via ``contraction_mapping`` is not applied.

    Args:
        text: Document as a string.

    Returns:
        The cleaned string; ``""`` for empty or whitespace-only input.
    """
    text = re.sub("'s", "", text)
    tokens = text.split()
    # Guard: indexing tokens[0] on an empty document would raise IndexError.
    if not tokens:
        return ""
    if tokens[0] == 'Intermediate':
        tokens = tokens[1:]
    text = ' '.join(tok for tok in tokens if tok.isalpha())
    # str.isalpha() accepts non-ASCII letters, so this pass still blanks accented characters.
    text = re.sub('[^a-zA-Z]', " ", text)
    return text

In [None]:
df_en['content'] = df_en['content'].apply(clean_text)


In [None]:
df_en

Unnamed: 0,content,labels,Difficulty
0,When you see the word the first thing you thin...,0,beginner
1,To Amsterdam still seems very Recently the May...,0,beginner
2,a music star from has millions of but she is a...,0,beginner
3,Google has made maps of the highest the ocean ...,0,beginner
4,The auction of a Banksy painting that disappea...,0,beginner
...,...,...,...
562,In typical Dan Sullivan was late to breakfast ...,2,advanced
563,Thousands of people protested on beaches again...,2,advanced
564,Race engineer A race engineer liaises between ...,2,advanced
565,More than one million British workers might be...,2,advanced


Sentence length

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Frames tokenized with a language-specific NLTK model, paired by position
# with the language name handed to the tokenizers.
langs = ["english", "spanish", "french", "italian"]
datasets = [df_en, df_es, df_fr, df_it]
#[df_ca, df_en, df_es, df_eu, df_fr, df_it]

In [None]:
# For every frame, split each document into sentences with the tokenizer for
# that frame's language and store the mean number of word tokens per sentence.
for k, df in enumerate(datasets):
  punkt_lang = langs[k]
  doc_means = []
  for text in df['content']:
    words_per_sentence = [
      len(word_tokenize(sentence, language=punkt_lang))
      for sentence in sent_tokenize(text, language=punkt_lang)
    ]
    # np.mean of an empty list is NaN, so documents without sentences get NaN.
    doc_means.append(np.mean(words_per_sentence))
  df['mean_sentence_len'] = doc_means

In [None]:
df_es

Unnamed: 0,content,labels,difficulty,mean_sentence_len
0,tk es dominio de primer nivel para Tokelau.\n,0,beginner,9.000000
1,"Doce (lat. ""duodecim"", dos más diez) es un núm...",0,beginner,15.500000
2,Trece es un número natural entre el doce y el ...,0,beginner,13.333333
3,Quince es un número natural entre el trece y e...,0,beginner,14.333333
4,\n,0,beginner,
...,...,...,...,...
829,El yo-yo es un juguete formado por un disco de...,1,advanced,24.960000
830,Zacatecas es uno de los treinta y un estados q...,1,advanced,33.501370
831,Zagreb escucha (?·i) es la capital y la ciuda...,1,advanced,24.462500
832,"Daucus carota subespecie sativus, llamada popu...",1,advanced,28.288889


In [None]:
# Catalan and Basque frames: tokenized without a `language=` argument, i.e.
# with whatever default language NLTK applies.
dataset_mult = [df_ca, df_eu]
# Iterate over dataset_mult, not `datasets`: looping over `datasets` here would
# leave df_ca/df_eu without the column and overwrite the language-specific
# values already computed for the other frames.
for df in dataset_mult:
  sen_len = []
  for i in df['content']:
    curr_sen_len = [len(word_tokenize(j)) for j in sent_tokenize(i)]
    # np.mean of an empty list is NaN, so documents without sentences get NaN.
    sen_len.append(np.mean(curr_sen_len))
  df['mean_sentence_len'] = sen_len

In [None]:
!python -m spacy download ca_core_news_sm
!python -m spacy download fr_core_news_sm
!python -m spacy download it_core_news_sm
!python -m spacy download es_core_news_sm

!python -m spacy download xx_ent_wiki_sm

2023-12-20 09:13:22.601221: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-20 09:13:22.601298: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-20 09:13:22.603548: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting ca-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ca_core_news_sm-3.6.0/ca_core_news_sm-3.6.0-py3-none-any.whl (19.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ca-core-news-sm
Successfully insta

In [None]:
def walk_tree(node, depth):
    """Return the depth of the deepest descendant of ``node``.

    ``depth`` is the depth assigned to ``node`` itself; a node without left or
    right children returns it unchanged, otherwise the maximum over the
    children (each visited at ``depth + 1``) is returned.
    """
    is_leaf = (node.n_lefts + node.n_rights) <= 0
    if is_leaf:
        return depth
    child_depths = [walk_tree(child, depth + 1) for child in node.children]
    return max(child_depths)

def get_mean_tree_height(text, nlp=None):
  """Return the mean parse-tree height over the sentences of ``text``.

  The text is split with ``sent_tokenize``; each piece is parsed, and for
  every sentence the parser reports, the height measured by ``walk_tree``
  from the sentence root is collected.

  Args:
    text: Document to analyse.
    nlp: Callable pipeline used to parse each sentence. Defaults to the
      module-level ``lang_nlp`` so existing ``apply`` calls keep working.

  Returns:
    Mean height, or NaN when no sentence is found.
  """
  parser = lang_nlp if nlp is None else nlp
  depths = []
  for sentence in sent_tokenize(text):
    doc = parser(sentence)
    depths.extend(walk_tree(sent.root, 0) for sent in doc.sents)
  # Explicit NaN for empty documents instead of np.mean([]) and its RuntimeWarning.
  if not depths:
    return np.nan
  return np.mean(depths)

In [None]:
lang_nlp = spacy.load("ca_core_news_sm")
df_ca['mean_tree_height'] = df_ca['content'].progress_apply(get_mean_tree_height)

  0%|          | 0/856 [00:00<?, ?it/s]

In [None]:
lang_nlp = spacy.load('en_core_web_sm')
df_en['mean_tree_height'] = df_en['content'].progress_apply(get_mean_tree_height)

  0%|          | 0/864 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
lang_nlp = spacy.load("es_core_news_sm")
df_es['mean_tree_height'] = df_es['content'].progress_apply(get_mean_tree_height)

  0%|          | 0/834 [00:00<?, ?it/s]

In [None]:
lang_nlp = spacy.load("fr_core_news_sm")
df_fr['mean_tree_height'] = df_fr['content'].progress_apply(get_mean_tree_height)

  0%|          | 0/756 [00:00<?, ?it/s]

In [None]:
lang_nlp = spacy.load("it_core_news_sm")
df_it['mean_tree_height'] = df_it['content'].progress_apply(get_mean_tree_height)

In [None]:
# Swap the global pipeline used by get_mean_tree_height, then score the Basque frame.
# NOTE(review): get_mean_tree_height iterates doc.sents and walks each token's
# children; confirm that the "xx_ent_wiki_sm" pipeline provides sentence
# boundaries and a dependency parse, otherwise this cell cannot produce heights.
lang_nlp = spacy.load("xx_ent_wiki_sm")
df_eu['mean_tree_height'] = df_eu['content'].progress_apply(get_mean_tree_height)

In [None]:
df_es

Unnamed: 0,content,labels,difficulty,mean_sentence_len,mean_tree_height
0,tk es dominio de primer nivel para Tokelau.\n,0,beginner,9.000000,3.000000
1,"Doce (lat. ""duodecim"", dos más diez) es un núm...",0,beginner,15.500000,3.000000
2,Trece es un número natural entre el doce y el ...,0,beginner,13.333333,4.333333
3,Quince es un número natural entre el trece y e...,0,beginner,14.333333,3.333333
4,\n,0,beginner,,
...,...,...,...,...,...
829,El yo-yo es un juguete formado por un disco de...,1,advanced,24.960000,4.461538
830,Zacatecas es uno de los treinta y un estados q...,1,advanced,33.504110,5.176781
831,Zagreb escucha (?·i) es la capital y la ciuda...,1,advanced,24.462500,4.172840
832,"Daucus carota subespecie sativus, llamada popu...",1,advanced,28.288889,4.604167


In [None]:
df_es = df_es.dropna()
df_es

Unnamed: 0,content,labels,difficulty,mean_sentence_len,mean_tree_height
0,tk es dominio de primer nivel para Tokelau.\n,0,beginner,9.000000,3.000000
1,"Doce (lat. ""duodecim"", dos más diez) es un núm...",0,beginner,15.500000,3.000000
2,Trece es un número natural entre el doce y el ...,0,beginner,13.333333,4.333333
3,Quince es un número natural entre el trece y e...,0,beginner,14.333333,3.333333
5,1955 fue un año del siglo XX de 365 días.\n,0,beginner,11.000000,2.000000
...,...,...,...,...,...
829,El yo-yo es un juguete formado por un disco de...,1,advanced,24.960000,4.461538
830,Zacatecas es uno de los treinta y un estados q...,1,advanced,33.504110,5.176781
831,Zagreb escucha (?·i) es la capital y la ciuda...,1,advanced,24.462500,4.172840
832,"Daucus carota subespecie sativus, llamada popu...",1,advanced,28.288889,4.604167


In [None]:
df_fr.isna().sum()

content              0
labels               0
difficulty           0
mean_sentence_len    0
mean_tree_height     0
dtype: int64

In [None]:
df_ca.isna().sum()

In [None]:
from transformers import BertTokenizer,BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup


In [None]:
class DiffDataset(Dataset):
    """Torch dataset yielding a tokenized text, its label and two numeric features.

    Args:
        contents: Indexable collection of documents.
        labels: Indexable collection of integer class labels aligned with ``contents``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length every encoded sequence is padded or truncated to.
        feature_1: Optional indexable collection of per-document numeric features.
        feature_2: Optional second collection, same contract as ``feature_1``.
    """

    def __init__(self, contents, labels, tokenizer, max_len, feature_1=None, feature_2=None):
        self.contents = contents
        self.labels = labels
        self.tokenizer = tokenizer

        self.feature_1 = feature_1
        self.feature_2 = feature_2

        self.max_len = max_len

    def __len__(self):
        return len(self.contents)

    @staticmethod
    def _feature_tensor(values, item):
        """Return the feature at ``item`` as a float tensor of shape (1,).

        A collection that was not supplied (``None``) yields 0.0, so the
        declared ``None`` defaults are usable and batches keep a fixed layout.
        """
        value = 0.0 if values is None else values[item]
        # Float, not long: the features are means and must keep their fractional part.
        return torch.tensor([value], dtype=torch.float)

    def __getitem__(self, item):
        """Return a dict with the raw text, token ids, attention mask, label and features."""
        content = str(self.contents[item])
        labels = self.labels[item]

        # Encode with special tokens, padding short texts and truncating long
        # ones so every sample has exactly max_len token ids.
        encoding = self.tokenizer.encode_plus(
            content,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',  # PyTorch tensors with a leading batch dim, flattened below
        )

        return {
            'content': content,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long),
            'feature_1': self._feature_tensor(self.feature_1, item),
            'feature_2': self._feature_tensor(self.feature_2, item),
        }


In [None]:
torch.tensor([[12, 23, 45]], dtype=torch.long).shape

torch.Size([1, 3])

In [None]:
def _make_dataloader(df, feature_1_col, feature_2_col, tokenizer, max_len, batch_size):
    """Wrap ``df`` in a DiffDataset using the two named feature columns and return its DataLoader."""
    dataset = DiffDataset(
        contents=df.content.to_numpy(),
        labels=df.labels.to_numpy(),
        # getattr keeps the attribute-style column access of the original code.
        feature_1=getattr(df, feature_1_col).to_numpy(),
        feature_2=getattr(df, feature_2_col).to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(dataset, batch_size=batch_size, num_workers=2)


def generate_dataloader_1(df, tokenizer, max_len, batch_size):
    """DataLoader whose features are mean sentence length and mean tree height."""
    return _make_dataloader(df, "mean_sentence_len", "mean_tree_height", tokenizer, max_len, batch_size)


def generate_dataloader_2(df, tokenizer, max_len, batch_size):
    """DataLoader whose features are mean sentence length and ``avg_nounphrases``."""
    return _make_dataloader(df, "mean_sentence_len", "avg_nounphrases", tokenizer, max_len, batch_size)


def generate_dataloader_3(df, tokenizer, max_len, batch_size):
    """DataLoader whose features are mean tree height and ``avg_nounphrases``."""
    return _make_dataloader(df, "mean_tree_height", "avg_nounphrases", tokenizer, max_len, batch_size)

In [None]:
#model_name = 'bert-base-multilingual-uncased'
model_name = 'bert-base-uncased'

In [None]:
tokenizer = BertTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
max_len = 256
batch_size = 8

# train_dataloader = generate_dataloader(df_train, tokenizer, max_len,batch_size)
# val_dataloader = generate_dataloader(df_val, tokenizer, max_len,batch_size)
# test_dataloader = generate_dataloader(df_test, tokenizer, max_len,batch_size)

#data = next(iter(train_dataloader))



In [None]:
class DiffClassifier(nn.Module):
    """BERT encoder followed by a linear head over its pooled output plus two scalar features.

    Args:
        n_classes: Number of output logits produced by the linear head.
    """

    def __init__(self, n_classes):
        super(DiffClassifier, self).__init__()
        # BertModel is the encoder class this notebook imports; the previous
        # AutoModel reference was never imported and raised NameError.
        self.bert = BertModel.from_pretrained(model_name)
        # +2 input units: one per extra feature concatenated in forward().
        self.out = nn.Linear(self.bert.config.hidden_size+2, n_classes)

    def forward(self, input_ids, attention_mask, feature_1, feature_2):
        """Return class logits for a batch.

        ``feature_1`` and ``feature_2`` are concatenated to the encoder output
        along dim 1 (assumes each is shaped (batch, 1), matching the ``+2``
        above — TODO confirm against the dataset).
        """
        model_outs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Second element of the encoder output, used here as the pooled representation.
        pooled_output = model_outs[1]
        # Cast the features to the encoder's dtype so the concatenation does not
        # depend on how the dataset typed them.
        extras = [feature_1.to(pooled_output.dtype), feature_2.to(pooled_output.dtype)]
        combined = torch.cat([pooled_output, *extras], dim=1)

        return self.out(combined)


In [None]:
len(diff)

2

In [None]:
# Computing multiclass precision for the outputs of the model
def compute_precision(outputs, labels):
  """Weighted-average precision of the arg-max predictions for one batch.

  Both tensors are moved to the CPU, the highest-scoring class along dim 1 is
  taken as the prediction, and the score from ``precision_score``
  (``average='weighted'``, ``zero_division=0``) is returned as a tensor.
  """
  cpu_scores = outputs.cpu()
  cpu_targets = labels.cpu()
  predicted = torch.max(cpu_scores, dim=1).indices
  score = precision_score(cpu_targets, predicted, average='weighted', zero_division=0)
  return torch.tensor(score)

In [None]:
def train(model, train_dataloader, optimizer, scheduler, loss_fn, df_train):
    """Run one training epoch.

    Batches are moved to the module-level ``device``. Returns a tuple of
    (accuracy, mean loss, mean batch precision), where accuracy is the number
    of correct predictions divided by ``len(df_train)`` and precision is the
    average of ``compute_precision`` over the batches.
    """
    correct_predictions = 0
    precision_sum = 0
    n_batches = 0
    batch_losses = []

    model.train()

    for batch in train_dataloader:
        n_batches += 1

        labels = batch["labels"].to(device)
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            feature_1=batch["feature_1"].to(device),
            feature_2=batch["feature_2"].to(device),
        )

        # Index of the largest logit per row is the predicted class.
        preds = torch.max(outputs, dim=1)[1]
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        batch_losses.append(loss.item())
        precision_sum += compute_precision(outputs, labels)

        # Backward pass, clip the gradient norm at 1.0, then advance the
        # optimizer and the learning-rate schedule before clearing gradients.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = correct_predictions.double() / len(df_train)
    return accuracy, np.mean(batch_losses), precision_sum / n_batches

In [None]:
def eval(model, valid_dataloader, loss_fn, device, n):
    """Evaluate ``model`` on ``valid_dataloader`` without tracking gradients.

    Note that this name shadows Python's built-in ``eval``.

    Args:
        model: Classifier to evaluate; switched to eval mode.
        valid_dataloader: Iterable of batch dicts.
        loss_fn: Callable applied to (outputs, labels).
        device: Device each batch tensor is moved to.
        n: Denominator used for the accuracy.

    Returns:
        (accuracy, mean loss, mean batch precision).
    """
    model = model.eval()

    correct_predictions = 0
    precision_sum = 0
    n_batches = 0
    batch_losses = []

    with torch.no_grad():
        for batch in valid_dataloader:
            n_batches += 1

            labels = batch["labels"].to(device)
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                feature_1=batch["feature_1"].to(device),
                feature_2=batch["feature_2"].to(device),
            )

            # Index of the largest logit per row is the predicted class.
            preds = torch.max(outputs, dim=1)[1]
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            precision_sum += compute_precision(outputs, labels)
            batch_losses.append(loss.item())

    accuracy = correct_predictions.double() / n
    return accuracy, np.mean(batch_losses), precision_sum / n_batches

In [None]:
def testing(model, dataloader):
  """Run inference over ``dataloader`` and gather texts, predictions, probabilities and labels.

  Batches are moved to the module-level ``device``; gradients are disabled.

  Returns:
    (contents, predictions, prediction_probs, real_values): the list of raw
    texts followed by three CPU tensors holding the predicted class index,
    the softmax probabilities and the true label of every sample.
  """
  model = model.eval()

  contents = []
  pred_batches = []
  prob_batches = []
  label_batches = []

  with torch.no_grad():
    for batch in dataloader:
      labels = batch["labels"].to(device)
      outputs = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        feature_1=batch["feature_1"].to(device),
        feature_2=batch["feature_2"].to(device),
      )

      contents.extend(batch["content"])
      pred_batches.append(torch.max(outputs, dim=1)[1])
      prob_batches.append(F.softmax(outputs, dim=1))
      label_batches.append(labels)

  # Joining the per-batch tensors along dim 0 gives one row per sample.
  predictions = torch.cat(pred_batches).cpu()
  prediction_probs = torch.cat(prob_batches).cpu()
  real_values = torch.cat(label_batches).cpu()
  return contents, predictions, prediction_probs, real_values

In [None]:
# File the best checkpoint is written to and restored from.
best_model_path = "./bert_model_baseline.bin"

def save_best_model(model):
  """Write ``model``'s state dict to ``best_model_path``."""
  torch.save(model.state_dict(), best_model_path)

def load_best_model(model):
  """Load the checkpoint at ``best_model_path`` into ``model`` and return it.

  Tensors are deserialized onto the CPU first, so a checkpoint written on a
  GPU runtime can still be restored on a CPU-only one; ``load_state_dict``
  then copies the values into the model's existing parameters.
  """
  state_dict = torch.load(best_model_path, map_location="cpu")
  model.load_state_dict(state_dict)
  return model

In [None]:
class EarlyStopping:
    """Signal a stop once validation accuracy fails to improve ``patience`` times in a row.

    Each time the accuracy exceeds the best value seen so far, the tracked
    model is checkpointed through ``save_best_model``.
    """

    def __init__(self, model, patience=1):
        self.model = model
        self.patience = patience  # consecutive non-improving checks tolerated before stopping
        self.counter = 0  # non-improving checks seen since the last improvement
        self.max_acc = 0  # best validation accuracy observed so far
        self.min_validation_loss = np.inf  # initialised here; not read by early_stop_check

    def early_stop_check(self, val_acc):
        """Record ``val_acc`` and return True when training should stop."""
        improved = val_acc > self.max_acc
        if improved:
            # New best: remember it, restart the patience window, checkpoint the model.
            self.max_acc = val_acc
            self.counter = 0
            save_best_model(self.model)
            return False
        self.counter += 1
        return bool(self.counter >= self.patience)

In [None]:
def train_loop(df, fields:list[str], generate_dataloader, epochs:int=10, n_fold:int=10, random_state:int=42, patience:int=4):
  """Cross-validate `DiffClassifier` on `df` with stratified k-fold splitting.

  For every fold the held-out part becomes the test set and the remainder is
  split 80/20 into train and validation sets. A fresh model is trained for up
  to `epochs` epochs with early stopping on validation accuracy, the best
  checkpoint is reloaded through `load_best_model`, and the accuracy on the
  test part is recorded.

  Relies on notebook globals: `tokenizer`, `max_len`, `batch_size`, `device`,
  `diff`, `DiffClassifier`, `train`, `eval`, `testing` and `load_best_model`.

  Args:
    df: DataFrame containing the "content" and "difficulty" columns and every
      column named in `fields`.
    fields: names of the feature columns used by this run; they are excluded
      from the stratification target.
    generate_dataloader: callable invoked as
      `generate_dataloader(frame, tokenizer, max_len, batch_size)` for the
      train, validation and test frames.
    epochs: maximum number of training epochs per fold.
    n_fold: number of cross-validation folds.
    random_state: seed for the fold split and the train/validation split.
    patience: non-improving validation checks tolerated by `EarlyStopping`.

  Returns:
    `(folds_acc, histories)`: the test accuracy of each fold, and one history
    dict per fold holding the per-epoch train/val accuracy, loss and precision.
  """
  folds_acc = []
  histories = []

  k_fold = StratifiedKFold(n_splits=n_fold, random_state=random_state, shuffle=True)
  # list.extend() mutates in place and returns None, so the columns to drop are
  # assembled into one list here instead of passing the result of extend().
  # NOTE(review): `y` is whatever columns remain after the drop; StratifiedKFold
  # expects a single class-label target, so confirm only the label column is left.
  y = df.drop(["content", "difficulty", *fields], axis=1)

  for i, (temp_index, test_index) in enumerate(k_fold.split(df, y)):
    print(" ------   Fold {}  ------- ".format(i+1), end="\n")
    df_temp = df.iloc[temp_index]
    df_train, df_val = train_test_split(df_temp, test_size=0.2, random_state=random_state)
    df_test = df.iloc[test_index]
    train_dataloader = generate_dataloader(df_train, tokenizer, max_len, batch_size)
    val_dataloader = generate_dataloader(df_val, tokenizer, max_len, batch_size)
    test_dataloader = generate_dataloader(df_test, tokenizer, max_len, batch_size)

    # A fresh classifier per fold, moved to the configured device
    model = DiffClassifier(len(diff))
    model = model.to(device)

    optimizer = AdamW(model.parameters(),
                      lr=2e-5,
                      weight_decay=0.2,
                      correct_bias=False)
    # The scheduler takes a number of steps, not a fraction: warm up over the
    # first 10% of all optimizer steps.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(0.1 * total_steps),
                                                num_training_steps=total_steps)
    loss_fn = nn.CrossEntropyLoss().to(device)

    history = defaultdict(list)

    early_stopping = EarlyStopping(model, patience=patience)

    for epoch in range(epochs):

      print(f'Epoch {epoch + 1}/{epochs}')
      print('-' * 20)

      train_acc, train_loss, train_preci = train(
            model,
            train_dataloader,
            optimizer,
            scheduler,
            loss_fn,
            df_train
          )
      print(f"Train : Loss {train_loss}, Accuracy : {train_acc*100:.2f} %, Precision : {train_preci}")

      history['train_acc'].append(train_acc)
      history['train_loss'].append(train_loss)
      history['train_precision'].append(train_preci)

      val_acc, val_loss, val_preci = eval(
            model,
            val_dataloader,
            loss_fn,
            device,
            len(df_val),
          )

      print(f'Val : Loss :{val_loss}, Accuracy : {val_acc*100:.2f} %, Precision : {val_preci}')
      print()

      history['val_acc'].append(val_acc)
      history['val_loss'].append(val_loss)
      history['val_precision'].append(val_preci)

      if early_stopping.early_stop_check(val_acc):
        break

    # One history per fold; appending inside the epoch loop would store the
    # same dict object once per epoch.
    histories.append(history)

    # Evaluate the checkpointed weights rather than the last-epoch weights
    model = load_best_model(model)
    y_contents, y_pred, y_pred_probs, y_test = testing(
          model,
          test_dataloader
        )
    acc = accuracy_score(y_test, y_pred)
    print("Test Accuracy for fold {}: {:.2f}%\n".format(i+1, acc*100))
    folds_acc.append(acc)
  return folds_acc, histories

English Language

In [None]:
# Point the notebook-level `df` at the English data; the cells below pass it to train_loop.
df = df_en

In [None]:
# Run train_loop on the mean_sentence_len + mean_tree_height pair.
# presumably generate_dataloader_1 is the loader variant for this feature pair — confirm against its definition
# NOTE: these result names are reused by the other language sections and are overwritten when those cells run.
mean_sen_tree_folds_acc, mean_sen_tree_histories =  train_loop(df, ["mean_sentence_len", "mean_tree_height"], generate_dataloader_1)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for mean sentence length and mean_tree_height features for English Language is: {:.3f}%".format(sum(mean_sen_tree_folds_acc) * 100 / len(mean_sen_tree_folds_acc)))

In [None]:
# Run train_loop on the mean_sentence_len + avg_nounphrases pair.
# presumably generate_dataloader_2 is the loader variant for this feature pair — confirm against its definition
mean_sen_nounphrases_folds_acc, mean_sen_nounphrases_histories = train_loop(df, ["mean_sentence_len", "avg_nounphrases"], generate_dataloader_2)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for mean sentence length and avg_nounphrases features for English language is: {:.3f}%".format(sum(mean_sen_nounphrases_folds_acc) * 100 / len(mean_sen_nounphrases_folds_acc)))

In [None]:
# Run train_loop on the avg_nounphrases + mean_tree_height pair.
# presumably generate_dataloader_3 is the loader variant for this feature pair — confirm against its definition
avg_nounphrases_tree_folds_acc, avg_nounphrases_tree_histories =  train_loop(df, ["avg_nounphrases", "mean_tree_height"], generate_dataloader_3)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for avg_nounphrases and mean tree height features for English Language is: {:.3f}%".format(sum(avg_nounphrases_tree_folds_acc) * 100 / len(avg_nounphrases_tree_folds_acc)))

Spanish Language

In [None]:
# Point the notebook-level `df` at the Spanish data; the cells below pass it to train_loop.
df = df_es

In [None]:
# Run train_loop on the mean_sentence_len + mean_tree_height pair.
# presumably generate_dataloader_1 is the loader variant for this feature pair — confirm against its definition
# NOTE: these result names are reused by the other language sections and are overwritten when those cells run.
mean_sen_tree_folds_acc, mean_sen_tree_histories =  train_loop(df, ["mean_sentence_len", "mean_tree_height"], generate_dataloader_1)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for mean sentence length and mean_tree_height features for Spanish Language is: {:.3f}%".format(sum(mean_sen_tree_folds_acc) * 100 / len(mean_sen_tree_folds_acc)))

In [None]:
# Run train_loop on the mean_sentence_len + avg_nounphrases pair.
# presumably generate_dataloader_2 is the loader variant for this feature pair — confirm against its definition
mean_sen_nounphrases_folds_acc, mean_sen_nounphrases_histories = train_loop(df, ["mean_sentence_len", "avg_nounphrases"], generate_dataloader_2)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for mean sentence length and avg_nounphrases features for Spanish language is: {:.3f}%".format(sum(mean_sen_nounphrases_folds_acc) * 100 / len(mean_sen_nounphrases_folds_acc)))

In [None]:
# Run train_loop on the avg_nounphrases + mean_tree_height pair.
# presumably generate_dataloader_3 is the loader variant for this feature pair — confirm against its definition
avg_nounphrases_tree_folds_acc, avg_nounphrases_tree_histories =  train_loop(df, ["avg_nounphrases", "mean_tree_height"], generate_dataloader_3)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for avg_nounphrases and mean tree height features for Spanish Language is: {:.3f}%".format(sum(avg_nounphrases_tree_folds_acc) * 100 / len(avg_nounphrases_tree_folds_acc)))

French Language

In [None]:
# Point the notebook-level `df` at the French data; the cells below pass it to train_loop.
df = df_fr

In [None]:
# Run train_loop on the mean_sentence_len + mean_tree_height pair.
# presumably generate_dataloader_1 is the loader variant for this feature pair — confirm against its definition
# NOTE: these result names are reused by the other language sections and are overwritten when those cells run.
mean_sen_tree_folds_acc, mean_sen_tree_histories =  train_loop(df, ["mean_sentence_len", "mean_tree_height"], generate_dataloader_1)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for mean sentence length and mean_tree_height features for French Language is: {:.3f}%".format(sum(mean_sen_tree_folds_acc) * 100 / len(mean_sen_tree_folds_acc)))

In [None]:
# Run train_loop on the mean_sentence_len + avg_nounphrases pair.
# presumably generate_dataloader_2 is the loader variant for this feature pair — confirm against its definition
mean_sen_nounphrases_folds_acc, mean_sen_nounphrases_histories = train_loop(df, ["mean_sentence_len", "avg_nounphrases"], generate_dataloader_2)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for mean sentence length and avg_nounphrases features for French language is: {:.3f}%".format(sum(mean_sen_nounphrases_folds_acc) * 100 / len(mean_sen_nounphrases_folds_acc)))

In [None]:
# Run train_loop on the avg_nounphrases + mean_tree_height pair.
# presumably generate_dataloader_3 is the loader variant for this feature pair — confirm against its definition
avg_nounphrases_tree_folds_acc, avg_nounphrases_tree_histories =  train_loop(df, ["avg_nounphrases", "mean_tree_height"], generate_dataloader_3)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for avg_nounphrases and mean tree height features for French Language is: {:.3f}%".format(sum(avg_nounphrases_tree_folds_acc) * 100 / len(avg_nounphrases_tree_folds_acc)))

Italian Language

In [None]:
# Point the notebook-level `df` at the Italian data; the cells below pass it to train_loop.
df = df_it

In [None]:
# Run train_loop on the mean_sentence_len + mean_tree_height pair.
# presumably generate_dataloader_1 is the loader variant for this feature pair — confirm against its definition
# NOTE: these result names are reused by the other language sections and are overwritten when those cells run.
mean_sen_tree_folds_acc, mean_sen_tree_histories =  train_loop(df, ["mean_sentence_len", "mean_tree_height"], generate_dataloader_1)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for mean sentence length and mean_tree_height features for Italian Language is: {:.3f}%".format(sum(mean_sen_tree_folds_acc) * 100 / len(mean_sen_tree_folds_acc)))

In [None]:
# Run train_loop on the mean_sentence_len + avg_nounphrases pair.
# presumably generate_dataloader_2 is the loader variant for this feature pair — confirm against its definition
mean_sen_nounphrases_folds_acc, mean_sen_nounphrases_histories = train_loop(df, ["mean_sentence_len", "avg_nounphrases"], generate_dataloader_2)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for mean sentence length and avg_nounphrases features for Italian language is: {:.3f}%".format(sum(mean_sen_nounphrases_folds_acc) * 100 / len(mean_sen_nounphrases_folds_acc)))

In [None]:
# Run train_loop on the avg_nounphrases + mean_tree_height pair.
# presumably generate_dataloader_3 is the loader variant for this feature pair — confirm against its definition
avg_nounphrases_tree_folds_acc, avg_nounphrases_tree_histories =  train_loop(df, ["avg_nounphrases", "mean_tree_height"], generate_dataloader_3)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for avg_nounphrases and mean tree height features for Italian Language is: {:.3f}%".format(sum(avg_nounphrases_tree_folds_acc) * 100 / len(avg_nounphrases_tree_folds_acc)))

Basque Language

In [None]:
# Point the notebook-level `df` at the Basque data; the cells below pass it to train_loop.
df = df_eu

In [None]:
# Run train_loop on the mean_sentence_len + mean_tree_height pair.
# presumably generate_dataloader_1 is the loader variant for this feature pair — confirm against its definition
# NOTE: these result names are reused by the other language sections and are overwritten when those cells run.
mean_sen_tree_folds_acc, mean_sen_tree_histories =  train_loop(df, ["mean_sentence_len", "mean_tree_height"], generate_dataloader_1)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for mean sentence length and mean_tree_height features for Basque Language is: {:.3f}%".format(sum(mean_sen_tree_folds_acc) * 100 / len(mean_sen_tree_folds_acc)))

In [None]:
# Run train_loop on the mean_sentence_len + avg_nounphrases pair.
# presumably generate_dataloader_2 is the loader variant for this feature pair — confirm against its definition
mean_sen_nounphrases_folds_acc, mean_sen_nounphrases_histories = train_loop(df, ["mean_sentence_len", "avg_nounphrases"], generate_dataloader_2)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for mean sentence length and avg_nounphrases features for Basque language is: {:.3f}%".format(sum(mean_sen_nounphrases_folds_acc) * 100 / len(mean_sen_nounphrases_folds_acc)))

In [None]:
# Run train_loop on the avg_nounphrases + mean_tree_height pair.
# presumably generate_dataloader_3 is the loader variant for this feature pair — confirm against its definition
avg_nounphrases_tree_folds_acc, avg_nounphrases_tree_histories =  train_loop(df, ["avg_nounphrases", "mean_tree_height"], generate_dataloader_3)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for avg_nounphrases and mean tree height features for Basque Language is: {:.3f}%".format(sum(avg_nounphrases_tree_folds_acc) * 100 / len(avg_nounphrases_tree_folds_acc)))

Catalan Language

In [None]:
# Point the notebook-level `df` at the Catalan data; the cells below pass it to train_loop.
df = df_ca

In [None]:
# Run train_loop on the mean_sentence_len + mean_tree_height pair.
# presumably generate_dataloader_1 is the loader variant for this feature pair — confirm against its definition
# NOTE: these result names were also used by the earlier language sections; running these cells overwrites them.
mean_sen_tree_folds_acc, mean_sen_tree_histories =  train_loop(df, ["mean_sentence_len", "mean_tree_height"], generate_dataloader_1)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for mean sentence length and mean_tree_height features for Catalan Language is: {:.3f}%".format(sum(mean_sen_tree_folds_acc) * 100 / len(mean_sen_tree_folds_acc)))

In [None]:
# Run train_loop on the mean_sentence_len + avg_nounphrases pair.
# presumably generate_dataloader_2 is the loader variant for this feature pair — confirm against its definition
mean_sen_nounphrases_folds_acc, mean_sen_nounphrases_histories = train_loop(df, ["mean_sentence_len", "avg_nounphrases"], generate_dataloader_2)

In [None]:
# Report the mean of the per-fold accuracies returned above, as a percentage.
print("The accuracy score for mean sentence length and avg_nounphrases features for Catalan language is: {:.3f}%".format(sum(mean_sen_nounphrases_folds_acc) * 100 / len(mean_sen_nounphrases_folds_acc)))

In [None]:
# Run train_loop on the avg_nounphrases + mean_tree_height pair.
# presumably generate_dataloader_3 is the loader variant for this feature pair — confirm against its definition
avg_nounphrases_tree_folds_acc, avg_nounphrases_tree_histories =  train_loop(df, ["avg_nounphrases", "mean_tree_height"], generate_dataloader_3)

In [None]:
# Report the mean of the per-fold accuracies for the avg_nounphrases + mean_tree_height run, as a percentage.
print("The accuracy score for avg_nounphrases and mean tree height features for Catalan Language is: {:.3f}%".format(sum(avg_nounphrases_tree_folds_acc) * 100 / len(avg_nounphrases_tree_folds_acc)))