In [1]:
!pip install transformers
!pip install datasets



In [2]:
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer,BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from collections import defaultdict

from sklearn.metrics import confusion_matrix, classification_report,precision_score,accuracy_score,f1_score
from sklearn.model_selection import StratifiedKFold

import pandas as pd
import spacy
import os
import sys

import logging
logging.basicConfig(level = logging.ERROR)


from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cuda


In [5]:
def load_vikiwiki_dataset(path: str, label: str|int, difficulty: str, lang: str = "en") -> pd.DataFrame:
    cols = ["content", "labels", "difficulty"]
    result = []
    load_path = path
    if os.path.isdir(path):
        dir_file_list = os.listdir(path)
        dir_file_list.sort()
        if lang in dir_file_list:
            load_path = os.path.join(load_path, lang)
            dir_file_list = os.listdir(load_path)
            dir_file_list.sort()
        for file in dir_file_list:
            with open(os.path.join(load_path, file), "r") as f:
                data = "".join(f.readlines())
            if len(data) > 0:
                result.append([data, label, difficulty])
    return pd.DataFrame(
        data=result,
        columns=cols
    )

In [6]:
vikidia_path = "../input/vikiwiki/data/vikidia"
wikipedia_path = "../input/vikiwiki/data/wikipedia"
df_ca = pd.concat(
    [
        load_vikiwiki_dataset(vikidia_path, 0, "beginner", "ca"),
        load_vikiwiki_dataset(wikipedia_path, 1, "advanced", "ca")
    ],
    axis = 0,
    ignore_index = True
)

df_en = pd.concat(
    [
        load_vikiwiki_dataset(vikidia_path, 0, "beginner", "en"),
        load_vikiwiki_dataset(wikipedia_path, 1, "advanced", "en")
    ],
    axis = 0,
    ignore_index = True
)

df_es = pd.concat(
    [
        load_vikiwiki_dataset(vikidia_path, 0, "beginner", "es"),
        load_vikiwiki_dataset(wikipedia_path, 1, "advanced", "es")
    ],
    axis = 0,
    ignore_index = True
)

df_eu = pd.concat(
    [
        load_vikiwiki_dataset(vikidia_path, 0, "beginner", "eu"),
        load_vikiwiki_dataset(wikipedia_path, 1, "advanced", "eu")
    ],
    axis = 0,
    ignore_index = True
)

df_fr = pd.concat(
    [
        load_vikiwiki_dataset(vikidia_path, 0, "beginner", "fr"),
        load_vikiwiki_dataset(wikipedia_path, 1, "advanced", "fr")
    ],
    axis = 0,
    ignore_index = True
)

df_it = pd.concat(
    [
        load_vikiwiki_dataset(vikidia_path, 0, "beginner", "it"),
        load_vikiwiki_dataset(wikipedia_path, 1, "advanced", "it")
    ],
    axis = 0,
    ignore_index = True
)


In [7]:
df_en

Unnamed: 0,content,labels,difficulty
0,"The United States dollar (sign: $), also calle...",0,beginner
1,Édouard Manet (23 January 1832 – 30 April 1883...,0,beginner
2,Pi or π may refer to:\n,0,beginner
3,The 1800s were a time full of advancements in ...,0,beginner
4,2013 was a common year. It was followed by 201...,0,beginner
...,...,...,...
859,"World War II (WWII or WW2), also known as the ...",1,advanced
860,The Xbox 360 is a home video game console deve...,1,advanced
861,The Yangtze River (English pronunciation: /ˈjæ...,1,advanced
862,"Yugoslavia (Serbo-Croatian, Macedonian, Sloven...",1,advanced


In [8]:
df.labels.value_counts()


labels
0    429
1    391
Name: count, dtype: int64

Data Preprocessing

In [9]:
from transformers import BertTokenizer,BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup


In [10]:
class DiffDataset(Dataset):
    def __init__(self, contents, labels, tokenizer, max_len):
        self.contents = contents
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.contents)

    def __getitem__(self, item):
        content = str(self.contents[item])
        labels = self.labels[item]

        # Tokenizing the texts, while also including special tokens
        # for start and end of the text, as well as padding
        encoding = self.tokenizer.encode_plus(
          content,
          add_special_tokens=True,
          max_length=self.max_len,
          return_token_type_ids=False,
          pad_to_max_length=True,
          return_attention_mask=True,
          return_tensors='pt', # We return here the data as Pytorch Tensor
        )

        return {
          'content': content,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'labels': torch.tensor(labels, dtype=torch.long)
        }


In [11]:
def generate_dataloader(df, tokenizer, max_len, batch_size):
    ds = DiffDataset(
        contents=df.content.to_numpy(),
        labels=df.labels.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
      )
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=2
      )


In [12]:
#model_name = 'bert-base-multilingual-uncased'
model_name = 'bert-base-uncased'

In [13]:
tokenizer = BertTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [14]:
max_len = 256
batch_size = 8

# train_dataloader = generate_dataloader(df_train, tokenizer, max_len,batch_size)
# val_dataloader = generate_dataloader(df_val, tokenizer, max_len,batch_size)
# test_dataloader = generate_dataloader(df_test, tokenizer, max_len,batch_size)

# data = next(iter(train_dataloader))



In [15]:
class DiffClassifier(nn.Module):

    def __init__(self, n_classes):
        super(DiffClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        # Adding drop out, keeping 90% of the last neurons of the raw BERT model
        #self.drop = nn.Dropout(p=0.1)
        # The last linear layer for multiclass classification
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    # Forward propagation function
    def forward(self, input_ids, attention_mask):
        model_outs = self.bert(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        last_hidden_state = model_outs[0]
        pooled_output = model_outs[1]
        return self.out(pooled_output)


In [16]:
len(diff)

2

In [17]:
model = DiffClassifier(len(diff))

# Running the classifier on GPU
model = model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [18]:
# Computing multiclass precision for the outputs of the model
def compute_precision(outputs, labels):
    op = outputs.cpu()
    la = labels.cpu()
    _, preds = torch.max(op, dim=1)
    # We choose 'weighted' averaging of the precision of each label because it takes into account the imbalance of labels in our tweets dataset
    # other viable averaging methods are 'micro'
    return torch.tensor(precision_score(la, preds, average='weighted',zero_division=0))

In [19]:
def train(model, train_dataloader,optimizer,scheduler, loss_fn, df_train):

        # Reset tracking variables at the beginning of each epoch
        precision, correct_predictions, batch_counts = 0, 0, 0
        losses = []

        # Put the model into the training mode
        model.train()


        # For each batch of training data...
        for d in train_dataloader:
              batch_counts +=1
              # Load batch to GPU
              input_ids = d["input_ids"].to(device)
              attention_mask = d["attention_mask"].to(device)
              labels = d["labels"].to(device)

              outputs = model(input_ids=input_ids,attention_mask=attention_mask)

              _, preds = torch.max(outputs, dim=1)

              # Compute loss and accumulate the loss values

              loss = loss_fn(outputs, labels)

              correct_predictions += torch.sum(preds == labels)
              losses.append(loss.item())
              precision +=  compute_precision(outputs, labels)

              loss.backward()

              # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
              torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
              # Update parameters and the learning rate
              optimizer.step()
              scheduler.step()
              optimizer.zero_grad()
          # Accuracy, loss, precision
        return correct_predictions.double() / len(df_train), np.mean(losses), precision/batch_counts

In [20]:
def eval(model, valid_dataloader, loss_fn, device, n):
    model = model.eval()


    correct_predictions , precision ,batch_counts = 0,0,0
    losses = []

    with torch.no_grad():
        for d in valid_dataloader:
            batch_counts += 1

            # Preparing inputs
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            # Running inference using the model
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
              )

            # Running softmax on the outputs
            _, preds = torch.max(outputs, dim=1)

            # Computing loss function
            loss = loss_fn(outputs, labels)

            # Counting the correct occurences
            correct_predictions += torch.sum(preds == labels)

            # Computing the precision (true positives/true positives + false positives)
            # for each class and label, and find their average weighted by support
            precision += compute_precision(outputs,labels)

            losses.append(loss.item())
    # Accuracy, loss, precision
    return correct_predictions.double()/n, np.mean(losses), precision/batch_counts

In [21]:
def testing(model, dataloader):
    model = model.eval()

    contents = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in dataloader:

            texts = d["content"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            probs = F.softmax(outputs, dim=1)

            contents.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(probs)
            real_values.extend(labels)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
    return contents, predictions, prediction_probs, real_values

In [22]:
best_model_path = "./bert_model_baseline.bin"
def save_best_model(model):
    torch.save(model.state_dict(), best_model_path)

def load_best_model(model):
    model.load_state_dict(torch.load(best_model_path))
    return model

In [23]:
class EarlyStopping:
    def __init__(self, model, patience=1):
        self.patience = patience  # number of times to allow for no improvement before stopping the execution
        self.counter = 0  # count the number of times the validation accuracy not improving
        self.min_validation_loss = np.inf
        self.model = model
        self.max_acc = 0

    # return True when validation loss is not decreased by the `min_delta` for `patience` times
    def early_stop_check(self, val_acc):
        if (val_acc > self.max_acc):
            self.max_acc = val_acc
            self.counter = 0  # reset the counter if validation loss decreased at least by min_delta
            save_best_model(model)
        else:
            self.counter += 1 # increase the counter if validation loss is not decreased by the min_delta
            if self.counter >= self.patience:
                return True
        return False

In [24]:
def train_loop(df: pd.DataFrame):

    b_accuracy = 0
    epochs = 10
    n_fold = 10
    random_state = 42
    folds_acc = []
    histories = []


    k_fold = StratifiedKFold(n_splits=n_fold, random_state=random_state, shuffle=True)
    y = df.drop(["content", "difficulty"], axis=1)

    for i, (temp_index, test_index) in enumerate(k_fold.split(df, y)):
        print(" ------   Fold {}  ------- ".format(i+1), end="\n")
        df_temp = df.iloc[temp_index]
        df_train, df_val = train_test_split(df_temp, test_size=0.2, random_state=42)
        df_test = df.iloc[test_index]
        train_dataloader = generate_dataloader(df_train, tokenizer, max_len,batch_size)
        val_dataloader = generate_dataloader(df_val, tokenizer, max_len,batch_size)
        test_dataloader = generate_dataloader(df_test, tokenizer, max_len,batch_size)


        data = next(iter(train_dataloader))
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)

        model = DiffClassifier(len(diff))
        # Running the classifier on GPU
        model = model.to(device)

        optimizer = AdamW(model.parameters(),
                    lr=2e-5,
                    weight_decay = 0.2,
                    correct_bias=False)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                  num_warmup_steps=0.1,
                                                  num_training_steps=len(train_dataloader)*epochs)
        loss_fn = nn.CrossEntropyLoss().to(device)

        history = defaultdict(list)

        early_stopping = EarlyStopping(model, patience=4)

        for epoch in range(epochs):

            print(f'Epoch {epoch + 1}/{epochs}')
            print('-' * 20)

            train_acc, train_loss, train_preci = train(
                  model,
                  train_dataloader,
                  optimizer,
                  scheduler,
                  loss_fn,
                  df_train
                )
            print(f"Train : Loss {train_loss}, Accuracy : {train_acc*100:.2f} %, Precision : {train_preci}")

            history['train_acc'].append(train_acc)
            history['train_loss'].append(train_loss)
            history['train_precision'].append(train_preci)

            val_acc, val_loss, val_preci = eval(
                  model,
                  val_dataloader,
                  loss_fn,
                  device,
                  len(df_val),
                )

            print(f'Val : Loss :{val_loss}, Accuracy : {val_acc*100:.2f} %, Precision : {val_preci}')
            print()

            history['val_acc'].append(val_acc)
            history['val_loss'].append(val_loss)
            history['val_precision'].append(val_preci)

            histories.append(history)

            if early_stopping.early_stop_check(val_acc):
                break

        # if val_acc > b_accuracy:
        #   torch.save(model.state_dict(), best_model_path)
        #   b_accuracy = val_acc

        model = load_best_model(model)
        y_contents, y_pred, y_pred_probs, y_test = testing(
            model,
            test_dataloader
          )
        acc = accuracy_score(y_test, y_pred)
        print("Test Accuracy for fold {}: {:.2f}%\n".format(i+1, acc*100))
        folds_acc.append(acc)
    return folds_acc, histories


Basque Language

In [41]:
df = df_eu

In [None]:
folds_acc, folds_histories = train_loop(df)

In [49]:
print("The accuracy score for Basque language is: {:.3f}%".format(sum(folds_acc) * 100 / len(folds_acc)))

The accuracy score for Basque language is: 48.049%


Catalan Language

In [25]:
df = df_ca

In [None]:
folds_acc, folds_histories = train_loop(df)

In [27]:
print("The accuracy score for Catalan language is: {:.3f}%".format(sum(folds_acc) * 100 / len(folds_acc)))

The accuracy score for Catalan language is: 48.361%


English Language

In [28]:
df = df_en

In [None]:
folds_acc, folds_histories = train_loop(df)

In [30]:
print("The accuracy score for English language is: {:.3f}%".format(sum(folds_acc) * 100 / len(folds_acc)))

The accuracy score for English language is: 51.733%


French Language

In [31]:
df = df_fr

In [None]:
folds_acc, folds_histories = train_loop(df)

In [33]:
print("The accuracy score for French language is: {:.3f}%".format(sum(folds_acc) * 100 / len(folds_acc)))

The accuracy score for French language is: 43.384%


Italian Language

In [34]:
df = df_it

In [None]:
folds_acc, folds_histories = train_loop(df)

In [36]:
print("The accuracy score for Italian language is: {:.3f}%".format(sum(folds_acc) * 100 / len(folds_acc)))

The accuracy score for Italian language is: 48.216%


Spanish Language

In [37]:
df = df_es

In [None]:
folds_acc, folds_histories = train_loop(df)

In [39]:
print("The accuracy score for Italian language is: {:.3f}%".format(sum(folds_acc) * 100 / len(folds_acc)))

The accuracy score for Italian language is: 48.084%
