In [1]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer,BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from collections import defaultdict

from sklearn.metrics import confusion_matrix, classification_report,precision_score,accuracy_score,f1_score

import logging 
logging.basicConfig(level = logging.ERROR)



In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
from datasets import load_dataset

dataset = load_dataset("onestop_english")




  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cuda


In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 567
    })
})

In [7]:
import pandas as pd

df = pd.DataFrame(dataset['train'])
df.columns = ['content', 'labels']


In [8]:
df

Unnamed: 0,content,labels
0,"When you see the word Amazon, what’s the first...",0
1,"To tourists, Amsterdam still seems very libera...",0
2,"Anitta, a music star from Brazil, has millions...",0
3,Google has made maps of the world’s highest mo...,0
4,The auction of a Banksy painting that disappea...,0
...,...,...
562,"In typical bad-boyfriend style, Dan Sullivan w...",2
563,Thousands of people protested on Australia’s b...,2
564,1 Race engineer \nA race engineer liaises betw...,2
565,More than one million British workers might be...,2


In [9]:
df.labels.value_counts()


0    189
1    189
2    189
Name: labels, dtype: int64

In [10]:
num_diff = df.labels.unique()
diff = ['beginner','intermediate','advanced']

label = dict(zip(num_diff, diff))

diff = list(diff)


In [11]:
df['Difficulty'] = df.labels.replace(label)
df.head()



Unnamed: 0,content,labels,Difficulty
0,"When you see the word Amazon, what’s the first...",0,beginner
1,"To tourists, Amsterdam still seems very libera...",0,beginner
2,"Anitta, a music star from Brazil, has millions...",0,beginner
3,Google has made maps of the world’s highest mo...,0,beginner
4,The auction of a Banksy painting that disappea...,0,beginner


Data Preprocessing

In [12]:
import re

In [13]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}

In [14]:

def clean_text(text):
    #text=' '.join([contraction_mapping[i] if i in contraction_mapping.keys() else i for i in text.split()])
    text=re.sub("'s","",text)
    if text.split()[0] == 'Intermediate':
      text = ' '.join(text.split()[1:])
    text=' '.join([i for i in text.split() if i.isalpha()])
    text=re.sub('[^a-zA-Z]'," ",text)

    
    return text

In [15]:
df['content'] = df['content'].apply(clean_text)


In [16]:
df[df['Difficulty']=='intermediate']


Unnamed: 0,content,labels,Difficulty
189,When you see the word whats the first thing yo...,1,intermediate
190,To Amsterdam still seems very Recently the cit...,1,intermediate
191,Brazils latest funk has won millions of fans b...,1,intermediate
192,It has mapped the worlds highest the ocean the...,1,intermediate
193,The controversial auction of a Banksy mural th...,1,intermediate
...,...,...,...
373,In typical Dan Sullivan was late to breakfast ...,1,intermediate
374,Thousands of people protested on Australias be...,1,intermediate
375,Race engineer A race engineer liaises between ...,1,intermediate
376,More than one million British workers might be...,1,intermediate


In [17]:
df

Unnamed: 0,content,labels,Difficulty
0,When you see the word the first thing you thin...,0,beginner
1,To Amsterdam still seems very Recently the May...,0,beginner
2,a music star from has millions of but she is a...,0,beginner
3,Google has made maps of the highest the ocean ...,0,beginner
4,The auction of a Banksy painting that disappea...,0,beginner
...,...,...,...
562,In typical Dan Sullivan was late to breakfast ...,2,advanced
563,Thousands of people protested on beaches again...,2,advanced
564,Race engineer A race engineer liaises between ...,2,advanced
565,More than one million British workers might be...,2,advanced


sentence lenth

In [18]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
sen_len = []
for i in df['content']:
  curr_sen_len = []
  for j in sent_tokenize(i):
    curr_sen_len.append(len(word_tokenize(j)))
  mean_len = np.mean(curr_sen_len)
  sen_len.append(mean_len)

In [20]:
sen_len[400:410]

[864.0, 661.0, 686.0, 830.0, 671.0, 665.0, 787.0, 842.0, 565.0, 733.0]

In [21]:
df['mean_sentence_len'] = sen_len

In [22]:
df

Unnamed: 0,content,labels,Difficulty,mean_sentence_len
0,When you see the word the first thing you thin...,0,beginner,351.0
1,To Amsterdam still seems very Recently the May...,0,beginner,380.0
2,a music star from has millions of but she is a...,0,beginner,387.0
3,Google has made maps of the highest the ocean ...,0,beginner,432.0
4,The auction of a Banksy painting that disappea...,0,beginner,371.0
...,...,...,...,...
562,In typical Dan Sullivan was late to breakfast ...,2,advanced,445.0
563,Thousands of people protested on beaches again...,2,advanced,516.0
564,Race engineer A race engineer liaises between ...,2,advanced,704.0
565,More than one million British workers might be...,2,advanced,684.0


In [23]:
from transformers import BertTokenizer,BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup


In [24]:
class DiffDataset(Dataset):
    def __init__(self, contents, labels, features, tokenizer, max_len):
        self.contents = contents
        self.labels = labels
        self.tokenizer = tokenizer
        self.features = features
        self.max_len = max_len
  
    def __len__(self):
        return len(self.contents)
  
    def __getitem__(self, item):
        content = str(self.contents[item])
        labels = self.labels[item]
        features = self.features[item]
        # Tokenizing the texts, while also including special tokens 
        # for start and end of the text, as well as padding
        encoding = self.tokenizer.encode_plus(
          content,
          add_special_tokens=True,
          max_length=self.max_len,
          return_token_type_ids=False,
          pad_to_max_length=True,
          return_attention_mask=True,
          return_tensors='pt', # We return here the data as Pytorch Tensor
        )

        return {
          'content': content,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'labels': torch.tensor(labels, dtype=torch.long), 
          'features': torch.tensor([features], dtype=torch.long)
        }


In [25]:
torch.tensor([[12, 23, 45]], dtype=torch.long).shape

torch.Size([1, 3])

In [26]:
df_temp, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_temp, test_size=0.1, random_state=42)

print("size of training set : " + str(df_train.shape[0]))
print("size of validation set : " + str(df_val.shape[0]))
print("size of test set : " + str(df_test.shape[0]))



size of training set : 407
size of validation set : 46
size of test set : 114


In [27]:
def generate_dataloader(df, tokenizer, max_len, batch_size):
    ds = DiffDataset(
        contents=df.content.to_numpy(),
        labels=df.labels.to_numpy(),
        features = df.mean_sentence_len.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
      )
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=2
      )


In [28]:
#model_name = 'bert-base-multilingual-uncased'
model_name = 'bert-base-uncased'

In [29]:
tokenizer = BertTokenizer.from_pretrained(model_name)

In [30]:
max_len = 256 
batch_size = 8

train_dataloader = generate_dataloader(df_train, tokenizer, max_len,batch_size)
val_dataloader = generate_dataloader(df_val, tokenizer, max_len,batch_size)
test_dataloader = generate_dataloader(df_test, tokenizer, max_len,batch_size)

data = next(iter(train_dataloader))



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [31]:
class DiffClassifier(nn.Module):

    def __init__(self, n_classes):
        super(DiffClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        # Adding drop out, keeping 90% of the last neurons of the raw BERT model
        #self.drop = nn.Dropout(p=0.1)
        # The last linear layer for multiclass classification
        self.out = nn.Linear(self.bert.config.hidden_size+1, n_classes)
  
    # Forward propagation function
    def forward(self, input_ids, attention_mask, features):
        model_outs = self.bert(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        last_hidden_state = model_outs[0]
        pooled_output = model_outs[1]

        x = torch.cat((pooled_output, features) , dim=1)

        return self.out(x)


In [32]:
len(diff)

3

In [56]:
model = DiffClassifier(len(diff))

# Running the classifier on GPU
model = model.to(device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
features = data['features'].to(device)
print(input_ids.shape)
print(attention_mask.shape)
print(features.shape)

torch.Size([8, 256])
torch.Size([8, 256])
torch.Size([8, 1])


In [35]:
features

tensor([[634],
        [555],
        [695],
        [675],
        [704],
        [834],
        [558],
        [348]], device='cuda:0')

In [59]:
optimizer = AdamW(model.parameters(),
                  lr=2e-5, 
                  weight_decay = 0.2,
                  correct_bias=False)
epochs = 10
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0.1,
                                            num_training_steps=len(train_dataloader)*epochs)
loss_fn = nn.CrossEntropyLoss().to(device)


In [37]:
# Computing multiclass precision for the outputs of the model
def compute_precision(outputs, labels):
  op = outputs.cpu()
  la = labels.cpu()
  _, preds = torch.max(op, dim=1)
  # We choose 'weighted' averaging of the precision of each label because it takes into account the imbalance of labels in our tweets dataset
  # other viable averaging methods are 'micro'
  return torch.tensor(precision_score(la, preds, average='weighted',zero_division=0))

In [38]:
def train(model, train_dataloader,optimizer,scheduler):

        # Reset tracking variables at the beginning of each epoch
        precision, correct_predictions, batch_counts = 0, 0, 0
        losses = []

        # Put the model into the training mode
        model.train()

  
        # For each batch of training data...
        for d in train_dataloader:
              batch_counts +=1
              # Load batch to GPU
              input_ids = d["input_ids"].to(device)
              attention_mask = d["attention_mask"].to(device)
              labels = d["labels"].to(device)
              features = d["features"].to(device)

              outputs = model(input_ids=input_ids,attention_mask=attention_mask, features=features)
              
              _, preds = torch.max(outputs, dim=1)
              
              # Compute loss and accumulate the loss values

              loss = loss_fn(outputs, labels)

              correct_predictions += torch.sum(preds == labels)
              losses.append(loss.item())
              precision +=  compute_precision(outputs, labels)
            
              loss.backward()

              # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
              torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
              # Update parameters and the learning rate  
              optimizer.step()
              scheduler.step()
              optimizer.zero_grad()
          # Accuracy, loss, precision
        return correct_predictions.double() / len(df_train), np.mean(losses), precision/batch_counts

In [39]:
def eval(model, valid_dataloader, loss_fn, device, n):
    model = model.eval()


    correct_predictions , precision ,batch_counts = 0,0,0
    losses = []

    with torch.no_grad():
        for d in valid_dataloader:
            batch_counts += 1

            # Preparing inputs
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)
            features = d["features"].to(device)

            # Running inference using the model
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                features=features
              )
            
            # Running softmax on the outputs
            _, preds = torch.max(outputs, dim=1)

            # Computing loss function
            loss = loss_fn(outputs, labels)

            # Counting the correct occurences
            correct_predictions += torch.sum(preds == labels)

            # Computing the precision (true positives/true positives + false positives) 
            # for each class and label, and find their average weighted by support 
            precision += compute_precision(outputs,labels)

            losses.append(loss.item())
    # Accuracy, loss, precision 
    return correct_predictions.double()/n, np.mean(losses), precision/batch_counts 

In [60]:
history = defaultdict(list)
b_accuracy = 0

for epoch in range(epochs):
  
  print(f'Epoch {epoch + 1}/{epochs}')
  print('-' * 20)

  train_acc, train_loss, train_preci = train(
        model,
        train_dataloader,    
        optimizer, 
        scheduler,
      )
  print(f"Train : Loss {train_loss}, Accuracy : {train_acc*100:.2f} %, Precision : {train_preci}")

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['train_precision'].append(train_preci)

  val_acc, val_loss, val_preci = eval(
        model,
        val_dataloader,
        loss_fn, 
        device,
        len(df_val),
      )

  print(f'Val : Loss :{val_loss}, Accuracy : {val_acc*100:.2f} %, Precision : {val_preci}')  
  print()

  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)
  history['val_precision'].append(val_preci)
  
  if val_acc > b_accuracy:
    torch.save(model.state_dict(), 'bert_model_baseline.bin')
    b_accuracy = val_acc

Epoch 1/10
--------------------


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Train : Loss 6.791305107228896, Accuracy : 36.86 %, Precision : 0.22496498599439774


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Val : Loss :1.8687758445739746, Accuracy : 21.74 %, Precision : 0.16643518518518519

Epoch 2/10
--------------------


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Train : Loss 1.9958966818510318, Accuracy : 42.01 %, Precision : 0.35763597105508876


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Val : Loss :1.5672621528307598, Accuracy : 52.17 %, Precision : 0.5304232804232805

Epoch 3/10
--------------------


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Train : Loss 1.244647739856851, Accuracy : 56.27 %, Precision : 0.6169351073762839


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Val : Loss :0.5302216311295828, Accuracy : 86.96 %, Precision : 0.8625000000000002

Epoch 4/10
--------------------


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Train : Loss 0.6539925632304421, Accuracy : 81.33 %, Precision : 0.8487336601307186


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Val : Loss :0.5726835379997889, Accuracy : 91.30 %, Precision : 0.9027777777777778

Epoch 5/10
--------------------


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


KeyboardInterrupt: ignored

In [61]:
history['train_acc']

[tensor(0.3686, device='cuda:0', dtype=torch.float64),
 tensor(0.4201, device='cuda:0', dtype=torch.float64),
 tensor(0.5627, device='cuda:0', dtype=torch.float64),
 tensor(0.8133, device='cuda:0', dtype=torch.float64)]

In [62]:
history['val_acc']

[tensor(0.2174, device='cuda:0', dtype=torch.float64),
 tensor(0.5217, device='cuda:0', dtype=torch.float64),
 tensor(0.8696, device='cuda:0', dtype=torch.float64),
 tensor(0.9130, device='cuda:0', dtype=torch.float64)]

In [43]:
def testing(model, dataloader):
  model = model.eval()
  
  contents = []
  predictions = []
  prediction_probs = []
  real_values = []

  with torch.no_grad():
    for d in dataloader:

      texts = d["content"]
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      labels = d["labels"].to(device)
      features = d["features"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask, 
        features=features
      )
      _, preds = torch.max(outputs, dim=1)

      probs = F.softmax(outputs, dim=1)

      contents.extend(texts)
      predictions.extend(preds)
      prediction_probs.extend(probs)
      real_values.extend(labels)

  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  real_values = torch.stack(real_values).cpu()
  return contents, predictions, prediction_probs, real_values

In [63]:
y_contents, y_pred, y_pred_probs, y_test = testing(
  model,
  test_dataloader
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [64]:
print(classification_report(y_test, y_pred, target_names=diff))

              precision    recall  f1-score   support

    beginner       0.93      0.91      0.92        43
intermediate       0.89      0.84      0.86        38
    advanced       0.78      0.85      0.81        33

    accuracy                           0.87       114
   macro avg       0.87      0.87      0.86       114
weighted avg       0.87      0.87      0.87       114



In [54]:
print(classification_report(y_test, y_pred, target_names=diff))

              precision    recall  f1-score   support

    beginner       0.84      0.60      0.70        43
intermediate       0.49      0.89      0.63        38
    advanced       0.54      0.21      0.30        33

    accuracy                           0.59       114
   macro avg       0.62      0.57      0.55       114
weighted avg       0.63      0.59      0.56       114



In [65]:
y_pred_probs

tensor([[2.6121e-03, 4.9177e-03, 9.9247e-01],
        [6.0405e-03, 3.5213e-04, 9.9361e-01],
        [2.3634e-08, 1.0000e+00, 3.5735e-07],
        [4.1365e-03, 1.5921e-02, 9.7994e-01],
        [4.9305e-03, 1.0249e-03, 9.9404e-01],
        [5.7801e-10, 1.0000e+00, 1.7950e-08],
        [1.5552e-08, 1.0000e+00, 2.6611e-07],
        [1.0964e-03, 6.3690e-02, 9.3521e-01],
        [1.4420e-03, 3.2691e-02, 9.6587e-01],
        [1.7847e-09, 1.0000e+00, 3.0358e-08],
        [9.7846e-01, 1.6841e-03, 1.9852e-02],
        [8.1820e-01, 3.7795e-02, 1.4401e-01],
        [4.1375e-07, 1.0000e+00, 4.3430e-06],
        [1.7370e-08, 1.0000e+00, 2.3882e-07],
        [6.9046e-03, 2.0564e-01, 7.8746e-01],
        [9.7813e-01, 9.3726e-04, 2.0937e-02],
        [9.0648e-06, 9.9995e-01, 3.7332e-05],
        [5.6625e-02, 1.0848e-02, 9.3253e-01],
        [2.1496e-03, 6.3603e-01, 3.6182e-01],
        [9.5537e-01, 1.8412e-03, 4.2788e-02],
        [9.5649e-01, 1.8540e-02, 2.4967e-02],
        [9.6156e-01, 9.1170e-03, 2

In [66]:
out_df = pd.DataFrame()
out_df['content'] = y_contents
out_df['true_label'] = y_test
out_df['pred_label'] = y_pred

In [67]:
out_df

Unnamed: 0,content,true_label,pred_label
0,What is it like to look at the very last of To...,2,2
1,SeaWorld has suffered an collapse in profits a...,2,2
2,On the market square in Rjukan stands a statue...,1,1
3,The world shares him and London claims him but...,2,2
4,The regulation eight hours in the office is Th...,2,2
...,...,...,...
109,Some cities have Lima has black They fly in gr...,0,0
110,David Cameron has declared a in the Scottish i...,2,2
111,Clay Cockrell is sitting in his of ce across t...,1,1
112,Police and intelligence agencies around the wo...,0,0


In [68]:
out_df.to_csv('out_BERT_256_simple features.csv')