In [None]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 21.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 52.6 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 56.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K 

In [None]:
# Importing standard libraries for every machine/deep learning pipeline
import pandas as pd
import torch
from tqdm import tqdm, trange
import numpy as np


# Importing specific libraries for data prerpcessing, model archtecture choice, training and evaluation
from sklearn.model_selection import train_test_split
from keras_preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from transformers import AdamW

from sklearn.preprocessing import LabelEncoder

In [None]:
# Defining constants
epochs = 100
MAX_LEN = 128

batch_size = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
df = pd.read_csv("training_data.csv")
df.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


In [None]:
# Initialize CamemBERT tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert/camembert-large', do_lower_case=True)

Downloading:   0%|          | 0.00/809k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456 [00:00<?, ?B/s]

In [None]:
# Creates list of texts and labels
text = df['sentence'].to_list()

labels = df['difficulty'].tolist()
le = LabelEncoder()
le.fit(["A1", "A2", "B1", "B2", "C1", "C2"])
labels = le.transform(labels)
labels = labels.tolist()


#user tokenizer to convert sentences into tokenizer
input_ids  = [tokenizer.encode(sent,add_special_tokens=True,max_length=MAX_LEN, truncation=True) for sent in text]

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]  
    attention_masks.append(seq_mask)

In [None]:
# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_test_split(input_ids, labels, attention_masks,
                                                            random_state=42, test_size=0.1)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
model = CamembertForSequenceClassification.from_pretrained("camembert/camembert-large", num_labels=6)
model.to(device)

Downloading:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Some weights of the model checkpoint at camembert/camembert-large were not used when initializing CamembertForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert/camembert-large and are newly initialized: ['cl

CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0): CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias

In [None]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=10e-8)

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)



In [None]:
# Store our loss and accuracy for plotting if we want to visualize training evolution per epochs after the training process
train_loss_set = []

# trange is a tqdm wrapper around the normal python range
epochnum = 0
for _ in trange(epochs, desc="Epoch"):  
    epochnum += 1
    # Tracking variables for training
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
  
    # Train the model
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Add batch to device CPU or GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # Get loss value
        loss = outputs[0]
        # Add it to train loss list
        train_loss_set.append(loss.item())    
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
    
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    


    # Tracking variables for validation
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Validation of the model
    model.eval()
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to device CPU or GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs =  model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss, logits = outputs[:2]
    
        # Move logits and labels to CPU if GPU is used
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    torch.save(model.state_dict(), 'checkpoint' + str(epochnum) + '.pth')

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Train loss: 1.2674722684754265
Validation Accuracy: 0.5520833333333334


Epoch:   1%|          | 1/100 [05:20<8:48:25, 320.26s/it]

Train loss: 0.9352231672516576
Validation Accuracy: 0.6020833333333333


Epoch:   2%|▏         | 2/100 [10:40<8:43:10, 320.32s/it]

Train loss: 0.6653169386916691
Validation Accuracy: 0.59375


Epoch:   3%|▎         | 3/100 [16:00<8:37:51, 320.32s/it]

Train loss: 0.44411641701504034
Validation Accuracy: 0.45


Epoch:   4%|▍         | 4/100 [21:21<8:32:42, 320.45s/it]

Train loss: 0.2766819230246323
Validation Accuracy: 0.6104166666666667


Epoch:   5%|▌         | 5/100 [26:41<8:27:16, 320.38s/it]

Train loss: 0.19474605040417778
Validation Accuracy: 0.5604166666666667


Epoch:   6%|▌         | 6/100 [32:02<8:21:54, 320.36s/it]

Train loss: 0.14424367009627598
Validation Accuracy: 0.5875


Epoch:   7%|▋         | 7/100 [37:22<8:16:37, 320.40s/it]

Train loss: 0.08954063360406844
Validation Accuracy: 0.6


Epoch:   8%|▊         | 8/100 [42:43<8:11:21, 320.46s/it]

Train loss: 0.06555163916200399
Validation Accuracy: 0.6125


Epoch:   9%|▉         | 9/100 [48:03<8:05:59, 320.43s/it]

Train loss: 0.13042561433415997
Validation Accuracy: 0.6083333333333333


Epoch:  10%|█         | 10/100 [53:24<8:00:45, 320.50s/it]

Train loss: 0.09228395626587034
Validation Accuracy: 0.63125


Epoch:  11%|█         | 11/100 [58:44<7:55:20, 320.45s/it]

Train loss: 0.08002600100201865
Validation Accuracy: 0.5416666666666666


Epoch:  12%|█▏        | 12/100 [1:04:05<7:49:59, 320.45s/it]

Train loss: 0.0776234399932609
Validation Accuracy: 0.6083333333333333


Epoch:  13%|█▎        | 13/100 [1:09:25<7:44:36, 320.42s/it]

Train loss: 0.06493195976061678
Validation Accuracy: 0.5854166666666667


Epoch:  14%|█▍        | 14/100 [1:14:46<7:39:30, 320.58s/it]

Train loss: 0.04121222818071989
Validation Accuracy: 0.6083333333333333


Epoch:  15%|█▌        | 15/100 [1:20:07<7:34:25, 320.76s/it]

Train loss: 0.03235797016510602
Validation Accuracy: 0.6125


Epoch:  16%|█▌        | 16/100 [1:25:28<7:29:11, 320.85s/it]

Train loss: 0.03693550386060788
Validation Accuracy: 0.60625


Epoch:  17%|█▋        | 17/100 [1:30:48<7:23:37, 320.69s/it]

Train loss: 0.06061439775016711
Validation Accuracy: 0.6


Epoch:  18%|█▊        | 18/100 [1:36:09<7:18:07, 320.57s/it]

Train loss: 0.04274237992983587
Validation Accuracy: 0.6208333333333333


Epoch:  19%|█▉        | 19/100 [1:41:29<7:12:39, 320.49s/it]

Train loss: 0.05545616840956629
Validation Accuracy: 0.5770833333333333


Epoch:  19%|█▉        | 19/100 [1:46:49<7:35:25, 337.35s/it]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/serialization.py", line 379, in save
    _save(obj, opened_zipfile, pickle_module, pickle_protocol)
  File "/usr/local/lib/python3.7/dist-packages/torch/serialization.py", line 604, in _save
    zip_file.write_record(name, storage.data_ptr(), num_bytes)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-60f901632edc>", line 69, in <module>
    torch.save(model.state_dict(), 'checkpoint' + str(epochnum) + '.pth')
  File "/usr/local/lib/python3.7/dist-packages/torch/serialization.py", line 380, in save
    return
  File "/usr/local/lib/python3.7/dist-packages/torch/serialization.py", line 259, in __exit__
    self.file_like.write_end_of_file()
Runti

TypeError: ignored

In [None]:
model.load_state_dict(torch.load("checkpoint11.pth"))

<All keys matched successfully>

In [None]:
dfValid = pd.read_csv("unlabelled_test_data.csv")
validText = dfValid['sentence'].to_list()

In [None]:
comments = validText

# Encode the comments
tokenized_comments_ids = [tokenizer.encode(comment,add_special_tokens=True,max_length=MAX_LEN, truncation=True) for comment in comments]
# Pad the resulted encoded comments
tokenized_comments_ids = pad_sequences(tokenized_comments_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks 
attention_masks = []
for seq in tokenized_comments_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

prediction_inputs = torch.tensor(tokenized_comments_ids)
prediction_masks = torch.tensor(attention_masks)

In [None]:
# Apply the finetuned model (Camembert)
flat_pred = []
with torch.no_grad():
    # Forward pass, calculate logit predictions
    outputs =  model(prediction_inputs.to(device),token_type_ids=None, attention_mask=prediction_masks.to(device))
    print(outputs)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy() 
    flat_pred.extend(np.argmax(logits, axis=1).flatten())

SequenceClassifierOutput(loss=None, logits=tensor([[-1.0497, -0.4545, -1.5173, -2.3823, -1.1842,  6.7913],
        [-1.1060,  3.1155,  5.7262, -3.0251, -3.1606, -2.2346],
        [-2.5430, -2.0759,  2.5528,  4.6356, -1.0054, -1.6017],
        ...,
        [-1.1695, -1.0624, -1.7832, -2.3436, -0.2081,  7.0608],
        [-2.3085, -1.6669,  0.4514,  6.6299, -1.9904, -1.0187],
        [-2.4933, -2.1331, -0.5798,  6.7125,  0.5588, -1.3704]],
       device='cuda:0'), hidden_states=None, attentions=None)


In [None]:
#def softmax(x):
#    return(np.exp(x - np.max(x)) / np.exp(x - np.max(x)).sum())
#print(np.asarray(logits))
from scipy.special import softmax
outSoft = softmax(np.asarray(logits), axis=1)
#print(sum(outSoft[0]))
print(outSoft)


[[3.92586022e-04 7.11910601e-04 2.45963252e-04 1.03558188e-04
  3.43161897e-04 9.98203039e-01]
 [1.00305397e-03 6.83419555e-02 9.30054903e-01 1.47187267e-04
  1.28539381e-04 3.24461318e-04]
 [6.73764793e-04 1.07493356e-03 1.10045485e-01 8.83343577e-01
  3.13537940e-03 1.72697683e-03]
 ...
 [2.66042771e-04 2.96132843e-04 1.44028003e-04 8.22394722e-05
  6.95810595e-04 9.98515785e-01]
 [1.30842658e-04 2.48529570e-04 2.06707232e-03 9.96898711e-01
  1.79842624e-04 4.75234643e-04]
 [1.00114492e-04 1.43536279e-04 6.78447192e-04 9.96651530e-01
  2.11848225e-03 3.07751819e-04]]


In [None]:
outLabel = [["A1", "A2", "B1", "B2", "C1", "C2"][cl] for cl in flat_pred]
outLabel

['C2',
 'B1',
 'B2',
 'A1',
 'C2',
 'C1',
 'A2',
 'A2',
 'B2',
 'A2',
 'A1',
 'A2',
 'B2',
 'C1',
 'A1',
 'A2',
 'B2',
 'A1',
 'A1',
 'A1',
 'C2',
 'B2',
 'C1',
 'C1',
 'A2',
 'C2',
 'A1',
 'A1',
 'C2',
 'B1',
 'A1',
 'A2',
 'A1',
 'A2',
 'A2',
 'A2',
 'C1',
 'B1',
 'A1',
 'A1',
 'B1',
 'B2',
 'C2',
 'C1',
 'B2',
 'C1',
 'B2',
 'C2',
 'A1',
 'A1',
 'C1',
 'A1',
 'B2',
 'A1',
 'A1',
 'C1',
 'C1',
 'B2',
 'C1',
 'B1',
 'B2',
 'A2',
 'C2',
 'C1',
 'C2',
 'B2',
 'C1',
 'A2',
 'B2',
 'B2',
 'A1',
 'B2',
 'C1',
 'B1',
 'A2',
 'A1',
 'B2',
 'C1',
 'A2',
 'C1',
 'A2',
 'A2',
 'B2',
 'A2',
 'A2',
 'B2',
 'B1',
 'C2',
 'C2',
 'B1',
 'C2',
 'A1',
 'C1',
 'B2',
 'B1',
 'A2',
 'A2',
 'A2',
 'A1',
 'C2',
 'C2',
 'C1',
 'A2',
 'A2',
 'C2',
 'C2',
 'B2',
 'A1',
 'A2',
 'C1',
 'C1',
 'A1',
 'C1',
 'C1',
 'C2',
 'B2',
 'C2',
 'A2',
 'A2',
 'C1',
 'A1',
 'A2',
 'B2',
 'C1',
 'A2',
 'C1',
 'C1',
 'C1',
 'C1',
 'C1',
 'A1',
 'A1',
 'C1',
 'C2',
 'C2',
 'C2',
 'A2',
 'C1',
 'A2',
 'A2',
 'A1',
 'C1',
 'C2',

In [None]:
from google.colab import files
ids = [i for i in range(len(outLabel))]
dfout = pd.DataFrame({'id': ids, 'difficulty': outLabel})
print(dfout.head())
dfout.to_csv('test3.csv', index=False)
files.download('test3.csv')

   id difficulty
0   0         C2
1   1         B1
2   2         B2
3   3         A1
4   4         C2


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
dfValid = pd.read_csv("unlabelled_test_data.csv")
validText = dfValid['sentence'].to_list()

comments = validText

# Encode the comments
tokenized_comments_ids = [tokenizer.encode(comment,add_special_tokens=True,max_length=MAX_LEN, truncation=True) for comment in comments]
# Pad the resulted encoded comments
tokenized_comments_ids = pad_sequences(tokenized_comments_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks 
attention_masks = []
for seq in tokenized_comments_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

prediction_inputs = torch.tensor(tokenized_comments_ids)
prediction_masks = torch.tensor(attention_masks)


# Apply the finetuned model (Camembert)
flat_pred = []
with torch.no_grad():
    # Forward pass, calculate logit predictions
    outputs =  model(prediction_inputs.to(device),token_type_ids=None, attention_mask=prediction_masks.to(device))
    #print(outputs)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy() 
    flat_pred.extend(np.argmax(logits, axis=1).flatten())


from scipy.special import softmax
outSoft = softmax(np.asarray(logits), axis=1)
#print(sum(outSoft[0]))
#print(outSoft)
#outSoft = [str(conf) for conf in outSoft]
out_A1 = [str(conf[0]) for conf in outSoft]
out_A2 = [str(conf[1]) for conf in outSoft]
out_B1 = [str(conf[2]) for conf in outSoft]
out_B2 = [str(conf[3]) for conf in outSoft]
out_C1 = [str(conf[4]) for conf in outSoft]
out_C2 = [str(conf[5]) for conf in outSoft]

outLabel = [["A1", "A2", "B1", "B2", "C1", "C2"][cl] for cl in flat_pred]


from google.colab import files
ids = [i for i in range(len(outLabel))]
#dfout = pd.DataFrame({'id': ids, 'difficulty': outLabel, 'confidence': outSoft})
dfout = pd.DataFrame({'id': ids, 'difficulty': outLabel, 
                      'A1': out_A1, 'A2': out_A2,
                      'B1': out_B1, 'B2': out_B2,
                      'C1': out_C1, 'C2': out_C2,})
print(dfout.head())
dfout.to_csv('camembert_test0.csv', index=False)
files.download('camembert_test0.csv')


   id difficulty             A1             A2             B1             B2  \
0   0         C2  0.00039258602   0.0007119106  0.00024596325  0.00010355819   
1   1         B1    0.001003054    0.068341956      0.9300549  0.00014718727   
2   2         B2   0.0006737648   0.0010749336    0.110045485      0.8833436   
3   3         A1      0.9945613   0.0015855824   0.0031399697  0.00028688795   
4   4         C2  0.00019194018  0.00021344156  0.00013326028   0.0001030289   

              C1             C2  
0   0.0003431619     0.99820304  
1  0.00012853938  0.00032446132  
2   0.0031353794   0.0017269768  
3  0.00013045268  0.00029576645  
4    0.001194173      0.9981645  


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>