In [1]:
# Execute this cell to synchronise Google Drive when running on Colab.
# Outside Colab, edit the fallback `path_files` below so it points at the project folder.
try:
    from google.colab import drive
except ImportError:
    # google.colab cannot be imported here, so treat this as a non-Colab run.
    COLAB = False
    print('Note: not using Google CoLab')
    # Later cells read `path_files`; define it here so they do not fail with a NameError.
    # Keep the trailing slash: file names are appended by plain string concatenation.
    path_files = './'
else:
    # Only the import is guarded: a failing mount is a real error and should surface
    # instead of being reported as "not using Google CoLab".
    drive.mount('/content/drive', force_remount=True)
    COLAB = True
    print('Note: using Google CoLab')
    path_files = '/content/drive/Othercomputers/Mon ordinateur portable/CS/SM11/NLP/Ponchon_Deneve_Setrouk_Demy/'

Mounted at /content/drive
Note: using Google CoLab


In [2]:
!pip install -r '/content/drive/Othercomputers/Mon ordinateur portable/CS/SM11/NLP/Ponchon_Deneve_Setrouk_Demy/requirements.txt'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import re
import torch
import pandas as pd
import numpy as np
from typing import List
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import nltk
nltk.download('wordnet')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def clean_sentences(df):
    """
    Return a copy of `df` with its 'target' and 'sentence' columns normalised.

    Each value of those two columns is lower-cased, stripped of every character
    that is neither a word character nor whitespace, stripped of digits, and
    finally split on whitespace so that each word can be lemmatized as a verb
    and re-joined with single spaces.

    Args:
        df: DataFrame containing string columns named 'target' and 'sentence'.

    Returns:
        A new DataFrame; the input frame is left unmodified.
    """
    # Build the lemmatizer once instead of once per word.
    lemmatizer = WordNetLemmatizer()

    def _normalise(text):
        # Lower-case to avoid capital letters issues.
        text = text.lower()
        # Remove punctuation using regex.
        text = re.sub(r'[^\w\s]', '', text)
        # Remove numbers using regex.
        text = re.sub(r'\d+', '', text)
        # Lemmatize the verbs.
        return " ".join(lemmatizer.lemmatize(word, 'v') for word in text.split())

    clean_df = df.copy()
    for column in ['target', 'sentence']:
        # Single pass per column; same result as the four successive passes.
        clean_df[column] = clean_df[column].apply(_normalise)

    return clean_df


class Classifier():
    """
    Aspect-based sentiment classifier that fine-tunes `bert-base-uncased`.

    Every example is encoded as the single string
    "<sentence>[SEP]<aspect>[SEP]<target>" and classified into one of the three
    polarities listed in `mapping_dict`.
    """

    def __init__(self):
        # Label <-> class index mappings, used for training targets and to decode predictions.
        self.mapping_dict = {'positive': 2, 'neutral': 1, 'negative': 0}
        self.reverse_mapping_dict = {v: k for k, v in self.mapping_dict.items()}
        self.tokenizer_self_bert = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3,
            output_attentions=False, output_hidden_states=False)
        self.batch_size = 16
        self.epochs = 8
        # Fixed padded/truncated length (in tokens) of every encoded example.
        # Defined here so that predict() also works on an instance that was not trained.
        self.max_sentence_length = 128
        # Small learning rate: fine-tuning should not disturb the pretrained weights too much.
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr = 5e-5, eps = 1e-08)


    @staticmethod
    def _build_bert_input(clean_df):
        """Concatenate sentence, aspect and target into one '[SEP]'-separated string per row."""
        return (clean_df['sentence'].astype(str) + '[SEP]'
                + clean_df['aspect'].astype(str) + '[SEP]'
                + clean_df['target'].astype(str))


    @staticmethod
    def _to_tensors(token_df, device):
        """Stack the per-row 'input_ids' and 'attention_mask' lists into two tensors on `device`."""
        ids = torch.tensor(np.vstack(token_df['input_ids'].apply(np.ravel))).to(device)
        mask = torch.tensor(np.vstack(token_df['attention_mask'].apply(np.ravel))).to(device)
        return ids, mask


    def tokenize(self, df):
        """
        Encode the 'bert_encoded' column of `df` with the BERT tokenizer.

        Args:
            df: DataFrame with a string column 'bert_encoded'.

        Returns:
            A copy of `df` extended with the columns 'input_ids' and
            'attention_mask' (one list of length `self.max_sentence_length`
            per row). The tokenizer's 'token_type_ids' column is dropped.
        """
        token_df = df.copy()
        # truncation=True is required together with padding='max_length': without it an
        # example longer than max_length keeps its full length and the rows can no longer
        # be stacked into a rectangular array.
        token_df['bert_encoded_dict'] = token_df['bert_encoded'].apply(
            lambda x: self.tokenizer_self_bert.encode_plus(text=x, add_special_tokens=True,
            padding='max_length', truncation=True, max_length=self.max_sentence_length,
            return_attention_mask=True))
        # Expand the dict returned by the tokenizer into one column per key.
        token_df = pd.concat([token_df.drop(['bert_encoded_dict'], axis=1), token_df['bert_encoded_dict'].apply(pd.Series)], axis=1)
        del token_df['token_type_ids']
        # Positional access so the report does not depend on the index containing label 0.
        print(f"Input vectors final length: {np.vstack(token_df['input_ids'].apply(np.ravel).iloc[0]).shape}")
        return token_df


    def train(self, train_filename: str, dev_filename: str, device: torch.device = device):
        """
        Trains the classifier model on the training set stored in file train_filename.

        Args:
            train_filename: tab-separated file without header whose columns are
                polarity, aspect, target, position, sentence.
            dev_filename: accepted for interface compatibility; not used here.
            device: torch device on which the model and the data are placed.
        """

        # We load the data and clean the text
        data = pd.read_csv(train_filename, sep='\t', header=None, names=['polarity', 'aspect', 'target', 'position', 'sentence'])
        clean_data = clean_sentences(data)

        # Build the single input string per example. The padded length is the fixed
        # self.max_sentence_length: a whitespace word count would not match the number
        # of tokens produced by the tokenizer.
        clean_data['bert_encoded'] = self._build_bert_input(clean_data)
        print(f'Maximum sentence length in training data: {self.max_sentence_length}')

        # Now we need to tokenize the text using BertTokenizer and to format the input vectors
        tokenize_data = self.tokenize(clean_data)
        tokenize_data['polarity'] = tokenize_data['polarity'].map(self.mapping_dict)
        token_ids, token_attention = self._to_tensors(tokenize_data, device)
        token_labels = torch.tensor(tokenize_data['polarity'].values).to(device)

        # Train set and prepare DataLoader
        train_set = TensorDataset(token_ids, token_attention, token_labels)
        train_dataloader = DataLoader(train_set, sampler=RandomSampler(train_set), batch_size=self.batch_size)

        # ---------- TRAINING LOOP ----------
        self.model = self.model.to(device)
        for epoch in range(self.epochs):
            self.model.train()
            # Sum of the batch losses over the epoch (not an average).
            tr_loss = 0

            for step, batch in enumerate(train_dataloader):
                b_input_ids, b_input_mask, b_labels = batch
                self.optimizer.zero_grad()
                # Forward pass
                train_output = self.model(b_input_ids, token_type_ids=None,
                                          attention_mask=b_input_mask, labels=b_labels)
                # Backward pass
                train_output.loss.backward()
                self.optimizer.step()
                # Update tracking variables
                tr_loss += train_output.loss.item()
            print(f'Epoch {epoch}: training loss = {tr_loss}')


    def predict(self, data_filename: str, device: torch.device=device) -> List[str]:
        """
        Predicts class labels for the input instances in file 'data_filename'.
        Returns the list of predicted labels.

        Args:
            data_filename: tab-separated file without header, same columns as the training file.
            device: torch device on which the model and the data are placed.

        Returns:
            One label string per input row, in file order. The list is also
            stored on the instance as `self.pred`.
        """

        # We load the test data and clean the text
        data_test = pd.read_csv(data_filename, sep = "\t", header=None, names = ['polarity', 'aspect', 'target', 'position', 'sentence'])
        clean_test_data = clean_sentences(data_test)

        # Same input format and tokenization as in train()
        clean_test_data['bert_encoded'] = self._build_bert_input(clean_test_data)
        tokenize_test_data = self.tokenize(clean_test_data)

        # Format the test input vectors and prepare DataLoader
        test_token_ids, test_token_attention = self._to_tensors(tokenize_test_data, device)
        test_set = TensorDataset(test_token_ids, test_token_attention)
        test_dataloader = DataLoader(test_set, sampler=SequentialSampler(test_set), batch_size=self.batch_size)

        # ---------- INFERENCE LOOP ----------
        # The inputs live on `device`; make sure the model does too, even when predict()
        # is called with a device different from the one used for training.
        self.model = self.model.to(device)
        self.model.eval()
        self.pred = []
        for batch in test_dataloader:
            b_input_ids, b_input_mask = batch
            with torch.no_grad():
                # Forward pass
                eval_output = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            pred_polarity = np.argmax(eval_output.logits.cpu().detach().numpy(), axis=1)
            self.pred += [self.reverse_mapping_dict[x] for x in pred_polarity]

        return self.pred

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import time, sys
import numpy as np
import argparse
import torch


def set_reproducible():
    """
    Put every random number generator used by the experiment in a fixed state.

    Seeds NumPy, Python's `random` module and PyTorch, and sets the
    PYTHONHASHSEED environment variable.
    """
    import random as rn
    import os
    # PYTHONHASHSEED is read when an interpreter starts, so setting it here
    # can only influence processes launched after this point.
    os.environ['PYTHONHASHSEED'] = '0'
    # The below is necessary for starting Numpy generated random numbers
    # in a well-defined initial state.
    np.random.seed(17)
    # The below is necessary for starting core Python generated random numbers
    # in a well-defined state.
    rn.seed(12345)
    # PyTorch has its own generator (used for weight initialisation, dropout and
    # data shuffling); without this call the runs are not repeatable.
    torch.manual_seed(17)


def load_label_output(filename):
    """
    Read the labels stored in the first column of a tab-separated file.

    Lines that are empty or contain only whitespace are skipped. For every
    other line, the text before the first tab (after stripping the line) is
    returned.
    """
    labels = []
    with open(filename, 'r', encoding='UTF-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            labels.append(stripped.split("\t")[0])
    return labels


def eval_list(glabels, slabels):
    """
    Compute the accuracy (in percent) of system labels against gold labels.

    Args:
        glabels: list of gold labels.
        slabels: list of system (predicted) labels.

    Returns:
        Percentage of positions, among the first min(len(glabels), len(slabels)),
        at which both lists agree. Returns 0.0 when there is nothing to compare.
    """
    if len(glabels) != len(slabels):
        print("\nWARNING: label count in system output (%d) is different from gold label count (%d)\n" % (
        len(slabels), len(glabels)))
    n = min(len(slabels), len(glabels))
    if n == 0:
        # Nothing to compare: avoid dividing by zero.
        return 0.0
    # zip stops at the shorter list, i.e. after exactly n pairs.
    correct_count = sum(1 for gold, system in zip(glabels, slabels) if system == gold)
    acc = correct_count / n
    return acc*100


def train_and_eval(classifier, trainfile, devfile, testfile, run_id, device):
    """
    Train `classifier` and measure its accuracy on the dev set and, optionally, the test set.

    Args:
        classifier: object exposing train(trainfile, devfile, device) and
            predict(datafile, device).
        trainfile: path of the training data file.
        devfile: path of the dev data file.
        testfile: path of the test data file, or None to skip test evaluation.
        run_id: identifier of the run, only used in the progress messages.
        device: torch device forwarded to the classifier.

    Returns:
        Tuple (devacc, testacc) of accuracies in percent; testacc is -1 when
        `testfile` is None.
    """
    print(f"\nRUN: {run_id}")
    print("  %s.1. Training the classifier..." % str(run_id))
    classifier.train(trainfile, devfile, device)
    print()
    print("  %s.2. Eval on the dev set..." % str(run_id), end="")
    slabels = classifier.predict(devfile, device)
    glabels = load_label_output(devfile)
    devacc = eval_list(glabels, slabels)
    print(" Acc.: %.2f" % devacc)
    testacc = -1
    if testfile is not None:
        # Evaluation on the test data
        print("  %s.3. Eval on the test set..." % str(run_id), end="")
        # Forward the device, as for the dev set, so that training and both
        # evaluations run on the same device.
        slabels = classifier.predict(testfile, device)
        glabels = load_label_output(testfile)
        testacc = eval_list(glabels, slabels)
        print(" Acc.: %.2f" % testacc)
    print()
    return (devacc, testacc)


if __name__ == "__main__":
    # ---- Experiment configuration ----
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_runs = 5
    set_reproducible()

    # Data files are located relative to the project folder chosen in the first cell.
    datadir = path_files
    trainfile, devfile = datadir + "data/traindata.csv", datadir + "data/devdata.csv"
    # No test set is evaluated unless a path is given here.
    testfile = None
    # testfile = datadir + "testdata.csv"

    # ---- Runs: a fresh classifier is trained and evaluated n_runs times ----
    start_time = time.perf_counter()
    devaccs, testaccs = [], []
    for i in range(1, n_runs+1):
        classifier = Classifier()
        devacc, testacc = train_and_eval(classifier, trainfile, devfile, testfile, i, device)
        # Accuracies are stored rounded to two decimals.
        devaccs.append(np.round(devacc,2))
        testaccs.append(np.round(testacc,2))
    print('\nCompleted %d runs.' % n_runs)
    total_exec_time = (time.perf_counter() - start_time)

    # ---- Summary: per-run accuracies, then mean (standard deviation) ----
    print("Dev accs:", devaccs)
    print("Test accs:", testaccs)
    print()
    for fmt, accs in (("Mean Dev Acc.: %.2f (%.2f)", devaccs),
                      ("Mean Test Acc.: %.2f (%.2f)", testaccs)):
        print(fmt % (np.mean(accs), np.std(accs)))
    print("\nExec time: %.2f s. ( %d per run )" % (total_exec_time, total_exec_time / n_runs))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


RUN: 1
  1.1. Training the classifier...
Maximum sentence length in training data: 128
Input vectors final length: (128, 1)
Epoch 0: training loss = 62.855787977576256
Epoch 1: training loss = 40.27422474324703
Epoch 2: training loss = 26.343893358949572
Epoch 3: training loss = 18.21130006434396
Epoch 4: training loss = 11.52524868492037
Epoch 5: training loss = 13.57446880324278
Epoch 6: training loss = 7.185713369981386
Epoch 7: training loss = 3.102986018988304

  1.2. Eval on the dev set...Input vectors final length: (128, 1)
 Acc.: 82.98



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


RUN: 2
  2.1. Training the classifier...
Maximum sentence length in training data: 128
Input vectors final length: (128, 1)
Epoch 0: training loss = 76.52230414748192
Epoch 1: training loss = 49.71830156445503
Epoch 2: training loss = 33.86152597516775
Epoch 3: training loss = 23.330589368008077
Epoch 4: training loss = 15.818208580836654
Epoch 5: training loss = 13.094537431956269
Epoch 6: training loss = 10.408812752459198
Epoch 7: training loss = 7.915469855535775

  2.2. Eval on the dev set...Input vectors final length: (128, 1)
 Acc.: 81.12



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


RUN: 3
  3.1. Training the classifier...
Maximum sentence length in training data: 128
Input vectors final length: (128, 1)
Epoch 0: training loss = 67.62197180464864
Epoch 1: training loss = 42.42142842710018
Epoch 2: training loss = 28.888348788022995
Epoch 3: training loss = 18.7381225368008
Epoch 4: training loss = 14.833952088607475
Epoch 5: training loss = 9.807166145415977
Epoch 6: training loss = 9.478793879738078
Epoch 7: training loss = 6.975359396717977

  3.2. Eval on the dev set...Input vectors final length: (128, 1)
 Acc.: 83.24



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


RUN: 4
  4.1. Training the classifier...
Maximum sentence length in training data: 128
Input vectors final length: (128, 1)
Epoch 0: training loss = 65.49192059785128
