In [None]:
import pandas as pd
import re

from copy import deepcopy
from itertools import chain
from math import ceil
from tqdm import tqdm

import torch
from torch import nn

from transformers import RobertaModel
from transformers import RobertaTokenizer

In [None]:
import pickle5 as pickle

# NOTE(security): unpickling executes whatever code the file embeds.
# Only load this dataset from a trusted source.
# `df` is bound directly by the load; pre-initialising it to None only hid
# a failed load behind a later, less explicit AttributeError.
with open('./sustainalytics_dataset.pkl', 'rb') as fh:
    df = pickle.load(fh)

df.head()

Unnamed: 0,name,rating,content
0,1&1 Drillisch AG,21.1,1&1 Drillisch AG (formerly known as: Drillisch...
1,2i Rete Gas SpA,37.2,"Headquartered in Milan, 2i Rete Gas is the se..."
2,"2U, Inc.",15.7,"2U, Inc. (formerly 2tor Inc.) is an American e..."
4,3i Group PLC,12.6,3i Group plc is a British multinational privat...
5,3i Infrastructure PLC,22.1,3i Infrastructure plc (LSE: 3IN) is an investm...


In [None]:
# Keep only the articles whose length, counted in space-separated words,
# lies within [MIN_WORDS, MAX_WORDS] (both bounds inclusive).
MIN_WORDS = 400
MAX_WORDS = 5500

# Count the words once instead of once per bound.
word_counts = df['content'].apply(lambda content: len(content.split(' ')))

# `.copy()` makes the result an independent frame, so that columns can be
# assigned on it later without writing into a slice of the unfiltered frame.
df = df[(word_counts <= MAX_WORDS) & (word_counts >= MIN_WORDS)].copy()
len(df)

2100

In [None]:
def clean_text(content):
    """Flatten a raw article into single-spaced, one-line text.

    The substitutions are applied in order:
      1. every newline becomes a space;
      2. every run of two or more '=' characters becomes a space
         (presumably wiki heading markers - confirm against the raw data);
      3. every run of two or more spaces collapses to a single space.

    Args:
        content (str): Raw text.

    Returns:
        str: The cleaned text.
    """
    substitutions = (
        ('\n', ' '),
        ('[=]{2,}', ' '),
        ('[ ]{2,}', ' '),
    )
    for pattern, replacement in substitutions:
        content = re.sub(pattern, replacement, content)
    return content

# `df` was produced by boolean filtering; assigning a column on it directly
# writes into a slice (pandas chained-assignment warning). Work on an explicit
# copy so the assignment is unambiguous.
df = df.copy()
df['content'] = df['content'].apply(clean_text)

# Model inputs (cleaned texts) and regression targets (ratings)
X = list(df['content'])
y = list(df['rating'])

print(len(X))

# Range and mean of the targets
print(min(y))
print(max(y))
print(sum(y)/len(y))

2100
7.2
67.4
24.88809523809527


In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed: without it every kernel restart produces a different
# train/test partition and the reported errors are not reproducible.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SPLIT_SEED
)
print(len(X_train))
print(len(X_test))

# Average number of words per wiki article
print(df['content'].apply(lambda content: len(content.split(' '))).mean())

1890
210
1544.3685714285714


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Standardise the training targets; the fitted scaler is kept so that
# predictions can be mapped back to the original rating scale.
# NOTE: this cell overwrites `y_train` with its standardised version, so
# running it a second time re-fits the scaler on already standardised values.
# Re-run the split cell first if this cell has to be executed again.
scaler = StandardScaler()
y_train = scaler.fit_transform(
    np.asarray(y_train, dtype=float).reshape(-1, 1)
).ravel()

In [None]:
max_length = 200
overlap_length = 50

def split_tokens_sequence(tokens_sequence):
    """Cut a flat sequence into overlapping, fixed-width chunks.

    Consecutive chunks start `max_length - overlap_length` positions apart and
    hold at most `max_length` items, so neighbouring chunks share
    `overlap_length` items. Chunks shorter than `max_length` are right-padded
    with zeros.

    Args:
        tokens_sequence (List[int]): Flat list of values (token ids or
            attention-mask entries).

    Returns:
        Tensor: Tensor of shape (number of chunks, `max_length`); always at
            least one chunk.
    """
    total = len(tokens_sequence)
    stride = max_length - overlap_length

    # One chunk starting at 0, then as many strided chunks as are needed to
    # reach the end of the sequence (none when it fits in a single chunk).
    extra_chunks = ceil((total - max_length) / stride)
    starts = [0] + [stride * (k + 1) for k in range(extra_chunks)]

    chunks = [
        torch.tensor(tokens_sequence[start : min(start + max_length, total)])
        for start in starts
    ]

    # The first chunk fixes the common width: pad it explicitly when the whole
    # sequence is shorter than `max_length`.
    if max_length > total:
        chunks[0] = nn.functional.pad(chunks[0], (0, max_length - total))

    # Right-pads the (possibly shorter) last chunk to the common width.
    return nn.utils.rnn.pad_sequence(chunks, batch_first=True)

In [None]:
class Model:
    """Document-level regressor: RoBERTa encodes token chunks, an LSTM aggregates them.

    Each text is tokenised and cut into fixed-width chunks by
    `split_tokens_sequence`. Every chunk is encoded by RoBERTa (pooled output,
    768 features) and the chunk sequence of each document is fed to a
    single-layer LSTM; the LSTM output at the last chunk (size 1) is the
    prediction, expressed in the normalised target space of `scaler`.

    NOTE(review): an LSTM output is `o * tanh(c)` and therefore lies in
    (-1, 1). Standardised targets outside that interval cannot be reached by
    this head; consider a linear layer on top of a wider hidden state.
    """

    def __init__(self, max_items_per_batch, scaler):
        """Load the pretrained encoder and tokenizer and create the LSTM head.

        Args:
            max_items_per_batch (int): Maximum number of chunks sent through
                the encoder in one forward pass. Documents with more chunks
                than this are skipped (training) or rejected (prediction).
            scaler: Fitted target scaler exposing `inverse_transform` and
                `mean_` (e.g. a scikit-learn `StandardScaler`), used to map
                predictions back to the original scale.
        """
        self.max_items_per_batch = max_items_per_batch
        self.scaler = scaler

        self.model = RobertaModel.from_pretrained('roberta-base')
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        # `forward` builds its input as (documents, chunks, features) and
        # reads the last chunk with `[:, -1, :]`, so the LSTM must be
        # batch-first. With the default layout it would recurse over the
        # documents of the batch instead of the chunks of each document.
        self.lstm = nn.LSTM(input_size=768, hidden_size=1, num_layers=1,
                            batch_first=True)

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.lstm = self.lstm.to(self.device)

        print(f"Running model on {self.device}")


    def forward(self, sequences_lengths, input_ids_sequences, attention_masks_sequences=None, labels=None):
        """Compute forward propagation

        Args:
            sequences_lengths (List[int]): Number of chunks of each document,
                in the order in which the chunks are concatenated
            input_ids_sequences (Tensor): Chunks of token ids of all the
                documents, concatenated along the first dimension
            attention_masks_sequences (Tensor, optional): Attention masks
                matching `input_ids_sequences`. Defaults to None.
            labels (Tensor, optional): Tensor with the list of labels.
                Defaults to None.

        Returns:
            Tensor: If the labels are specified, returns the MSE loss;
                otherwise, returns the predicted outputs (one per document)
        """
        pooled = self.model(input_ids_sequences,
                            attention_mask=attention_masks_sequences)['pooler_output']

        # Regroup the chunk encodings per document. Shorter documents are
        # padded with zero rows at the FRONT so that the last time step of
        # every document is a real chunk.
        max_sequence_length = max(sequences_lengths)
        outputs_grouped = list()
        index = 0
        for length in sequences_lengths:
            outputs_grouped.append(
                nn.functional.pad(pooled[index:index + length, :],
                                  (0, 0, max_sequence_length - length, 0))
            )
            index += length
        lstm_inputs = nn.utils.rnn.pad_sequence(outputs_grouped, batch_first=True)

        # LSTM output at the last time step, one scalar per document
        outputs = torch.flatten(self.lstm(lstm_inputs)[0][:, -1, :])

        if labels is not None:
            loss_fct = nn.MSELoss()
            return loss_fct(outputs.float(), labels.float())

        return outputs


    def predict(self, X, y=None):
        """Predict the outputs of a batch of inputs

        The encoder is switched to evaluation mode for the duration of the
        call and its previous mode is restored afterwards, so that calling
        this method between two training epochs does not disable dropout for
        the rest of the training. Attention masks are passed to the encoder,
        as they are during training.

        Args:
            X (List[str]): The list of strings which outputs to predict.
                If one of the entries is too long to be handled, raises a
                RuntimeError, except when the list of labels `y` is known and
                given to the function
            y (List[float], optional): The list of labels. If specified,
                enables the model to drop the entries it cannot handle, and
                returns the labels related to the remaining entries.
                Defaults to None.

        Raises:
            RuntimeError: If `y` is None and one element of `X` is too long to
                be handled (its number of chunks is above
                `self.max_items_per_batch`)

        Returns:
            Tensor, Optional[List[float]]: Returns the predicted Tensor (in
                the normalised target space). If `y` is given, returns also
                the list of labels from which are removed all the entries
                that could not be handled.
        """
        was_training = self.model.training
        self.model.eval()

        try:
            tokenized = self.tokenizer(X)
            chunked_ids = [split_tokens_sequence(ids)
                           for ids in tokenized['input_ids']]
            chunked_masks = [split_tokens_sequence(mask)
                             for mask in tokenized['attention_mask']]

            batches = list()
            batch_ids, batch_masks, sequences_lengths = list(), list(), list()
            skipped_indexes = set()

            for index, (ids_chunks, mask_chunks) in enumerate(zip(chunked_ids, chunked_masks)):
                n_chunks = len(ids_chunks)

                if n_chunks > self.max_items_per_batch:
                    if y is None:
                        raise RuntimeError(f"The sequence must be shorter than {self.max_items_per_batch} chunks")
                    skipped_indexes.add(index)
                    continue

                # Close the current batch when this document does not fit
                if sum(sequences_lengths) + n_chunks > self.max_items_per_batch:
                    batches.append((batch_ids, batch_masks, sequences_lengths))
                    batch_ids, batch_masks, sequences_lengths = list(), list(), list()

                batch_ids.append(ids_chunks)
                batch_masks.append(mask_chunks)
                sequences_lengths.append(n_chunks)

            if len(batch_ids) > 0:
                batches.append((batch_ids, batch_masks, sequences_lengths))

            outputs = list()
            with torch.no_grad():
                for batch_ids, batch_masks, sequences_lengths in batches:
                    input_ids_sequences = torch.cat(batch_ids).to(self.device)
                    attention_masks_sequences = torch.cat(batch_masks).to(self.device)
                    outputs.append(self.forward(sequences_lengths,
                                                input_ids_sequences,
                                                attention_masks_sequences))
        finally:
            self.model.train(was_training)

        # `torch.cat` rejects an empty list (empty `X`, or every entry skipped)
        if len(outputs) > 0:
            predictions = torch.cat(outputs)
        else:
            predictions = torch.empty(0, device=self.device)

        if y is None:
            return predictions

        kept_labels = [label for index, label in enumerate(y)
                       if index not in skipped_indexes]
        return predictions, kept_labels


    def evaluate(self, X, y, normalized_y=False, return_random_error=True):
        """Compute the average prediction error of the model, given a
        test dataset

        Args:
            X (List[str]): List of inputs
            y (List[float]): List of outputs
            normalized_y (bool, optional): Specifies if `y` variables have
                been normalized. Defaults to False.
            return_random_error (bool, optional): If `True`, computes the
                random error, which corresponds to the default error if the
                model always returned the average value (`self.scaler.mean_`).
                Defaults to True.

        Returns:
            float, Optional[float]: Returns the mean absolute error, in the
                original target scale. If `return_random_error` is `True`,
                also returns the random average error.
        """
        outputs, y = self.predict(X, y)
        outputs = self._denormalize(outputs.cpu().numpy())

        targets = np.asarray(y, dtype=float).reshape(-1)
        if normalized_y:
            targets = self._denormalize(targets)

        mean_error = np.abs(outputs - targets).mean()

        if return_random_error:
            # Use the scaler owned by the instance, not a notebook global
            random_error = np.abs(self.scaler.mean_ - targets).mean()
            return mean_error, random_error

        return mean_error


    def _denormalize(self, values):
        """Map a 1-D array of normalised values back to the original scale.

        The scaler is given a column vector, since scikit-learn scalers
        validate their input as 2-D, and the result is flattened again.
        """
        column = np.asarray(values, dtype=float).reshape(-1, 1)
        return np.asarray(self.scaler.inverse_transform(column)).reshape(-1)


    def freeze_encoder(self):
        """Exclude the encoder parameters from gradient computation."""
        for param in self.model.parameters():
            param.requires_grad = False


    def unfreeze_encoder(self):
        """Make the encoder parameters trainable again."""
        for param in self.model.parameters():
            param.requires_grad = True


    @staticmethod
    def _triangular_learning_rates(lr_init, lr_max, n_epochs):
        """Build the per-epoch learning rates of the triangular policy.

        The rates rise linearly from `lr_init` to `lr_max` over the first half
        of the epochs, then decrease back to `lr_init`.

        NOTE(review): for `n_epochs <= 2` the rising part holds a single
        sample (`lr_init`), so the schedule is constant and `lr_max` is never
        used.
        """
        rise = np.linspace(lr_init, lr_max, ceil(n_epochs / 2))
        return np.concatenate((rise, np.flip(rise)[1:],
                               np.array([lr_init])))[:n_epochs]


    def _training_step(self, optimizer, batch_ids, batch_masks, batch_labels, sequences_lengths):
        """Run one optimisation step on an assembled batch; returns the detached loss."""
        input_ids_sequences = torch.cat(batch_ids).to(self.device)
        attention_masks_sequences = torch.cat(batch_masks).to(self.device)
        labels = torch.tensor(batch_labels, dtype=torch.float).to(self.device)

        optimizer.zero_grad()
        loss = self.forward(sequences_lengths,
                            input_ids_sequences,
                            attention_masks_sequences,
                            labels=labels)
        loss.backward()
        optimizer.step()

        return loss.detach()


    def _run_epoch(self, X, y, optimizer, max_batch_size):
        """Run one pass over the randomly ordered samples.

        Returns the loss of the last optimisation step, or None if no batch
        was run.
        """
        indexes_order = np.random.choice(range(len(X)),
                                         len(X),
                                         replace=False)
        batch_ids, batch_masks, batch_labels, sequences_lengths = list(), list(), list(), list()
        last_loss = None

        for index in tqdm(indexes_order):
            tokenized = self.tokenizer(X[index])
            ids_chunks = split_tokens_sequence(tokenized['input_ids'])
            mask_chunks = split_tokens_sequence(tokenized['attention_mask'])
            n_chunks = len(ids_chunks)

            # Ignore the item: it would not fit in a batch even alone
            if n_chunks > self.max_items_per_batch:
                continue

            batch_is_full = (len(sequences_lengths) == max_batch_size
                             or sum(sequences_lengths) + n_chunks > self.max_items_per_batch)

            # Run the batch then create a new one
            if len(sequences_lengths) > 0 and batch_is_full:
                last_loss = self._training_step(optimizer, batch_ids, batch_masks,
                                                batch_labels, sequences_lengths)
                batch_ids, batch_masks, batch_labels, sequences_lengths = list(), list(), list(), list()

            # Add the current item to the batch
            batch_ids.append(ids_chunks)
            batch_masks.append(mask_chunks)
            batch_labels.append(y[index])
            sequences_lengths.append(n_chunks)

        # Run the last, possibly incomplete, batch (there is none when every
        # item was skipped or `X` is empty)
        if len(sequences_lengths) > 0:
            last_loss = self._training_step(optimizer, batch_ids, batch_masks,
                                            batch_labels, sequences_lengths)

        return last_loss


    def train(self, X, y, n_epochs, max_batch_size, lr_init, lr_max, n_epochs_finetuning=0, X_test=None, y_test=None):
        """Train the model on a given dataset.
            * First train only the LSTM head for `n_epochs_finetuning` epochs,
              the encoder being frozen and kept in evaluation mode
            * Then train the whole model for `n_epochs` epochs
            * Randomly order the samples at the beginning of each epoch
            * Apply the "triangular learning rates" policy to the second
              stage: computes a list of adaptative learning rates, first
              linearly increasing from `lr_init` to `lr_max`, then linearly
              decreasing back to `lr_init`

        Args:
            X (List[str]): List of inputs
            y (List[float]): List of outputs, in the normalised target space
            n_epochs (int): Number of epochs training the whole model
            max_batch_size (int): Maximum size of a batch (due to the specific
                model, the size of the batch is not constant, but
                `max_batch_size` specifies its maximum value)
            lr_init (float): Initial value of the learning rate
            lr_max (float): Maximal value of the learning rate
            n_epochs_finetuning (int, optional): Number of preliminary epochs
                training only the LSTM head, at learning rate `lr_init`.
                Defaults to 0.
            X_test (List[str], optional): List of test inputs.
                If not `None` and `y_test` not `None`, at the end of each
                epoch, computes the evaluation of the model on this test
                dataset. Defaults to None.
            y_test (List[float], optional): List of test outputs.
                If not `None` and `X_test` not `None`, at the end of each
                epoch, computes the evaluation of the model on this test
                dataset. Defaults to None.
        """
        has_test_set = X_test is not None and y_test is not None
        lrs = self._triangular_learning_rates(lr_init, lr_max, n_epochs)

        if has_test_set:
            print(f"\nInit, evaluation: {self.evaluate(X_test, y_test)}")

        # Fine-tuning the superficial layers
        self.freeze_encoder()

        if n_epochs_finetuning > 0:
            # The frozen encoder is a fixed feature extractor: no dropout
            self.model.eval()
            optimizer = torch.optim.AdamW(self.lstm.parameters(),
                                          lr=lr_init)

            for epoch in range(n_epochs_finetuning):
                self._run_epoch(X, y, optimizer, max_batch_size)

                if has_test_set:
                    print(f"\nEpoch: {epoch}, evaluation: {self.evaluate(X_test, y_test)}")

        # Training the whole model
        self.unfreeze_encoder()
        self.model.train()

        # `lrs` is empty when `n_epochs` is 0: nothing to optimise
        if n_epochs > 0:
            optimizer = torch.optim.AdamW(chain(self.model.parameters(),
                                                self.lstm.parameters()),
                                          lr=lrs[0])

            for epoch in range(n_epochs):

                # Update the learning rate
                if epoch > 0:
                    print(f"LR: {lrs[epoch]}")
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lrs[epoch]

                last_loss = self._run_epoch(X, y, optimizer, max_batch_size)

                if last_loss is not None:
                    print(f"Loss: {last_loss.item()}")

                if has_test_set:
                    print(f"\nEpoch: {epoch}, evaluation: {self.evaluate(X_test, y_test)}")


In [None]:
# Upper bound on the number of token chunks sent through the encoder in a
# single forward pass; documents with more chunks than this are not handled.
max_items_per_batch = 40
model = Model(max_items_per_batch=max_items_per_batch, scaler=scaler)

Running model on cuda:0


In [None]:
# Two epochs on the whole model, with no preliminary head-only epochs.
# `y_train` holds the standardised targets, while `y_test` is kept in the
# original rating scale (the evaluation maps predictions back to that scale).
model.train(
    X_train,
    y_train,
    n_epochs=2,
    max_batch_size=3,
    lr_init=1e-5,
    lr_max=1e-4,
    n_epochs_finetuning=0,
    X_test=X_test,
    y_test=y_test,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (2019 > 512). Running this sequence through the model will result in indexing errors
  0%|          | 0/1890 [00:00<?, ?it/s]


Init, evaluation: (7.085056528230993, 7.067837398373984)


100%|██████████| 1890/1890 [12:39<00:00,  2.49it/s]


Loss: 2.5052616596221924


  0%|          | 0/1890 [00:00<?, ?it/s]


Epoch: 0, evaluation: (7.067838836297756, 7.067837398373984)
LR: 1e-05


100%|██████████| 1890/1890 [12:26<00:00,  2.53it/s]


Loss: 0.7577447891235352

Epoch: 1, evaluation: (7.067837477893364, 7.067837398373984)


In [None]:
# Spot-check on the first five held-out texts. The returned values are raw
# model outputs, i.e. in the standardised target space the model was trained
# on, not in rating units.
predictions = model.predict(X=X_test[:5])
predictions

tensor([8.4782e-18, 4.1742e-20, 6.6936e-22, 3.9714e-17, 1.4084e-21],
       device='cuda:0')