In [1]:
# Colab only: uncomment the three lines below to mount Google Drive and read
# the dataset from there instead of the local folder.
#from google.colab import drive

#drive.mount('/content/gdrive')
#root = '/content/gdrive/MyDrive/Colab Notebooks/dataset/'

# Folder holding the dataset CSV files. It is used as a plain string prefix
# when file paths are built, so it must end with a '/'.
root = './Dataset/'

In [2]:
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
import time
from torch.optim import Adam
from torch import nn
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.utils import shuffle
import nltk
import matplotlib.pyplot as plt
import emoji
import copy
import pickle
# Do not truncate long cell values (e.g. full texts) when displaying DataFrames.
pd.set_option('display.max_colwidth', None)


In [3]:
def add_hashtag(dataset):
    """Return a copy of ``dataset`` with an extra ``hashtag`` column.

    For every row, the whitespace-separated words of the ``text`` column that
    start with ``#`` are collected, joined with single spaces, and stripped of
    every ``#`` character (so ``"#a#b"`` becomes ``"ab"``). Rows without
    hashtags get an empty string.

    Args:
        dataset: pandas DataFrame with a ``text`` column. Any index is
            accepted; rows are processed by position.

    Returns:
        A new DataFrame (the input is not modified) with the same index and
        the additional ``hashtag`` column.
    """
    hashtags = []

    # Iterate over the column values rather than over ``dataset.loc[i, ...]``
    # so the function also works when the index is not 0..n-1 (e.g. after
    # filtering or shuffling without reset_index).
    for text in tqdm(dataset['text'].tolist(), desc= "Looking for hashtag"):
        # Missing / non-string text (e.g. NaN) simply has no hashtags.
        words = text.split() if isinstance(text, str) else []
        tagged = [word for word in words if word.startswith("#")]
        hashtags.append(" ".join(tagged).replace('#', ''))

    augmented_dataset = dataset.copy()
    augmented_dataset['hashtag'] = hashtags

    return augmented_dataset

In [4]:
# Central configuration dictionary for the notebook.
hyperparameters = {
    # Size of the classifier output layer.
    '#_classes': 4,

    # Not read by the cells visible in this notebook; presumably training
    # settings kept for reference (TODO confirm against the training code).
    'epochs': 30,
    'learning_rate': 1e-6,
    'batch_size': 3,

    # Dropout probability used inside the classification head.
    'dropout': 0.1,

    # Not read by the cells visible in this notebook.
    'stopwords': False,

    # First-layer input size contributed by the language model.
    'h_dim': 768,

    # Not read by the cells visible in this notebook; presumably
    # early-stopping settings (TODO confirm).
    'patience': 10,
    'min_delta': 0.01,

    # Checkpoint name passed to the tokenizer / model loaders.
    'language_model': 'bert-base-multilingual-cased',

    # Number of extra input values appended to the language-model embedding:
    # 32 values for the hashtags.
    'extra_features': 32,
}

In [5]:
class ClassifierDeep(nn.Module):
    """Pretrained language model followed by an MLP classification head.

    The head receives the hidden state at sequence position 0 of the language
    model, concatenated with ``extra_features`` additional values per sample.

    NOTE: the attribute names ``lm_model`` / ``classifier`` and the order of
    the layers inside ``classifier`` define the state_dict keys, so they must
    not change if previously saved weights are to be loaded.
    """

    def __init__(self, labels, hdim, dropout, model_name, extra_features=hyperparameters['extra_features']):
        """Build the encoder and the classification head.

        Args:
            labels: number of output classes (size of the last linear layer).
            hdim: size of the language-model hidden state fed to the head.
            dropout: dropout probability used between the head layers.
            model_name: checkpoint name given to ``AutoConfig`` / ``AutoModel``.
            extra_features: number of additional per-sample values appended
                to the language-model embedding. The default is read from
                ``hyperparameters`` once, when the class is defined.
        """
        super().__init__()
        config = AutoConfig.from_pretrained(model_name)
        self.lm_model = AutoModel.from_pretrained(model_name, config=config)
        self.classifier = nn.Sequential(
            nn.Linear(hdim + extra_features, 512),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(128, labels),
            )

    def forward(self, input_id_text, attention_mask, hashtag):
        """Compute class scores for a batch.

        Args:
            input_id_text: token ids of the texts, passed to the language
                model as ``input_ids``.
            attention_mask: attention mask matching ``input_id_text``.
            hashtag: 2-D tensor with ``extra_features`` values per sample
                (the notebook passes the token ids of the hashtag string).

        Returns:
            Tensor with one row per sample and ``labels`` columns (raw,
            un-normalised scores).
        """
        # Keyword arguments keep the call correct even for architectures
        # whose forward() does not take attention_mask as second parameter.
        hidden_states = self.lm_model(input_ids=input_id_text,
                                      attention_mask=attention_mask).last_hidden_state
        # Hidden state of the first token of every sequence.
        first_token_state = hidden_states[:, 0, :]
        # The extra features may arrive as integers; cast them explicitly to
        # the embedding dtype instead of relying on implicit promotion in cat.
        hashtag = hashtag.to(dtype=first_token_state.dtype)
        # Append the hashtag features to the text embedding.
        features = torch.cat((first_token_state, hashtag), dim=1)
        return self.classifier(features)

In [None]:
# Inference on the sub-task B test set with the weights stored in
# "best_weight_original.pkl". This cell mirrors the inference cell that uses
# "best_weight.pkl"; only the weights file and the output CSV differ.

# SECURITY: pickle.load can execute arbitrary code contained in the file.
# Only load weight files produced by this project.
with open("best_weight_original.pkl", 'rb') as f:
    weights = pickle.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(hyperparameters["language_model"])

model = ClassifierDeep(hyperparameters["#_classes"],
                    hyperparameters["h_dim"],
                    hyperparameters["dropout"],
                    hyperparameters["language_model"]).to(device)
model.load_state_dict(weights)

# Evaluation mode: disables dropout.
model.eval()

test_set = pd.read_csv(f'{root}subtaskB_test.csv',  header=0, names=['id', 'text'])
# Flatten multi-line texts onto a single line.
test_set['text'] = test_set['text'].apply(lambda x: x.replace('\r', ' ').replace('\n', ' '))
test_set = add_hashtag(test_set)
test_set.drop(columns=['id'], inplace=True)

batch_size = 5

# Predictions are accumulated in a plain list and turned into a DataFrame
# once at the end (concatenating DataFrames inside the loop is quadratic).
predicted_labels = []

with torch.no_grad():
    for start in tqdm(range(0, len(test_set), batch_size)):
        batch = test_set.iloc[start:start + batch_size]

        tokens = tokenizer(list(batch["text"]), add_special_tokens=True,
                           return_tensors='pt', padding='max_length',
                           max_length = 512, truncation=True)

        # The hashtag ids are fed to the classifier head as extra features,
        # so their padded length must equal the head's extra input size.
        tokens_hashtag = tokenizer(list(batch["hashtag"]), add_special_tokens=True,
                                   return_tensors='pt', padding='max_length',
                                   max_length = hyperparameters["extra_features"],
                                   truncation=True)

        input_id_texts = tokens['input_ids'].to(device)
        mask_texts = tokens['attention_mask'].to(device)
        input_id_hashtag = tokens_hashtag['input_ids'].to(device)

        logits = model(input_id_texts, mask_texts, input_id_hashtag)

        # argmax over the class dimension gives one label per sample. A
        # (log-)softmax is not needed because it does not change the argmax.
        # The result is a 1-D tensor, so it is converted with tolist();
        # indexing its 0-dim elements with [0] would raise an IndexError.
        predicted_labels.extend(logits.argmax(dim=1).cpu().tolist())

# One row per test sample, in file order; the positional index is written as
# the first (unnamed) CSV column.
result_dataset = pd.DataFrame({'predicted_label': predicted_labels})

result_dataset.to_csv('results_original_datset.csv')
result_dataset.head()

            

In [None]:
# Inference on the sub-task B test set with the weights stored in
# "best_weight.pkl" (presumably the model trained on the augmented dataset,
# judging by the output file name). This cell mirrors the inference cell that
# uses "best_weight_original.pkl"; only the weights file and the output CSV
# differ.

# SECURITY: pickle.load can execute arbitrary code contained in the file.
# Only load weight files produced by this project.
with open("best_weight.pkl", 'rb') as f:
    weights = pickle.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(hyperparameters["language_model"])

model = ClassifierDeep(hyperparameters["#_classes"],
                    hyperparameters["h_dim"],
                    hyperparameters["dropout"],
                    hyperparameters["language_model"]).to(device)
model.load_state_dict(weights)

# Evaluation mode: disables dropout.
model.eval()

test_set = pd.read_csv(f'{root}subtaskB_test.csv',  header=0, names=['id', 'text'])
# Flatten multi-line texts onto a single line.
test_set['text'] = test_set['text'].apply(lambda x: x.replace('\r', ' ').replace('\n', ' '))
test_set = add_hashtag(test_set)
test_set.drop(columns=['id'], inplace=True)

batch_size = 5

# Predictions are accumulated in a plain list and turned into a DataFrame
# once at the end (concatenating DataFrames inside the loop is quadratic).
predicted_labels = []

with torch.no_grad():
    for start in tqdm(range(0, len(test_set), batch_size)):
        batch = test_set.iloc[start:start + batch_size]

        tokens = tokenizer(list(batch["text"]), add_special_tokens=True,
                           return_tensors='pt', padding='max_length',
                           max_length = 512, truncation=True)

        # The hashtag ids are fed to the classifier head as extra features,
        # so their padded length must equal the head's extra input size.
        tokens_hashtag = tokenizer(list(batch["hashtag"]), add_special_tokens=True,
                                   return_tensors='pt', padding='max_length',
                                   max_length = hyperparameters["extra_features"],
                                   truncation=True)

        input_id_texts = tokens['input_ids'].to(device)
        mask_texts = tokens['attention_mask'].to(device)
        input_id_hashtag = tokens_hashtag['input_ids'].to(device)

        logits = model(input_id_texts, mask_texts, input_id_hashtag)

        # argmax over the class dimension gives one label per sample. A
        # (log-)softmax is not needed because it does not change the argmax.
        # The result is a 1-D tensor, so it is converted with tolist();
        # indexing its 0-dim elements with [0] would raise an IndexError.
        predicted_labels.extend(logits.argmax(dim=1).cpu().tolist())

# One row per test sample, in file order; the positional index is written as
# the first (unnamed) CSV column.
result_dataset = pd.DataFrame({'predicted_label': predicted_labels})

result_dataset.to_csv('results_augmented_datset.csv')
result_dataset.head()

            

La differenza tra le risposte è quantificabile contando il numero di predizioni che cambiano a seconda dei pesi utilizzati. Usando il sito [diffchecker](https://www.diffchecker.com/) è possibile dare in input i due file di risultati (`results_original_datset.csv` e `results_augmented_datset.csv`) e trovare le differenze tra loro.

Per il Task B, le risposte differenti tra i due pesi sono 56 su 300, per una variazione del 18,667% circa.