In [1]:
#from google.colab import drive

#drive.mount('/content/gdrive')
#root = '/content/gdrive/MyDrive/Colab Notebooks/dataset/'

root = './Dataset/'

In [2]:
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
import time
from torch.optim import Adam
from torch import nn
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.utils import shuffle
import nltk
import matplotlib.pyplot as plt
import emoji
import copy
import pickle
pd.set_option('display.max_colwidth', None)


In [3]:
def add_hashtag(dataset):
    augmented_dataset = dataset.copy()
    augmented_dataset['hashtag'] = ' '
    data = []

    for i in tqdm(range(len(dataset)), desc= "Looking for hashtag"):
        text = dataset.loc[i,'text']
        dato = [k for k in text.split() if k.startswith("#")]
        data.append(dato)

    augmented_dataset['hashtag'] = data
    augmented_dataset['hashtag'] = augmented_dataset['hashtag'].apply(lambda x: " ".join(x).replace('#', ''))


    return augmented_dataset

In [4]:
import re
def add_emoji(dataset):

    augmented_dataset = dataset.copy()
    augmented_dataset['emoji'] = ' '

    for i in tqdm(range(len(dataset)), desc= "Looking for emoji"):
        text = dataset.loc[i,'text']
        data = emoji.demojize(text, language='it')
        pattern = r":(\w+):"

        emoji_found = re.findall(pattern,data)
        emoji_found = ' '.join(emoji_found)
        augmented_dataset.at[i,'emoji'] = emoji_found

    return augmented_dataset

In [5]:
hyperparameters = {
    "#_classes" : 1,
    "epochs": 30,
    "learning_rate": 1e-6,
    "batch_size": 3,
    "dropout": 0.1,
    "stopwords": False,
    "h_dim": 768,
    "patience": 10,
    "min_delta": 0.01,
    "language_model": "bert-base-multilingual-cased",
    "extra_features": 65, #32 emoji + 32 hashtag + 1 char_count
}

In [6]:
class ClassifierDeep(nn.Module):

    def __init__(self, labels, hdim, dropout, model_name,extra_features = hyperparameters['extra_features']):
        super(ClassifierDeep, self).__init__()
        config = AutoConfig.from_pretrained(model_name)
        self.lm_model = AutoModel.from_pretrained(model_name, config=config)
        self.classifier = nn.Sequential(
            nn.Linear(hdim + extra_features, 512),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
            )

    def forward(self, input_id_text, attention_mask, char_count,emoji_text, hashtag):
        output = self.lm_model(input_id_text, attention_mask).last_hidden_state
        output = output[:,0,:]
        output = torch.cat((output, char_count.unsqueeze(-1), emoji_text, hashtag), dim=1)  # Concatena il conteggio dei caratteri
        return self.classifier(output)

In [11]:
with open("best_weight_original.pkl", 'rb') as f:
  weights = pickle.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(hyperparameters["language_model"])


result_dataset = pd.DataFrame(columns=['id', 'predicted_label'])



model = ClassifierDeep(hyperparameters["#_classes"],
                    hyperparameters["h_dim"],
                    hyperparameters["dropout"],
                    hyperparameters["language_model"]).to(device)
model.load_state_dict(weights)

model.eval()

test_set = pd.read_csv(f'{root}subtaskA_test.csv',  header=0, names=['id', 'text'])
test_set['text'] = test_set['text'].apply(lambda x: x.replace('\r', ' ').replace('\n', ' '))
test_set['char_count'] = test_set['text'].str.len()
test_set = add_emoji(test_set)
test_set = add_hashtag(test_set)
test_set.drop(columns=['id'], inplace=True)

batch_size = 5

with torch.no_grad():
    for i in tqdm(range(0,len(test_set),batch_size)):
        tokens = tokenizer(list(test_set["text"].iloc[i:i+batch_size]), add_special_tokens=True,
                                   return_tensors='pt', padding='max_length',
                                   max_length = 512, truncation=True)
    
        tokens_emoji = tokenizer(list(test_set["emoji"].iloc[i:i+batch_size]), add_special_tokens=True,
                            return_tensors='pt', padding='max_length',
                            max_length = 32, truncation=True)
        tokens_hashtag = tokenizer(list(test_set["hashtag"].iloc[i:i+batch_size]), add_special_tokens=True,
                                return_tensors='pt', padding='max_length',
                                max_length = 32, truncation=True)
    
        input_id_texts = tokens['input_ids'].squeeze(1).to(device)
        mask_texts = tokens['attention_mask'].squeeze(1).to(device)
    
        input_id_emoji = tokens_emoji['input_ids'].squeeze(1).to(device)
        mask_emoji = tokens_emoji['attention_mask'].squeeze(1).to(device)
    
        input_id_hashtag = tokens_hashtag['input_ids'].squeeze(1).to(device)
        mask_hashtag = tokens_hashtag['attention_mask'].squeeze(1).to(device)
    
        batch_char_count = [torch.tensor(char_count) for char_count in test_set["char_count"].iloc[i:i+batch_size]] 
        batch_char_count = torch.stack(batch_char_count)
        batch_char_count=batch_char_count.to(device)
    
        output = model(input_id_texts, mask_texts,batch_char_count,input_id_emoji, input_id_hashtag)
        
        input_id_texts = input_id_texts.detach().cpu()
        mask_texts = mask_texts.detach().cpu()
        output = output.detach().cpu()
        batch_char_count = batch_char_count.detach().cpu()
        output = output.round().numpy().tolist()
        current_results = pd.DataFrame({'predicted_label':output})
        result_dataset = pd.concat([result_dataset,current_results],ignore_index=True)


result_dataset['id'] = result_dataset.index


result_dataset.to_csv('results_original_datset.csv')
result_dataset.head()

            

Looking for emoji:   0%|          | 0/460 [00:00<?, ?it/s]

Looking for hashtag:   0%|          | 0/460 [00:00<?, ?it/s]

  0%|          | 0/92 [00:00<?, ?it/s]

Unnamed: 0,id,predicted_label
0,0,[1.0]
1,1,[0.0]
2,2,[0.0]
3,3,[0.0]
4,4,[0.0]


In [13]:
with open("best_weight.pkl", 'rb') as f:
  weights = pickle.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(hyperparameters["language_model"])


result_dataset = pd.DataFrame(columns=['id', 'predicted_label'])



model = ClassifierDeep(hyperparameters["#_classes"],
                    hyperparameters["h_dim"],
                    hyperparameters["dropout"],
                    hyperparameters["language_model"]).to(device)
model.load_state_dict(weights)

model.eval()

test_set = pd.read_csv(f'{root}subtaskA_test.csv',  header=0, names=['id', 'text'])
test_set['text'] = test_set['text'].apply(lambda x: x.replace('\r', ' ').replace('\n', ' '))
test_set['char_count'] = test_set['text'].str.len()
test_set = add_emoji(test_set)
test_set = add_hashtag(test_set)
test_set.drop(columns=['id'], inplace=True)

batch_size = 5

with torch.no_grad():
    for i in tqdm(range(0,len(test_set),batch_size)):
        tokens = tokenizer(list(test_set["text"].iloc[i:i+batch_size]), add_special_tokens=True,
                                   return_tensors='pt', padding='max_length',
                                   max_length = 512, truncation=True)
    
        tokens_emoji = tokenizer(list(test_set["emoji"].iloc[i:i+batch_size]), add_special_tokens=True,
                            return_tensors='pt', padding='max_length',
                            max_length = 32, truncation=True)
        tokens_hashtag = tokenizer(list(test_set["hashtag"].iloc[i:i+batch_size]), add_special_tokens=True,
                                return_tensors='pt', padding='max_length',
                                max_length = 32, truncation=True)
    
        input_id_texts = tokens['input_ids'].squeeze(1).to(device)
        mask_texts = tokens['attention_mask'].squeeze(1).to(device)
    
        input_id_emoji = tokens_emoji['input_ids'].squeeze(1).to(device)
        mask_emoji = tokens_emoji['attention_mask'].squeeze(1).to(device)
    
        input_id_hashtag = tokens_hashtag['input_ids'].squeeze(1).to(device)
        mask_hashtag = tokens_hashtag['attention_mask'].squeeze(1).to(device)
    
        batch_char_count = [torch.tensor(char_count) for char_count in test_set["char_count"].iloc[i:i+batch_size]] 
        batch_char_count = torch.stack(batch_char_count)
        batch_char_count=batch_char_count.to(device)
    
        output = model(input_id_texts, mask_texts,batch_char_count,input_id_emoji, input_id_hashtag)
        
        input_id_texts = input_id_texts.detach().cpu()
        mask_texts = mask_texts.detach().cpu()
        output = output.detach().cpu()
        batch_char_count = batch_char_count.detach().cpu()
        output = output.round().numpy().tolist()
        current_results = pd.DataFrame({'predicted_label':output})
        result_dataset = pd.concat([result_dataset,current_results],ignore_index=True)


result_dataset['id'] = result_dataset.index


result_dataset.to_csv('results_augmented_datset.csv')
result_dataset.head()

            

Looking for emoji:   0%|          | 0/460 [00:00<?, ?it/s]

Looking for hashtag:   0%|          | 0/460 [00:00<?, ?it/s]

  0%|          | 0/92 [00:00<?, ?it/s]

Unnamed: 0,id,predicted_label
0,0,[1.0]
1,1,[0.0]
2,2,[0.0]
3,3,[0.0]
4,4,[0.0]
