In [2]:
!pip install datasets
!pip install clean-text
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface

In [3]:
import pandas as pd
from datasets import Dataset, DatasetDict
import re
from cleantext import clean
import torch
from sklearn.metrics import f1_score
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report



In [4]:
# Read the training split, discard the 'Id' and 'topic' columns and rename the
# remaining ones to the 'text' / 'label' names used in the rest of the notebook.
dataset = (
    pd.read_csv("../datasets/subtaskB_train.csv")
    .drop(columns=['Id', 'topic'])
    .rename(columns={'comment_text': 'text', 'conspiracy': 'label'})
)
dataset

Unnamed: 0,text,label
0,"Siamo davanti ad una prova, e non solo di quoz...",1
1,La storia dei 2 bimbi di Bergamo - vaccini: qu...,0
2,L'avete capito che non toglieranno il green pa...,0
3,Quindi la farsa dello spazio e della terra a ...,3
4,"In breve tempo, per accedere a Internet, sarà ...",1
...,...,...
805,"Incredibile!!!! EMA, Agenzia Europea del Farma...",0
806,Non ci saranno colloqui di pace con la Russia ...,2
807,"L'atmosfera è uno ""scudo protettivo"" che avvol...",3
808,OTTIMA NOTIZIA! Due ragioni per cui le élite n...,0


In [5]:
# Text normalisation applied to every comment before it is passed to the classifiers.

def text_cleaning(text):
    """Normalise a raw comment string.

    The text is lower-cased, passed through ``cleantext.clean`` with
    ``no_emoji=True`` and then through a fixed sequence of regex substitutions.

    Args:
        text: the raw comment (a ``str``).

    Returns:
        The cleaned string.

    NOTE(review): the substitutions are order-dependent and some of them look
    unintended (see the inline notes). They are deliberately left as they are:
    presumably the fine-tuned models were trained on text cleaned by exactly
    this function, so changing it would change their inputs -- confirm before
    "fixing" any of the patterns.
    """
    # Convert words to lower case
    text = text.lower()
    text = clean(text, no_emoji=True)

    # Drop anything enclosed in square brackets (non-greedy, within one line).
    text = re.sub(r'(\[.*?\])', '', text)
    # Replace dates written as dd/dd/dd or dd/dd/dddd with a space.
    text = re.sub(r'[0-9]{2}\/[0-9]{2}\/[0-9]{2,4}', ' ', text)
    # NOTE(review): '.*' runs to the end of the line, so this removes the URL
    # *and* everything after it on that line, plus the following line breaks.
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    # Replace assorted symbols (including '@', '/', '%', '+' and '_') with a space.
    text = re.sub(r'[_"\%()|+&=*%#$@\[\]/]', ' ', text)
    # NOTE(review): cannot match any more, the '/' was replaced by the line above.
    text = re.sub(r'<br />', ' ', text)
    # Collapse runs of repeated punctuation into a single character.
    text = re.sub(r'\.+','.', text)
    text = re.sub(r'\,+',',', text)
    text = re.sub(r'\!+','!', text)
    text = re.sub(r'\?+','?', text)
    # Line breaks are removed without inserting a space, then spaces are collapsed.
    text = re.sub(r'\n+','', text)
    text = re.sub(r' +', ' ', text)
    # Raw string so that '\.' is not an invalid string escape sequence.
    # NOTE(review): cannot match any more, every '@' was already replaced above.
    text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '', text)

    return text

In [6]:
# Replace every training comment with its cleaned version.
dataset['text'] = [text_cleaning(raw_comment) for raw_comment in dataset['text']]
dataset

Unnamed: 0,text,label
0,"siamo davanti ad una prova, e non solo di quoz...",1
1,la storia dei 2 bimbi di bergamo - vaccini: qu...,0
2,l'avete capito che non toglieranno il green pa...,0
3,quindi la farsa dello spazio e della terra a p...,3
4,"in breve tempo, per accedere a internet, sara ...",1
...,...,...
805,"incredibile! ema, agenzia europea del farmaco,...",0
806,non ci saranno colloqui di pace con la russia ...,2
807,l'atmosfera e uno scudo protettivo che avvolge...,3
808,ottima notizia! due ragioni per cui le elite n...,0


In [7]:
# Hugging Face Hub id of the multilingual BERT classifier
model_name_multi = "Mike00vito/best-multi-multiCLS"
# Hugging Face Hub id of the Italian BERT classifier
model_name_xxl = "Mike00vito/best-xxl-multiCLS"

In [8]:
# We load the models from the Hugging Face hub.
# Use the first GPU when one is available and fall back to the CPU (-1)
# otherwise, instead of failing on machines without CUDA.
device = 0 if torch.cuda.is_available() else -1

# Inputs are truncated to 512 tokens. return_all_scores=True is kept on purpose:
# the cells below read one {'label', 'score'} entry per class from every
# prediction and address the classes by position.
pipeline_kwargs = dict(max_length=512, truncation=True, device=device, return_all_scores=True)
model_multi = pipeline(model=model_name_multi, tokenizer=model_name_multi, **pipeline_kwargs)
model_xxl = pipeline(model=model_name_xxl, tokenizer=model_name_xxl, **pipeline_kwargs)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/670M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



Downloading (…)lve/main/config.json:   0%|          | 0.00/902 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/732k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [9]:
# Score every training comment with both models.
train_texts = dataset['text'].tolist()
preds_multi = model_multi(train_texts)
preds_xxl = model_xxl(train_texts)

In [10]:
# Build the feature table for the meta-classifier: one 'score_<model><k>' column
# per model and per class, holding the score found at position k of each prediction.
# The number of classes is taken from the first prediction instead of being hard-coded.
# NOTE(review): classes are addressed by position, which assumes each prediction
# lists the classes in a fixed order (0, 1, 2, ...) -- confirm for these pipelines.
df_merged = pd.concat(
    [
        pd.DataFrame({
            f'score_{tag}{k}': [sample[k]['score'] for sample in preds]
            for k in range(len(preds[0]))
        })
        for tag, preds in (('multi', preds_multi), ('xxl', preds_xxl))
    ],
    axis=1,
)
df_merged

Unnamed: 0,score_multi0,score_multi1,score_multi2,score_multi3,score_xxl0,score_xxl1,score_xxl2,score_xxl3
0,1.992111e-09,1.000000e+00,1.657312e-09,2.189033e-09,2.974283e-09,1.000000e+00,4.386736e-09,6.029433e-09
1,1.000000e+00,1.207360e-09,1.148556e-09,9.782495e-10,9.998316e-01,1.162832e-04,4.917211e-05,2.877948e-06
2,1.000000e+00,1.492321e-09,1.188274e-09,1.020958e-09,1.000000e+00,8.799034e-09,1.079428e-08,1.303164e-08
3,5.195674e-09,6.792068e-09,3.604434e-09,1.000000e+00,8.169734e-09,3.740572e-08,1.371648e-08,1.000000e+00
4,3.105024e-09,1.000000e+00,2.750002e-09,2.823546e-09,3.482640e-08,9.999996e-01,1.889344e-07,1.128255e-07
...,...,...,...,...,...,...,...,...
805,1.000000e+00,1.476314e-09,1.048342e-09,1.073422e-09,1.000000e+00,7.624797e-09,7.243163e-09,1.128055e-08
806,7.198278e-09,8.197159e-09,1.000000e+00,6.927113e-09,7.904752e-09,8.451769e-09,1.000000e+00,9.615748e-09
807,6.330481e-09,6.907784e-09,3.890030e-09,1.000000e+00,7.006576e-09,1.305336e-08,9.388110e-09,1.000000e+00
808,1.000000e+00,1.327156e-09,1.097570e-09,9.881227e-10,1.000000e+00,3.448446e-09,2.083724e-09,2.537101e-09


In [11]:
# Hyper-parameter grid explored for the logistic-regression meta-classifier.
param_grid_lr = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    max_iter=[20, 50, 100, 200, 500, 1000],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    class_weight=['balanced'],
)

In [12]:

from sklearn.linear_model import LogisticRegression

# Fixed seed so that the split and the stochastic solvers ('sag', 'saga') give
# the same result on every run.
SEED = 42

# Hold out 30% of the stacked score features (stratified by class) to check the
# tuned logistic-regression meta-classifier on data the grid search has not seen.
preds_train, preds_test, label_train, label_test = train_test_split(
    df_merged,
    dataset['label'],
    train_size=0.7,
    shuffle=True,
    stratify=dataset['label'],
    random_state=SEED,
)

logModel_grid = GridSearchCV(estimator=LogisticRegression(random_state=SEED), param_grid=param_grid_lr, verbose=1, cv=10, n_jobs=-1)
logModel_grid.fit(preds_train, label_train)

# Report the held-out performance of the best configuration found by the search.
# NOTE(review): the base-model scores in df_merged are almost one-hot, presumably
# because the base models were fine-tuned on these same comments; if so this
# report is optimistic -- confirm.
print(classification_report(label_test, logModel_grid.predict(preds_test)))

# Refit the best configuration on the whole training set for the final predictions.
logistic = LogisticRegression(random_state=SEED, **logModel_grid.best_params_)
logistic.fit(df_merged, dataset['label'])

Fitting 10 folds for each of 210 candidates, totalling 2100 fits


In [13]:
# Read the test split for which the submission is produced.
dataset_output = pd.read_csv(filepath_or_buffer="../datasets/subtaskB_test.csv")
dataset_output

Unnamed: 0,Id,comment_text
0,0,Ho fatto delle foto aeree ad una quota di circ...
1,1,Prof. Bellavite: tantissime persone costrette ...
2,2,"Ma si allontana.... su una terra PIATTA,......"
3,3,Non tutti gli Ukraini sono contro la Russia.\n...
4,4,Non opinioni ma dati.\n\nUno strepitoso **Giov...
...,...,...
295,295,Dopo l'esosfera viene lo spazio... giusto???
296,296,Ci sarebbero altri... basta che andate a veder...
297,297,"Certo certo, continuate a fare il gioco di Bid..."
298,298,Sapete cosa mi dicono gli astrofili?.. Che il ...


In [14]:
# Clean the test comments with the same function used for the training text,
# then score them with both models.
dataset_output['comment_text'] = [text_cleaning(raw_comment) for raw_comment in dataset_output['comment_text']]
test_texts = dataset_output['comment_text'].tolist()
preds_out_multi = model_multi(test_texts)
preds_out_xxl = model_xxl(test_texts)

In [15]:
# Build the test-set feature table with the same 'score_<model><k>' columns that
# the meta-classifier was fitted on: one column per model and per class, holding
# the score found at position k of each prediction.
# NOTE(review): classes are addressed by position, which assumes each prediction
# lists the classes in a fixed order (0, 1, 2, ...) -- confirm for these pipelines.
df_merged_out = pd.concat(
    [
        pd.DataFrame({
            f'score_{tag}{k}': [sample[k]['score'] for sample in preds]
            for k in range(len(preds[0]))
        })
        for tag, preds in (('multi', preds_out_multi), ('xxl', preds_out_xxl))
    ],
    axis=1,
)
df_merged_out

Unnamed: 0,score_multi0,score_multi1,score_multi2,score_multi3,score_xxl0,score_xxl1,score_xxl2,score_xxl3
0,5.389122e-09,1.725480e-08,3.623245e-09,1.000000e+00,6.030238e-09,2.167599e-08,1.979858e-08,1.000000e+00
1,1.000000e+00,1.489608e-09,1.179263e-09,1.017001e-09,1.000000e+00,3.004950e-09,2.043390e-09,2.556682e-09
2,4.916961e-09,8.563767e-09,3.763901e-09,1.000000e+00,1.019396e-08,7.034230e-08,1.556351e-08,9.999999e-01
3,8.009945e-09,8.375591e-09,1.000000e+00,5.580591e-09,1.195403e-08,1.988995e-08,1.000000e+00,1.912717e-08
4,1.000000e+00,1.262556e-09,1.099156e-09,1.016951e-09,1.000000e+00,2.332592e-09,2.462994e-09,2.513509e-09
...,...,...,...,...,...,...,...,...
295,7.023127e-09,7.216424e-09,3.905434e-09,1.000000e+00,5.197750e-08,1.468305e-06,1.365504e-07,9.999983e-01
296,1.000000e+00,1.287640e-09,1.108383e-09,1.143103e-09,1.000000e+00,1.018440e-08,1.971673e-08,2.321993e-08
297,3.890668e-09,1.000000e+00,2.314970e-09,2.060104e-09,1.725375e-08,1.000000e+00,1.692733e-08,3.755548e-08
298,5.437018e-09,6.728718e-09,3.654637e-09,1.000000e+00,1.252536e-07,2.123537e-07,1.663137e-07,9.999995e-01


In [16]:
# Predict the final class of every test comment with the logistic-regression
# meta-classifier (the 'rf' suffix of the variable name is historical).
preds_out_rf = logistic.predict(df_merged_out)
# Index the submission by the Ids shipped with the test file rather than by row
# position, so each prediction stays attached to the Id of its comment.
output_df = pd.DataFrame(
    preds_out_rf,
    columns=['Expected'],
    index=pd.Index(dataset_output['Id'], name='Id'),
)
output_df

Unnamed: 0_level_0,Expected
Id,Unnamed: 1_level_1
0,3
1,0
2,3
3,2
4,0
...,...
295,3
296,0
297,1
298,3


In [17]:
# Write the submission file; the 'Id' index is saved as the first column.
output_df.to_csv(path_or_buf='../submissions/last_taskB.csv')