In [3]:
!pip install datasets
!pip install clean-text
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 k

In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict
import re
from cleantext import clean
import torch
from sklearn.metrics import f1_score
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report



In [5]:
# Read the training CSV, discard the 'Id' column and rename the remaining
# columns to the names used throughout the rest of the notebook.
dataset = (
    pd.read_csv("../datasets/subtaskA_train_aug.csv")
    .drop(columns='Id')
    .rename(columns={'comment_text': 'text', 'conspiratorial': 'label'})
)
dataset

Unnamed: 0,text,label
0,se non ci fossero soldati non ci sarebbero gue...,0
1,shedding of infectious sars-cov-2 despite vac...,1
2,paura e delirio alla cnn: il ministero della v...,1
3,l'aspirina non aumenta la sopravvivenza dei pa...,0
4,l'italia non puo' dare armi lo vieta la costit...,0
...,...,...
3679,Abraccia la terra spesso 2000 km oooh mi ricor...,1
3680,Tuttavia le recensioni erano negative anche pr...,0
3681,"Nel frattempo, in Kazakistan, le persone che h...",1
3682,una profezia ad una conferenza nel 2015: il pr...,1


In [6]:
# We define the function for the cleaning of the text

def text_cleaning(text):
    """Normalise a raw comment before it is passed to the classifiers.

    The value is coerced to ``str``, lower-cased and run through
    ``cleantext.clean`` (with ``no_emoji=True``); a fixed sequence of regex
    substitutions is then applied.

    Structured tokens (bracketed spans, dates, URLs, e-mail addresses and the
    ``<a href`` / ``<br />`` HTML fragments) are handled *before* the generic
    symbol stripping. With the previous ordering the symbol stripping had
    already replaced ``@`` and ``/`` with spaces, so the e-mail and ``<br />``
    patterns could never match.

    NOTE(review): if the fine-tuned models were trained on text produced by
    the previous ordering, confirm that this does not shift their predictions.

    Parameters
    ----------
    text : Any
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    # Convert words to lower case
    text = str(text).lower()
    text = clean(text, no_emoji=True)

    # --- structured tokens: must run while '@', '/' and '[' are still present ---
    # Anything enclosed in square brackets.
    text = re.sub(r'(\[.*?\])', '', text)
    # Dates such as 01/02/2020 or 01/02/20.
    text = re.sub(r'[0-9]{2}\/[0-9]{2}\/[0-9]{2,4}', ' ', text)
    # URLs: everything from the scheme up to the end of the line.
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    # E-mail addresses (raw string: '\.' is a regex escape, not a Python one).
    text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '', text)
    # HTML leftovers.
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'<br />', ' ', text)

    # --- generic symbol stripping ---
    text = re.sub(r'[_"\%()|+&=*%#$@\[\]/]', ' ', text)

    # --- collapse repeated punctuation, drop newlines, squeeze spaces ---
    text = re.sub(r'\.+', '.', text)
    text = re.sub(r'\,+', ',', text)
    text = re.sub(r'\!+', '!', text)
    text = re.sub(r'\?+', '?', text)
    text = re.sub(r'\n+', '', text)
    text = re.sub(r' +', ' ', text)

    return text

In [7]:
# Run every training text through text_cleaning and store the result back in the 'text' column.
dataset['text'] = [text_cleaning(raw_text) for raw_text in dataset['text']]
dataset

Unnamed: 0,text,label
0,se non ci fossero soldati non ci sarebbero gue...,0
1,shedding of infectious sars-cov-2 despite vacc...,1
2,paura e delirio alla cnn: il ministero della v...,1
3,l'aspirina non aumenta la sopravvivenza dei pa...,0
4,l'italia non puo' dare armi lo vieta la costit...,0
...,...,...
3679,abraccia la terra spesso 2000 km oooh mi ricor...,1
3680,tuttavia le recensioni erano negative anche pr...,0
3681,"nel frattempo, in kazakistan, le persone che h...",1
3682,una profezia ad una conferenza nel 2015: il pr...,1


# Model 
Here we download the best models that we previously trained on the dataset. We then combine them into an ensemble to improve the predictions.

The idea is to fit a meta-classifier (the code below uses a logistic regression) on the class probabilities that each model predicts for every text in the dataset.

In [8]:
# Hugging Face Hub identifiers of the two fine-tuned checkpoints.
model_name_multi = "Mike00vito/best-multi-singleCLS"  # multilingual BERT
model_name_xxl = "Mike00vito/best-xxl-singleCLS"  # BERT for Italian

In [9]:
# We load the models from the Hugging Face Hub.
# Use the first GPU when CUDA is available and fall back to the CPU (-1) otherwise,
# so the notebook does not fail on machines without a GPU.
pipeline_device = 0 if torch.cuda.is_available() else -1

# Settings shared by both pipelines: inputs are truncated to at most 512 tokens and
# the scores of all classes are returned (later cells index them by position).
pipeline_kwargs = dict(max_length=512, truncation=True, device=pipeline_device, return_all_scores=True)

model_multi = pipeline(model=model_name_multi, tokenizer=model_name_multi, **pipeline_kwargs)
model_xxl = pipeline(model=model_name_xxl, tokenizer=model_name_xxl, **pipeline_kwargs)

Downloading (…)lve/main/config.json:   0%|          | 0.00/897 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/670M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



Downloading (…)lve/main/config.json:   0%|          | 0.00/710 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/732k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [10]:
# Score every text of the (cleaned) dataset with both models.
train_texts = dataset['text'].tolist()
preds_multi = model_multi(train_texts)
preds_xxl = model_xxl(train_texts)

In [11]:
# Each prediction is a list with one {'label', 'score'} entry per class.
# Keep only the score found at position 0 / 1, as a single-column frame per model and position.
df_preds_multi0 = pd.DataFrame({'score_multi0': [scores[0]['score'] for scores in preds_multi]})
df_preds_multi1 = pd.DataFrame({'score_multi1': [scores[1]['score'] for scores in preds_multi]})
df_preds_xxl0 = pd.DataFrame({'score_xxl0': [scores[0]['score'] for scores in preds_xxl]})
df_preds_xxl1 = pd.DataFrame({'score_xxl1': [scores[1]['score'] for scores in preds_xxl]})

# Put the four score columns side by side: one row per text.
df_merged = pd.concat(
    [df_preds_multi0, df_preds_multi1, df_preds_xxl0, df_preds_xxl1],
    axis=1,
)
df_merged

Unnamed: 0,score_multi0,score_multi1,score_xxl0,score_xxl1
0,0.772824,0.227176,0.999995,5.130631e-06
1,0.086922,0.913078,0.000003,9.999969e-01
2,0.002483,0.997517,0.000003,9.999969e-01
3,0.997233,0.002767,0.999999,6.496987e-07
4,0.945351,0.054649,1.000000,3.307125e-07
...,...,...,...,...
3679,0.002677,0.997323,0.000004,9.999962e-01
3680,0.985651,0.014349,0.999985,1.510307e-05
3681,0.003288,0.996712,0.000010,9.999903e-01
3682,0.002818,0.997182,0.000003,9.999969e-01


In [12]:
# Hyper-parameter grid for the logistic regression: every combination of the values below.
param_grid_lr = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    max_iter=[20, 50, 100, 200, 500, 1000],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    class_weight=['balanced'],
)

In [14]:
from sklearn.linear_model import LogisticRegression

# Fixed seed so that the split and the seed-dependent solvers ('liblinear', 'sag', 'saga')
# give the same result on every run.
SEED = 42

# Hold out 30% of the stacked model scores; the other 70% feed the grid search
# for the logistic-regression meta-classifier.
preds_train, preds_test, label_train, label_test = train_test_split(
    df_merged, dataset['label'], train_size=0.7, shuffle=True, random_state=SEED
)

# 10-fold cross-validated grid search over param_grid_lr, using all available cores.
logModel_grid = GridSearchCV(
    estimator=LogisticRegression(random_state=SEED),
    param_grid=param_grid_lr,
    verbose=1,
    cv=10,
    n_jobs=-1,
)
logModel_grid.fit(preds_train, label_train)

# Evaluate the selected configuration on the held-out split
# (previously the split was created but never looked at).
print(classification_report(label_test, logModel_grid.predict(preds_test)))

# Final meta-classifier: the best hyper-parameters found above, refit on ALL the scores.
logistic = LogisticRegression(**logModel_grid.best_params_, random_state=SEED)
logistic.fit(df_merged, dataset['label'])

Fitting 10 folds for each of 210 candidates, totalling 2100 fits


# Predict new texts

Now we read the dataset of texts that we want to classify. We take the prediction probabilities of each model and finally use the fitted meta-classifier (the logistic regression trained above) to assign a class to each text.

In [15]:
# Read the texts to be classified (columns 'Id' and 'comment_text').
test_csv_path = "../datasets/subtaskA_test.csv"
dataset_output = pd.read_csv(test_csv_path)
dataset_output

Unnamed: 0,Id,comment_text
0,0,La CNN ha appena ammesso che Trump non è più a...
1,1,"Notizia del 2017: “Autovelox, la foto viola la..."
2,2,"La foto di due ""sospetti"" in un palazzo non è ..."
3,3,La falsa notizia della bambina con il cartello...
4,4,Ah dimenticavo.. e' gratuita
...,...,...
455,455,Cina e Arabia Saudita intensificano la coopera...
456,456,La Polizia e l'avvocato Holzeisen diffidano l'...
457,457,W la libertà contro tutte le dittature
458,458,nell'unità militare di Okhtyrka sono ancora in...


In [16]:
# Clean the test texts with text_cleaning, then score them with both models.
dataset_output['comment_text'] = [text_cleaning(raw_text) for raw_text in dataset_output['comment_text']]
test_texts = dataset_output['comment_text'].tolist()
preds_out_multi = model_multi(test_texts)
preds_out_xxl = model_xxl(test_texts)

In [17]:
# Each prediction is a list with one {'label', 'score'} entry per class.
# Keep only the score found at position 0 / 1, as a single-column frame per model and position.
df_preds_out_multi0 = pd.DataFrame({'score_multi0': [scores[0]['score'] for scores in preds_out_multi]})
df_preds_out_xxl0 = pd.DataFrame({'score_xxl0': [scores[0]['score'] for scores in preds_out_xxl]})
df_preds_out_multi1 = pd.DataFrame({'score_multi1': [scores[1]['score'] for scores in preds_out_multi]})
df_preds_out_xxl1 = pd.DataFrame({'score_xxl1': [scores[1]['score'] for scores in preds_out_xxl]})

# Same column order as the table the meta-classifier was fitted on.
df_merged_out = pd.concat(
    [df_preds_out_multi0, df_preds_out_multi1, df_preds_out_xxl0, df_preds_out_xxl1],
    axis=1,
)
df_merged_out

Unnamed: 0,score_multi0,score_multi1,score_xxl0,score_xxl1
0,0.003127,0.996873,0.000003,9.999967e-01
1,0.997302,0.002698,1.000000,4.126600e-07
2,0.860076,0.139924,0.999991,8.941070e-06
3,0.994797,0.005203,0.999980,2.035973e-05
4,0.986394,0.013606,0.999987,1.304109e-05
...,...,...,...,...
455,0.997013,0.002987,1.000000,3.675356e-07
456,0.002652,0.997348,0.000003,9.999965e-01
457,0.006594,0.993406,0.000005,9.999951e-01
458,0.017883,0.982117,0.999613,3.869881e-04


In [18]:
# Assign the final class to every test text with the fitted meta-classifier.
preds_out_rf = logistic.predict(df_merged_out)

# Key the submission by the Ids read from the test file instead of by row position,
# so the output stays correct even if the Ids are not exactly 0..n-1.
output_df = pd.DataFrame(
    {'Expected': preds_out_rf},
    index=pd.Index(dataset_output['Id'], name='Id'),
)
output_df

Unnamed: 0_level_0,Expected
Id,Unnamed: 1_level_1
0,1
1,0
2,0
3,0
4,0
...,...
455,0
456,1
457,1
458,0


In [19]:
# Write the predictions to the submission file; the 'Id' index is written as the first column.
submission_path = '../submissions/last_aug.csv'
output_df.to_csv(submission_path)