# SVM baseline - all datasets


In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Tommaso Caselli
#

In [None]:
# Colab only: mount Google Drive into the runtime's filesystem at /content/drive
# so that files stored on Drive can be read and written with ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Folder that holds the train/test split JSON files used by this notebook.
# The original Google Drive location stays the default; set the
# SVM_WORKING_DIR environment variable to run against a different folder
# without editing the notebook.
working_dir = os.environ.get(
    'SVM_WORKING_DIR',
    '/content/drive/MyDrive/Teaching/22_23/Jacopo/SVM',
)

### Get GPU

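(If no GPU is found, enable one via Edit > Notebook Settings > Hardware Accelerator: GPU.)

Identify the GPU and specify it as the PyTorch device. Note that the scikit-learn SVM trained below runs on the CPU, so a GPU is optional for this notebook.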

In [None]:
import torch

# Select the PyTorch device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel. emoji is pinned to the version this notebook was last
# run with (see the recorded install log).
%pip install scikit-learn emoji==2.7.0

Collecting emoji
  Downloading emoji-2.7.0.tar.gz (361 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m361.8/361.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (pyproject.toml) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.7.0-py2.py3-none-any.whl size=356563 sha256=a96b424ce416653de2c19ed3306307bcc4ed93042e35bff82eded47e54537dd2
  Stored in directory: /root/.cache/pip/wheels/41/11/48/5df0b9727d5669c9174a141134f10304d1d78a3b89a4676f3d
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.7.0


In [None]:
import numpy as np
import sys, re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV
import nltk
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import emoji
import csv
import json

nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Split Data

In [None]:
"""
This cell generates the train and test splits for the purpose of the veracity classification experiments of the claims.
The train and test splits are not really valid splits for CheckIT.
For replicability purposes, we leave the code and make available the data.
"""

# NOTE: the code below is disabled (commented out). It is kept only as a record
# of how the claims outside the 10% dev/test file were collected into
# `training2clean` before the random train/test split was drawn.

#with open(f'{working_dir}/join_d1_original.json', 'r') as f:
#  all_data = json.load(f)
#
#with open(f'{working_dir}/dev_test_10percent.json', 'r') as f1:
#  dev_data = json.load(f1)

#
#all_data_dict = {}
#ten_percent_data = {}

#for element in all_data:
#  all_data_dict[element['id']] = (element['statement'], element['politician'], element['verdict'])
#
#for element in dev_data:
#  ten_percent_data[element['id']] = (element['statement'], element['politician'], element['verdict'])
#
#training2clean = {}
#
#for k, v in all_data_dict.items():
#  if k not in  ten_percent_data:
#    training2clean[k] = v


In [None]:
# --- Historical split generation (disabled; kept for replicability) ----------
# This is the code that originally drew the random train/test split and wrote
# it to disk. It stays commented out because it needs `training2clean` and
# `ten_percent_data`, which no active cell defines, and because re-running it
# would draw a different (unseeded) sample and overwrite the released split.
# If it is ever re-enabled: random.sample() requires a sequence (sampling from
# a dict view is deprecated since Python 3.9 and raises TypeError from 3.11),
# hence the list(...) wrapper below.
#
#import random
#
#training_clean = dict(random.sample(list(training2clean.items()), 2861))
#
#test_data = {}
#
#for k, v in training2clean.items():
#  if k not in training_clean:
#    test_data[k] = v
#
#test_data.update(ten_percent_data)
#
#with open(f'{working_dir}/training_data.json', 'w', encoding='utf-8') as f:
#  json.dump(training_clean, f, ensure_ascii=False, indent=3)
#
#with open(f'{working_dir}/test_data.json', 'w', encoding='utf-8') as f1:
#  json.dump(test_data, f1, ensure_ascii=False, indent=3)
# ------------------------------------------------------------------------------

# Load the released split instead of regenerating it, so the notebook runs from
# a fresh kernel and always uses the same train/test claims.
# Assumes training_data.json and test_data.json (written by the original run of
# the code above) are present in `working_dir`.
with open(f'{working_dir}/training_data.json', 'r', encoding='utf-8') as f:
  training_clean = json.load(f)

with open(f'{working_dir}/test_data.json', 'r', encoding='utf-8') as f1:
  test_data = json.load(f1)

# Each record is [statement, politician, verdict]; the tuples written by the
# generation code come back from JSON as lists, which index the same way.
train_X = [record[0] for record in training_clean.values()]
train_labels = [record[2] for record in training_clean.values()]
test_X = [record[0] for record in test_data.values()]
test_labels = [record[2] for record in test_data.values()]

# Verdict strings -> integer class ids used by the classifiers and reports.
label_mapping = {'Vero': 0, 'Nì': 1, 'Falso': 2}

train_labels_mapped = [label_mapping[label] for label in train_labels]
test_labels_mapped = [label_mapping[label] for label in test_labels]


since Python 3.9 and will be removed in a subsequent version.
  training_clean = dict(random.sample(training2clean.items(), 2861))


## Training

In [None]:
# Train model (claim text only)

# TF-IDF weighted word uni- and bigrams, with Italian stop words removed.
tfidf_word = TfidfVectorizer(stop_words=stopwords.words('italian'), ngram_range=(1, 2))
# TF-IDF weighted character n-grams of length 2 to 5.
tfidf_char = TfidfVectorizer(ngram_range=(2, 5), analyzer='char')

# Concatenate the word-level and character-level feature spaces.
text_features = FeatureUnion(
    transformer_list=[
        ('word', tfidf_word),
        ('char', tfidf_char),
    ]
)

# Linear-kernel SVM on top of the combined features.
svm_classifier = SVC(C=1.0, kernel='linear')

pipeline_svm = Pipeline(
    steps=[
        ("features", text_features),
        ("svm", svm_classifier),
    ]
)
pipeline_svm.fit(train_X, train_labels_mapped)

In [None]:
# Predict on the test claims with the claim-only pipeline and report
# per-class precision / recall / F1.
print('Predicting on test...')
Yguess = pipeline_svm.predict(test_X)

print(
    classification_report(
        y_true=test_labels_mapped,
        y_pred=Yguess,
        digits=4,
    )
)
# Optional export of the report (disabled; `task` is not defined in this notebook):
#report_dict = classification_report(test_labels, Yguess, output_dict=True)
#df_results = pd.DataFrame.from_dict(report_dict)
#df_results.to_csv(f'{working_dir}/{task}/results.csv', index = False) # change file name for cross-domain

Predicting on test...
              precision    recall  f1-score   support

           0     0.4587    0.3922    0.4228       255
           1     0.4572    0.5738    0.5089       298
           2     0.3871    0.2945    0.3345       163

    accuracy                         0.4455       716
   macro avg     0.4343    0.4202    0.4221       716
weighted avg     0.4418    0.4455    0.4386       716



In [None]:
# Train w. politicians' name
train_X_politicians = []
test_X_politicians = []

for k, v in training_clean.items():
  train_X_politicians.append(v[0] + " " + v[1])


for k, v in test_data.items():
  test_X_politicians.append(v[0] + " " + v[1])

print(test_X_politicians)


['«Quelli che vennero uccisi [nell’attentato di via Rasella n.d.r.] non erano biechi nazisti delle SS ma una banda di semi-pensionati, una banda musicale». Giorgia Meloni', '«Abbiamo 98 milioni di prestazioni sanitarie arretrate» Carlo Calenda', '«Il lago di Garda è pieno di acqua solo al 25 per cento, il lago di Como solo al 17 per cento» Angelo Bonelli', '«Noi spendiamo 30 miliardi per gli incentivi sul fossile, i cosiddetti “ambientalmente dannosi”» Giuseppe Conte', '«Il canone in bolletta fu una mia scelta. Costava 113 euro. Averlo fatto pagare a tutti ha portato a un abbassamento del costo da 113 a 90 euro» Matteo Renzi', '«I nostri due candidati in Lombardia e Lazio ottengono più voti delle scorse regionali. Le nostre liste, oltre il 20 per cento, prendono più delle politiche» Enrico Letta', '«L’Italia è un Paese nel quale un contratto su dieci dura un giorno» Nicola Fratoianni', '«Gli obiettivi sul clima sono stati dati perché il 2050 suonava bene» Carlo Calenda', '«In 16 anni d

In [None]:
# Train model (claim text + politician name)

svm_classifier2 = SVC(kernel='linear', C=1.0)

# Use fresh vectorizer instances with the same settings as the first pipeline.
# Reusing `tfidf_word` / `tfidf_char` would refit the very objects that
# `pipeline_svm` holds (scikit-learn fits them in place), so re-running the
# first pipeline's prediction cell afterwards would silently use the wrong
# vocabulary.
tfidf_word2 = TfidfVectorizer(ngram_range=(1, 2), stop_words=stopwords.words('italian'))
tfidf_char2 = TfidfVectorizer(analyzer='char', ngram_range=(2, 5))

text_features2 = FeatureUnion([('word', tfidf_word2),
                               ('char', tfidf_char2),
                               ])

pipeline_svm2 = Pipeline([("features", text_features2), ("svm", svm_classifier2)])
pipeline_svm2.fit(train_X_politicians, train_labels_mapped)

In [None]:
# Predict on the test claims with the claim + politician-name pipeline and
# report per-class precision / recall / F1.
print('Predicting on test...')
Yguess2 = pipeline_svm2.predict(test_X_politicians)

print(
    classification_report(
        y_true=test_labels_mapped,
        y_pred=Yguess2,
        digits=4,
    )
)
# Optional export of the report (disabled; `task` is not defined in this notebook):
#report_dict = classification_report(test_labels, Yguess2, output_dict=True)
#df_results = pd.DataFrame.from_dict(report_dict)
#df_results.to_csv(f'{working_dir}/{task}/results.csv', index = False) # change file name for cross-domain

Predicting on test...
              precision    recall  f1-score   support

           0     0.4565    0.4118    0.4330       255
           1     0.4496    0.5537    0.4962       298
           2     0.4118    0.3006    0.3475       163

    accuracy                         0.4455       716
   macro avg     0.4393    0.4220    0.4256       716
weighted avg     0.4434    0.4455    0.4399       716



In [None]:
from sklearn.dummy import DummyClassifier

# Majority-class baseline: always predicts the most frequent training label.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(train_X_politicians, train_labels_mapped)
Yguess_dummy = dummy_clf.predict(test_X_politicians)

# The baseline never predicts the minority classes, so their precision is
# undefined. zero_division=0 reports it as 0 (the same numbers as the default)
# without emitting an UndefinedMetricWarning for every such class.
print(classification_report(test_labels_mapped, Yguess_dummy, digits=4, zero_division=0))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       255
           1     0.4162    1.0000    0.5878       298
           2     0.0000    0.0000    0.0000       163

    accuracy                         0.4162       716
   macro avg     0.1387    0.3333    0.1959       716
weighted avg     0.1732    0.4162    0.2446       716



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
## OVERLAP POLITICIANS

# Distinct politicians attached to the training claims
# (each record unpacks as claim, politician, label).
train_politician = {
    politician for _claim, politician, _label in training_clean.values()
}

# Distinct politicians of the test claims who also occur in the training set.
overlap_set = {
    politician_test
    for _claim_test, politician_test, _label_test in test_data.values()
    if politician_test in train_politician
}

print(len(overlap_set))


121
