In [None]:
import csv
import matplotlib.pyplot as plt
import nltk
import pandas
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import plot_confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn import tree
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate
# import nltk
# nltk.download('stopwords')

pandas.set_option('display.max_rows', 1000)

# loading the dataset
dataset = pandas.read_csv("newDataset.csv",sep=',',usecols=['tweet','label'])
dataset.tweet=dataset.tweet.str.lower()
#print(type(dataset))
print(f"Number of tweets: {len(dataset)}")
display(dataset)

In [None]:
italian_stemmer = SnowballStemmer('italian')
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([italian_stemmer.stem(w) for w in analyzer(doc)])

In [None]:
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average = 'weighted', zero_division = 0),
           'recall' : make_scorer(recall_score, average = 'macro'), 
           'f1_score' : make_scorer(f1_score, average = 'macro')
          }

def print_metrics(scores):
    print(f'accuracy on test set: {scores["test_accuracy"].mean():.3} +/- {scores["test_accuracy"].std()*2:.3}')
    print(f'precision on test set: {scores["test_precision"].mean():.3} +/- {scores["test_precision"].std()*2:.3}')
    print(f'recall on test set: {scores["test_recall"].mean():.3} +/- {scores["test_recall"].std()*2:.3}')
    print(f'f1-score on test set: {scores["test_f1_score"].mean():.3} +/- {scores["test_f1_score"].std()*2:.3}')

In [None]:
max_ngram = 2

folds = 10
print('NAIVE BAYES CLASSIFIER')
for i in range(max_ngram):
    for j in range(max_ngram):
        if j>= i :
            text_clf = Pipeline([
                ('vect', StemmedCountVectorizer(min_df=3, analyzer="word", stop_words = set(stopwords.words('italian')),ngram_range = (i+1,j+1))),
                ('tfidf', TfidfTransformer(smooth_idf=True,use_idf=True)),
                ('clf', MultinomialNB()),
            ])
            scores = cross_validate(text_clf, dataset.tweet, dataset.label, cv=folds, scoring = scoring)
            print(f"------- ngram = {(i+1,j+1)} -------")
            print_metrics(scores)

In [None]:
print('SVM CLASSIFIER')
for i in range(max_ngram):
    for j in range(max_ngram):
        if j>= i :
            text_clf = Pipeline([
                ('vect', StemmedCountVectorizer(min_df=1, analyzer="word", stop_words = set(stopwords.words('italian')),ngram_range = (i+1,j+1))),
                ('tfidf', TfidfTransformer(smooth_idf=True,use_idf=True)),
                ('clf', svm.LinearSVC()),
            ])
            scores = cross_validate(text_clf, dataset.tweet, dataset.label, cv=folds, scoring=scoring)
            print(f"------- ngram = {(i+1,j+1)} -------")
            print_metrics(scores)

In [83]:
k_neighbor = 5
print('KNN CLASSIFIER')
for i in range(max_ngram):
    for j in range(max_ngram):
        if j>= i :
            text_clf = Pipeline([
                ('vect', StemmedCountVectorizer(min_df=1, analyzer="word", stop_words = set(stopwords.words('italian')),ngram_range = (i+1,j+1))),
                ('tfidf', TfidfTransformer(smooth_idf=True,use_idf=True)),
                ('clf', KNeighborsClassifier(k_neighbor)),
            ])
            scores = cross_validate(text_clf, dataset.tweet, dataset.label, cv=folds, scoring=scoring)
            print(f"------- ngram = {(i+1,j+1)} -------")
            print_metrics(scores)

KNN CLASSIFIER
------- ngram = (1, 1) -------
accuracy on test set: 0.508 +/- 0.102
precision on test set: 0.51 +/- 0.105
recall on test set: 0.51 +/- 0.0994
f1-score on test set: 0.506 +/- 0.102
------- ngram = (1, 2) -------
accuracy on test set: 0.503 +/- 0.0712
precision on test set: 0.511 +/- 0.0674
recall on test set: 0.506 +/- 0.073
f1-score on test set: 0.501 +/- 0.0693
------- ngram = (2, 2) -------
accuracy on test set: 0.457 +/- 0.104
precision on test set: 0.459 +/- 0.102
recall on test set: 0.455 +/- 0.102
f1-score on test set: 0.452 +/- 0.102


In [84]:
print('ADABOOST CLASSIFIER')
for i in range(max_ngram):
    for j in range(max_ngram):
        if j>= i :
            text_clf = Pipeline([
                ('vect', StemmedCountVectorizer(min_df=1, analyzer="word", stop_words = set(stopwords.words('italian')),ngram_range = (i+1,j+1))),
                ('tfidf', TfidfTransformer(smooth_idf=True,use_idf=True)),
                ('clf', AdaBoostClassifier()),
            ])
            scores = cross_validate(text_clf, dataset.tweet, dataset.label, cv=folds, scoring=scoring)
            print(f"------- ngram = {(i+1,j+1)} -------")
            print_metrics(scores)

ADABOOST CLASSIFIER
------- ngram = (1, 1) -------
accuracy on test set: 0.438 +/- 0.0947
precision on test set: 0.445 +/- 0.0998
recall on test set: 0.424 +/- 0.0926
f1-score on test set: 0.415 +/- 0.0913
------- ngram = (1, 2) -------
accuracy on test set: 0.413 +/- 0.114
precision on test set: 0.418 +/- 0.126
recall on test set: 0.404 +/- 0.106
f1-score on test set: 0.395 +/- 0.112
------- ngram = (2, 2) -------
accuracy on test set: 0.421 +/- 0.128
precision on test set: 0.442 +/- 0.221
recall on test set: 0.402 +/- 0.119
f1-score on test set: 0.357 +/- 0.141


In [85]:
print('RANDOM FOREST CLASSIFIER')
for i in range(max_ngram):
    for j in range(max_ngram):
        if j>= i :
            text_clf = Pipeline([
                ('vect', StemmedCountVectorizer(min_df=1, analyzer="word", stop_words = set(stopwords.words('italian')),ngram_range = (i+1,j+1))),
                ('tfidf', TfidfTransformer(smooth_idf=True,use_idf=True)),
                ('clf', RandomForestClassifier()),
            ])
            scores = cross_validate(text_clf, dataset.tweet, dataset.label, cv=folds, scoring=scoring)
            print(f"------- ngram = {(i+1,j+1)} -------")
            print_metrics(scores)

RANDOM FOREST CLASSIFIER
------- ngram = (1, 1) -------
accuracy on test set: 0.501 +/- 0.101
precision on test set: 0.51 +/- 0.108
recall on test set: 0.485 +/- 0.11
f1-score on test set: 0.482 +/- 0.117
------- ngram = (1, 2) -------
accuracy on test set: 0.487 +/- 0.0893
precision on test set: 0.515 +/- 0.0943
recall on test set: 0.467 +/- 0.0888
f1-score on test set: 0.454 +/- 0.0962
------- ngram = (2, 2) -------
accuracy on test set: 0.438 +/- 0.0869
precision on test set: 0.515 +/- 0.134
recall on test set: 0.408 +/- 0.079
f1-score on test set: 0.376 +/- 0.0963


In [None]:
# code for confusion matrix
from sklearn.model_selection import cross_val_predict
text_clf = Pipeline([
    ('vect', StemmedCountVectorizer(min_df=2, analyzer="word", stop_words = set(stopwords.words('italian')),ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(smooth_idf=True,use_idf=True)),
    ('clf', MultinomialNB()),
])
y_pred = cross_val_predict(text_clf, dataset.tweet, dataset.label, cv=folds)
#conf_mat = confusion_matrix(dataset.label, y_pred, normalize="true")
conf_mat = confusion_matrix(dataset.label, y_pred)
print(conf_mat)

In [79]:
# code for custom stop words
stop_words = [x.strip() for x in open('stopWords.txt','r').read().split('\n')]

folds = 10
print('NAIVE BAYES CLASSIFIER - CUSTOM STOP WORDS')
for i in range(max_ngram):
    for j in range(max_ngram):
        if j>= i :
            text_clf = Pipeline([
                ('vect', StemmedCountVectorizer(min_df=2, analyzer="word", strip_accents = "unicode", stop_words = stop_words, ngram_range = (i+1,j+1))),
                ('tfidf', TfidfTransformer(smooth_idf=True,use_idf=True)),
                ('clf', MultinomialNB()),
            ])
            scores = cross_validate(text_clf, dataset.tweet, dataset.label, cv=folds, scoring = scoring)
            print(f"------- ngram = {(i+1,j+1)} -------")
            print_metrics(scores)

NAIVE BAYES CLASSIFIER - CUSTOM STOP WORDS
------- ngram = (1, 1) -------
accuracy on test set: 0.521 +/- 0.112
precision on test set: 0.56 +/- 0.107
recall on test set: 0.497 +/- 0.116
f1-score on test set: 0.493 +/- 0.126
------- ngram = (1, 2) -------
accuracy on test set: 0.527 +/- 0.122
precision on test set: 0.575 +/- 0.125
recall on test set: 0.501 +/- 0.124
f1-score on test set: 0.494 +/- 0.139
------- ngram = (2, 2) -------
accuracy on test set: 0.47 +/- 0.0927
precision on test set: 0.479 +/- 0.112
recall on test set: 0.449 +/- 0.09
f1-score on test set: 0.443 +/- 0.0947
