In [1]:
import datetime
from time import time
import matplotlib.pyplot as plt
import nltk
import joblib
import itertools
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import plot_confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn import tree
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate
# Notebook-wide display setting: let pandas print up to 10000 rows of a frame
# before it starts truncating the output.
pd.set_option('display.max_rows', 10000)

In [2]:
#Stemmer
# Single module-level stemmer instance, reused by every analyzer built below.
italian_stemmer = SnowballStemmer('italian')
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that stems each token with the Italian Snowball stemmer."""

    def build_analyzer(self):
        """Return a callable that maps a document to the list of its stemmed tokens.

        The tokens come from the analyzer built by the parent class; each one
        is then passed through ``italian_stemmer.stem``.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            return [italian_stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens

In [3]:
#Date validation
def validate(date_text):
    """Check that ``date_text`` is a real calendar date written exactly as YYYY-MM-DD.

    Args:
        date_text: The date string to check, e.g. ``"2021-02-01"``.

    Returns:
        None. The function only signals failure by raising.

    Raises:
        ValueError: If the text is not a valid date, or is a valid date that
            is not written in the zero-padded ``YYYY-MM-DD`` form.
    """
    try:
        parsed = datetime.datetime.strptime(date_text, '%Y-%m-%d')
    except ValueError as err:
        # Keep the original parsing error attached as the cause for debugging.
        raise ValueError("Incorrect data format, should be YYYY-MM-DD") from err

    # strptime is lenient: it also accepts non-padded fields such as "2021-2-1".
    # The notebook compares these strings lexicographically with the
    # `timestamp` column, so only the exact zero-padded form orders correctly.
    if parsed.date().isoformat() != date_text:
        raise ValueError("Incorrect data format, should be YYYY-MM-DD")

In [4]:
#Function for searching for tweets from a specific period and related to a specific vaccine
def search_Vaxine_Period(df:pd.DataFrame, vaxine=None, start_date = "2021-02-01", end_date = "2021-05-31"):
    """Select the tweets of ``df`` that fall in a period and, optionally, mention a vaccine.

    Args:
        df: Frame with at least a ``timestamp`` column and, when ``vaxine`` is
            given, a ``tweet`` column.
        vaxine: Pattern to look for anywhere in the tweet text, matched
            case-insensitively. It is interpreted as a regular expression, as
            in the original implementation. ``None`` disables the text filter.
        start_date: Lower bound of the period (``YYYY-MM-DD``), compared with ``>``.
        end_date: Upper bound of the period (``YYYY-MM-DD``), compared with ``<``.

    Returns:
        The rows of ``df`` that satisfy the filters.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is rejected by ``validate``.
    """
    validate(start_date)
    validate(end_date)

    # Both branches share the same (strict) period filter.
    in_period = (df['timestamp'] > start_date) & (df['timestamp'] < end_date)
    if vaxine is None:
        return df[in_period]

    # `str.match` only matches at the beginning of the text, which would keep
    # just the tweets that *start* with the vaccine name; `str.contains`
    # searches the whole tweet. `case=False` replaces lower-casing the pattern
    # (which never affected the tweet text and could corrupt regex escapes),
    # and `na=False` treats missing tweets as non-matching.
    mentions_vaxine = df["tweet"].str.contains(vaxine, case=False, na=False)
    return df[in_period & mentions_vaxine]

In [5]:
#Function for opening the file that contains tweets and deleting the tweets of the most important italian newspapers and public channels
def OpenDataset(filename, accounts_file='AccountToDelete.txt'):
    """Load the tweet CSV and apply a short cleaning pass.

    The cleaning pass drops rows with missing values, removes tweets whose
    text duplicates an earlier row, and removes every tweet written by an
    account listed in ``accounts_file``.

    Args:
        filename: Path of a comma-separated file containing at least the
            columns ``timestamp``, ``username`` and ``tweet``.
        accounts_file: Path of a text file with one username per line; tweets
            from these accounts are discarded. Defaults to the file name the
            notebook has always used.

    Returns:
        A new DataFrame with the three columns above and the cleaned rows.
    """
    #Load data and a short phase of preprocess
    tweets = pd.read_csv(filename, sep=',', usecols=['timestamp','username','tweet'])

    # Drop incomplete rows first so that a row with a missing field can never
    # be the "first occurrence" that shadows a complete duplicate.
    tweets = tweets.dropna()
    # drop_duplicates returns a new frame; the result must be assigned,
    # otherwise the duplicates are silently kept.
    tweets = tweets.drop_duplicates(subset=['tweet'])

    with open(accounts_file) as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines in the list are ignored.
        excluded_accounts = {name for name in stripped_lines if name}

    from_excluded = tweets['username'].isin(excluded_accounts)
    return tweets.drop(index=tweets.index[from_excluded])

In [6]:
def doClassification(test_set:pd.DataFrame,clf,vaxine=None, start_date = "2021-02-01", end_date = "2021-05-31"):
    """Classify the selected tweets, save them to CSV, and report the class distribution.

    The tweets are selected with ``search_Vaxine_Period`` and labelled with
    ``clf.predict``. The labelled rows are written to
    ``tweetFiltered_labeled.csv``, the per-class counts are printed, and a
    pie chart of the counts is shown.

    Args:
        test_set: Frame of tweets passed to ``search_Vaxine_Period``.
        clf: Object exposing ``predict``; it is called on the ``tweet`` column.
        vaxine: Optional vaccine filter forwarded to ``search_Vaxine_Period``.
        start_date: Start of the period, forwarded to ``search_Vaxine_Period``.
        end_date: End of the period, forwarded to ``search_Vaxine_Period``.
    """
    # Work on a private copy so the caller's frame never gets a `label` column.
    selected = search_Vaxine_Period(test_set, vaxine, start_date, end_date).copy()

    #Adopting the text classifier on a limited set of data
    selected["label"] = clf.predict(selected.tweet)
    selected = selected.sort_values(by=['label'], ascending=False)
    selected.to_csv("tweetFiltered_labeled.csv")
    print("File tweetFiltered_labeled (that contains the predicted class for each label) has been created \n")

    #Count label
    # Rows whose label is the literal string "label" are discarded before the
    # integer cast (the CSV above is written before this clean-up).
    stray_rows = selected[selected['label'] == "label"].index
    selected = selected.drop(stray_rows)
    selected["label"] = selected["label"].astype(int)

    # One count per class id, in the order 0, 1, 2.
    counts = [int((selected["label"] == class_id).sum()) for class_id in (0, 1, 2)]

    print("Classification result:")
    print("dataset len: " + str(len(selected)))
    for class_id, count in enumerate(counts):
        print(f"class {class_id} len: {count}")

    #Pie-chart
    # The display names are positional: index i names class id i.
    classes = ["Negative", "Positive", "Neutro"]
    plt.pie(counts, labels=classes, radius=1.5, labeldistance=1.4)
    plt.show()