In [6]:
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 23 19:46:33 2020

@author: Marco Cavalli X81000445
"""

"""LIBRERIE USATE"""

import pandas as pd
import re
from IPython.display import display, HTML
import textacy.preprocessing as txt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from stop_words import get_stop_words
pd.set_option('display.max_rows', 100)
pd.set_option('min_rows', 100)
pd.set_option('display.max_columns', 12)
pd.set_option('display.precision', 3)
pd.set_option('expand_frame_repr', False)

STOP_WORDS = get_stop_words('italian')
STOP_WORDS_H = 1.0
STOP_WORDS_L = 0.0
N_GRAMS = (1,1)

#Applica operazioni di preprocessing sul dataset
def clean(dataset):
    tweet_text = dataset['text'].values
    clean_text = cleanTweet(tweet_text)
    clean_df = pd.DataFrame(clean_text, columns=['x'])
    clean_df['label'] = dataset['handle'].values
    
    return clean_df

#Divide il dataset in TrS, TeS ed eventualmente VaS
def splitDataset(dataset,percent=0.25,knn=False,knn_percent=0.33):

    train_set, test_set, y_train_set, y_test_set  = train_test_split(dataset['x'].tolist(),dataset['label'].tolist(), test_size=percent)

    
    if knn:
        train_set, vali_set, y_train_set, y_vali_set  = train_test_split(train_set,y_train_set, test_size=knn_percent)
            
    
    if knn:
        return train_set, y_train_set, test_set, y_test_set, vali_set, y_vali_set
    else:
        return train_set, y_train_set, test_set, y_test_set

#Regressore Logistico
def LRegr(dataset, num_label, features=5000, tdf=True):
    
    train_set, y_train_set, test_set, y_test_set = splitDataset(dataset)
    
    count_vect = CountVectorizer(ngram_range=N_GRAMS, max_features=features, max_df=STOP_WORDS_H, min_df=STOP_WORDS_L, strip_accents='unicode', stop_words = STOP_WORDS)
    tfidf = TfidfTransformer(use_idf=tdf)

    
    if num_label > 2:
        log = LogisticRegression(solver='liblinear', multi_class='ovr')
    else:
        log = LogisticRegression(solver='liblinear')
        
    x_train_counts = count_vect.fit_transform(train_set)
    x_train = tfidf.fit_transform(x_train_counts)
    
    x_test_counts = count_vect.transform(test_set)
    x_test = tfidf.transform(x_test_counts)

    log.fit(x_train, y_train_set)

    y_train_preds = log.predict(x_train)
    y_test_preds = log.predict(x_test)

    #print("F1 training scores: {:0.2f}".format(f1_score(y_train_set,y_train_preds,average='weighted')))
    #print("F1 test scores: {:0.2f}".format(f1_score(y_test_set,y_test_preds,average='weighted')))
    
    return log, tfidf, count_vect

#KNN
def KNN(dataset, features=750, tdf=False, k=0):
    
    train_set, y_train_set, test_set, y_test_set, vali_set, y_vali_set = splitDataset(dataset, knn = True)
    
    count_vect = CountVectorizer(ngram_range=N_GRAMS, max_features=features, max_df=STOP_WORDS_H, min_df=STOP_WORDS_L, strip_accents='unicode', stop_words = STOP_WORDS)
    tfidf = TfidfTransformer(use_idf=tdf)
    
    x_train_counts = count_vect.fit_transform(train_set)
    x_train = tfidf.fit_transform(x_train_counts)
    
    x_vali_counts = count_vect.transform(vali_set)
    x_vali = tfidf.transform(x_vali_counts)
    
    x_test_counts = count_vect.transform(test_set)
    x_test = tfidf.transform(x_test_counts)
      
    
    if k == 0:
        print("Valuteremo ora che K assegnare per massimizzare le performances.")
        best_score = 0
        best_k = 0

        for k_value in range(1,11):
            knn = KNeighborsClassifier(n_neighbors=k_value)
            knn.fit(x_train, y_train_set)
            y_vali_preds = knn.predict(x_vali)
            print("{} - F1 Validation Score: {:0.2f}".format(k_value,f1_score(y_vali_set,y_vali_preds,average='weighted')))
            if f1_score(y_vali_set,y_vali_preds,average='weighted') > best_score:
                best_score = f1_score(y_vali_set,y_vali_preds,average='weighted')
                best_k = k_value

        print("Il miglior K è {}.".format(best_k))
    else:
        k_value = k
        
    knn = KNeighborsClassifier(n_neighbors=k_value)

    knn.fit(x_train, y_train_set)

    y_train_preds = knn.predict(x_train)
    y_test_preds = knn.predict(x_test)

    #print("F1 training scores: {:0.2f}".format(f1_score(y_train_set,y_train_preds,average='weighted')))
    #print("F1 test scores: {:0.2f}".format(f1_score(y_test_set,y_test_preds,average='weighted')))
    return knn, tfidf, count_vect

#Multinomial Naive Bayes
def MNB(dataset, features=5000, tdf=True):
    
    train_set, y_train_set, test_set, y_test_set = splitDataset(dataset)
    
    count_vect = CountVectorizer(ngram_range=N_GRAMS, max_features=features, max_df=STOP_WORDS_H, min_df=STOP_WORDS_L, strip_accents='unicode', stop_words = STOP_WORDS)
    tfidf = TfidfTransformer(use_idf=tdf)

    x_train_counts = count_vect.fit_transform(train_set)
    x_train = tfidf.fit_transform(x_train_counts)
    
    x_test_counts = count_vect.transform(test_set)
    x_test = tfidf.transform(x_test_counts)

    nb = MultinomialNB()
    nb.fit(x_train, y_train_set)

    y_train_preds = nb.predict(x_train)
    y_test_preds = nb.predict(x_test)

    #print("F1 training scores: {:0.2f}".format(f1_score(y_train_set,y_train_preds,average='weighted')))
    #print("F1 test scores: {:0.2f}".format(f1_score(y_test_set,y_test_preds,average='weighted')))
    
    return nb, tfidf, count_vect

    
#Stampa la lista dei dataframe salvati in memoria
def printDF(loc):
    import os
    directory = "../WEB-DATA/",loc
    fileList = os.listdir(directory)
    try:
        fileList =  [re.search('[A-Za-z]*[^.csv]', x).group(0) if x != '.csv' else '' for x in fileList]
    except Exception as e:
        print(e)
    print(fileList)
    
#Restituisce la lista dei dataframe salvati in memoria
def getDF(loc):
    import os
    directory = "../WEB-DATA/"+loc
    fileList = os.listdir(directory)
    try:
        fileList =  [re.search('[A-Z_a-z09]*[^.csv]', x).group(0) if x != '.csv' else '' for x in fileList]
    except Exception as e:
        print(e)
    return(fileList)

#Aggiunge un utente (se esiste) nel Dataset
def addUser(loc,labels):
    directory = "WEB-DATA/"+loc
    dataset = pd.DataFrame()
    for x in labels:
        user_df = pd.read_csv('../{}/{}.csv'.format(directory,x))
        dataset = pd.concat([dataset, user_df], axis=0, sort=True)
        #print("Aggiunto l'utente {}!".format(x))
    dataset = dataset.sort_values(by='retweet_count')
    dataset = dataset.drop(columns=['Unnamed: 0'])
    dataset = dataset.reset_index(drop=True)
    return dataset

#Rimuove un utente (se presente) dal Dataset
def dropUser(username,dataset,users_in_dt):
    print("Rimosso l'utente {}!".format(username))
    dt = dataset[dataset['handle'] != username]
    dt = dt.reset_index(drop=True)
    return dt

#Mostra una sezione del Dataset
def printDataset(dataset, low_lim = 0, high_lim = 0, user = None):
    if high_lim <= low_lim:
        high_lim = len(dataset)-1        
    if user == None:
        display(dataset[low_lim:high_lim])
    else:
        display(dataset[dataset['handle'] == user])
        

    
#ripuliamo una lista di tweet usando textacy
def cleanTweet(raw_data):
    data = [txt.replace_urls(x,"") for x in raw_data]
    data = [txt.replace_emails(x,"") for x in data]
    data = [txt.replace_emojis(x,"") for x in data]
    data = [txt.replace_user_handles(x,"") for x in data]
    data = [txt.replace_phone_numbers(x,"") for x in data]
    data = [txt.replace_numbers(x,"") for x in data]
    data = [txt.replace_currency_symbols(x,"") for x in data]
    data = [txt.replace_hashtags(x,"") for x in data]
    return data


"""MAIN"""

#import sys

message = 'Lasciatemi cantare con la chitarra in mano lasciatemi cantare sono un italiano Buongiorno Italia gli spaghetti al dente e un partigiano come Presidente autoradio sempre nella mano destra'
type_mes = 'CONDUTTORI'
type_clas = 'LOG_REG'

dataset = pd.DataFrame()
labels = getDF(type_mes)
dataset = addUser(type_mes, labels)

if type_clas == 'KNN':
    clas, tfd, vect = KNN(clean(dataset))
elif type_clas == 'MNB':
    clas, tfd, vect = MNB(clean(dataset))
else:
    clas, tfd, vect = LRegr(clean(dataset),len(labels))
    
mess_vect = vect.transform(cleanTweet(message))
mess_transformed = tfd.transform(mess_vect)

Probas_x = pd.DataFrame(clas.predict_proba(mess_transformed), columns = clas.classes_)
result = pd.DataFrame([message], columns=['Messaggio'])
result[clas.classes_] = Probas_x
output = []

import eli5
display(eli5.show_weights(clas, vec=vect, top=20,target_names=clas.classes_))

output.append(clas.classes_.tolist())
output.append(result.loc[0].tolist())

#print(output)

  'stop_words.' % sorted(inconsistent))


Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9
+4.238,oliver,,,,,,,,
+4.184,maelle,,,,,,,,
+3.044,adoro,,,,,,,,
+3.035,rai1official,,,,,,,,
+2.849,ns,,,,,,,,
+2.757,davvero,,,,,,,,
+2.732,sempre,,,,,,,,
+2.473,cane,,,,,,,,
+2.407,ricordi,,,,,,,,
+2.342,fantastica,,,,,,,,

Weight?,Feature
+4.238,oliver
+4.184,maelle
+3.044,adoro
+3.035,rai1official
+2.849,ns
+2.757,davvero
+2.732,sempre
+2.473,cane
+2.407,ricordi
+2.342,fantastica

Weight?,Feature
+3.290,camerino
+3.220,indizio
+3.175,amo
+3.005,gaetana
+2.776,abito
+2.752,mare
+2.683,video
+2.616,pronta
+2.366,anteprima
+2.338,look

Weight?,Feature
+8.119,ciao
+5.260,juve
+4.314,striscia
+3.501,monaco
+3.455,bacione
+3.431,film
+3.114,socio
+2.932,enzino
+2.874,famiglia
+2.807,londra

Weight?,Feature
+5.321,20
+4.376,23
+4.297,21
+3.673,domenica
+3.096,ospiti
+2.817,chefuoritempochefa
+2.738,vediamo
+2.605,rivediamo
+2.554,verita
+2.452,bg

Weight?,Feature
+2.508,notte
+2.433,direi
+2.368,zeber
+2.339,dici
+2.313,forse
+2.247,sinceramente
+2.221,zerbi
+2.165,capito
+2.136,amico
+2.132,parlavo

Weight?,Feature
+2.874,amore
+2.696,letterina
+2.541,cari
+2.468,libri
+2.374,gigia
+2.346,appello
+2.318,caro
+2.144,libro
+2.141,balenghi
+2.130,iniziamo

Weight?,Feature
+4.653,burattini
+3.619,benedico
+3.593,milly
+3.397,neanche
+3.240,defilippibers
+3.143,perche
+2.880,pure
+2.806,francesco
+2.727,defilippiber
+2.659,maurizio

Weight?,Feature
+5.241,bacio
+4.757,violenza
+3.908,love
+3.849,michelle
+3.770,mich
+3.347,scommessa
+2.820,bacino
+2.790,mua
+2.653,buon
+2.635,lilly

Weight?,Feature
+5.153,alle19
+3.305,stasera
+2.961,edicolafiore
+2.788,laos
+2.639,grandissimo
+2.321,domani
+2.248,rosario_fiorello
+2.228,vabbe
+2.071,puntata
+2.037,novembre

Weight?,Feature
+6.814,tramite
+4.019,sgarbi
+3.936,arte
+3.903,rinascimento
+3.856,maio
+3.753,mostra
+3.398,palazzo
+3.237,sindaco
+3.146,italia
+3.107,ministro
