# Import das Bibliotecas

In [1]:
import pandas as pd
import numpy as np

#utils
import emoji, re, string, time, os

#nlp
import nltk
from nltk.corpus import stopwords
import spacy

#features
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

#models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble  import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

#CV
from sklearn.model_selection import cross_validate


# Carregamento do dataset

In [2]:
original_df = pd.read_csv("C:/Users/lmb3/source/repos/FakeWhatsApp.Br-master/data/2018/fakeWhatsApp.BR_2018.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
original_df = original_df[original_df['midia']==0]
original_df = original_df[original_df['viral']==1]
original_df.shape

(20872, 19)

In [4]:
original_df = original_df.drop_duplicates(subset=['text'])
original_df.shape

(5284, 19)

In [5]:
text_df = original_df[["text","misinformation"]]

In [6]:
text_df["misinformation"].value_counts()

 0    2547
 1    2041
-1     696
Name: misinformation, dtype: int64

In [7]:
text_df = text_df[text_df["misinformation"]>-1]
text_df.reset_index(inplace=True)
text_df = text_df.drop('index',axis=1)
text_df

Unnamed: 0,text,misinformation
0,"No dia 07 de Outubro, quando for votar, não es...",1
1,Devido a aborrecimentos nas varias redes socia...,0
2,EU GOSTARIA DE PEDIR A TODOS DO GRUPOS ENTREM ...,0
3,Como são as coisas.\nChefe do jacaré aparece n...,1
4,A ratoeira funcionou mais a ratazana aguentou ...,0
...,...,...
4583,Amoedo - Itaú - Metacapitalistas (George Soros...,1
4584,*BEM GALERA... O VIDEO DA PSICOLOGIA REVERSA P...,0
4585,DIVULGUEM ESSE VIDEO PARA QUE TODOS SAIBAM QUE...,1
4586,https://www.oantagonista.com/brasil/exclusivo-...,0


# Pre-Processamento

## Remocao de StopWords, sao mantidas pontuacoes e emojis

In [8]:
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import emoji

In [9]:
def domainUrl(text):
    '''
    Substitutes an URL in a text for the domain of this URL
    Input: an string
    Output: the string with the modified URL
    '''    
    if 'http' in text:
        re_url = '[^\s]*https*://[^\s]*'
        matches = re.findall(re_url, text, flags=re.IGNORECASE)
        for m in matches:
            domain = m.split('//')
            domain = domain[1].split('/')[0]
            text = re.sub(re_url, domain, text, 1)
        return text
    else:
        return text

def processLoL(text):
    re_kkk = 'kkk*'
    t = re.sub(re_kkk, "kkkk", text, flags=re.IGNORECASE)
    return t

def processEmojisPunctuation(text):
    '''
    Put spaces between emojis. Removes punctuation.
    '''
    
    emojis_list = list(emoji.UNICODE_EMOJI.keys())
    emojis_list += ['\n']
    punct = list(string.punctuation) + ['\n']
    emojis_punct = emojis_list + punct
    
    #get all unique chars
    chars = set(text)
    #for each unique char in text, do:
    for c in chars:
        if c in punct:
            text = text.replace(c, ' ' + c + ' ')
        
        if c in emojis_list:
            text = text.replace(c, ' ' + c + ' ')                        
            
    text = re.sub(' +', ' ', text)
    return text
    
    
stop_words = list(stopwords.words('portuguese'))
new_stopwords = ['aí','pra','vão','vou','onde','lá','aqui',
                 'tá','pode','pois','so','deu','agora','todo',
                 'nao','ja','vc', 'bom', 'ai','ta', 'voce', 'alguem', 'ne', 'pq',
                 'cara','to','mim','la','vcs','tbm', 'tudo']
stop_words = stop_words + new_stopwords
final_stop_words = []
for sw in stop_words:
    sw = ' '+ sw + ' '
    final_stop_words.append(sw)

def removeStopwords(text):
    for sw in final_stop_words:
        text = text.replace(sw,' ')
    text = re.sub(' +',' ',text)
    return text
    
def preprocess(text):

    text = text.lower().strip()
    text = domainUrl(text)
    text = processLoL(text)
    text = processEmojisPunctuation(text)
    text = removeStopwords(text)

    return text

'''
def puct_stopword_removal(msg):
    
    stop_words = stopwords.words('portuguese')
    new_stopwords = ['aí','pra','vão','vou','onde','lá','aqui',
                     'tá','pode','pois','so','deu','agora','todo',
                     'nao','ja','vc', 'bom', 'ai','ta', 'voce', 'alguem', 'ne', 'pq',
                     'cara','to','mim','la','vcs','tbm', 'tudo']
    stop_words = stop_words + new_stopwords
    punctuation = list(string.punctuation)+['...']
    
    minusculas_msg = msg.lower().strip()
    msgs_tokens = word_tokenize(minusculas_msg)

    cleaned_msg_tokens = [token for token in msgs_tokens if token not in stop_words and token not in punctuation]
    clean_msgs = ' '.join(cleaned_msg_tokens) 
    
    return clean_msgs
'''

In [10]:
text_df['pre_processed_text'] = [preprocess(t) for t in text_df['text']]

In [11]:
text_df['pre_processed_text']

0       no dia 07 outubro , votar , esqueça esquerdopa...
1       devido aborrecimentos varias redes sociais pos...
2       eu gostaria pedir todos grupos entrem neste li...
3       como coisas . \n chefe jacaré aparece video ba...
4       a ratoeira funcionou ratazana aguentou dor cal...
                              ...                        
4583    amoedo - itaú - metacapitalistas ( george soro...
4584     * bem galera . . . video psicologia reversa a...
4585    divulguem video todos saibam bolsonaro . . . 😑...
4586    www . oantagonista . * exclusivo : bolsonaro f...
4587     * # bolsonaro2018 * 🇧🇷 \n * 1 * - foco trocar...
Name: pre_processed_text, Length: 4588, dtype: object

In [12]:
X = text_df['pre_processed_text']
y = text_df['misinformation']

# Vetorização

In [13]:
#TFIDF
vectorizer_tfidf = TfidfVectorizer(ngram_range = (1,3),token_pattern = r'[^\s]+')#,max_features=20000
tfidf_transformer = vectorizer_tfidf.fit(X)
tfidf_vect = tfidf_transformer.transform(X)
tfidf_lsit = tfidf_vect.toarray()
tfidf_df = pd.DataFrame(data=tfidf_lsit,columns = tfidf_transformer.get_feature_names()) #,columns = tfidf_transformer.get_feature_names()
y = y.reset_index(drop=True)
tfidf_df.insert(len(tfidf_df.columns), 'target', y)
tfidf_df

Unnamed: 0,!,! !,! ! !,"! ! """,! ! #,! ! (,! ! ),! ! *,"! ! ,",! ! -,...,🧓🏼 sogra 12,🧔🏻,🧔🏻 música,🧔🏻 música existe,🧔🏻 sim,"🧔🏻 sim ,",🧔🏽,🧔🏽 vizinho,🧔🏽 vizinho 15,target
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4583,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4584,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4585,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4586,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


# Feature Selection

## Chi2

In [14]:
from sklearn.feature_selection import chi2

bestfeatures_chi2 = chi2(tfidf_df.iloc[:,:-1], tfidf_df.iloc[:,-1])

# Criação do Dataframe
dfscores_chi2 = pd.DataFrame(bestfeatures_chi2[0])
dfpvalues_chi2 = pd.DataFrame(bestfeatures_chi2[1])
dfcolumns_chi2 = pd.DataFrame(tfidf_df.iloc[:,:-1].columns)
featureScores_chi2 = pd.concat([dfcolumns_chi2,dfscores_chi2,dfpvalues_chi2],axis=1)
featureScores_chi2.columns = ['Atributo','Score','P-valor']

tfidf_chi_df = tfidf_df.loc[:,featureScores_chi2.sort_values('P-valor',ascending = True)[:10000]['Atributo']]
y = y.reset_index(drop=True)
tfidf_chi_df.insert(len(tfidf_chi_df.columns), 'target', y)
tfidf_chi_df

Unnamed: 0,whatsapp,. whatsapp .,chat,chat . whatsapp,chat .,. whatsapp,whatsapp .,entrar grupo,link entrar grupo,link entrar,...,sabia aproximadamente,recebem aposentadoria ex,recebem aposentadoria,desse benefício,outros recebem benefício,“bolsa ditadura”,receio perderem,além desse benefício,receio perderem mamata,target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Mutual Information

In [15]:
from sklearn.feature_selection import mutual_info_classif

bestfeatures_mi = mutual_info_classif(tfidf_df.iloc[:,:-1], tfidf_df.iloc[:,-1],discrete_features=True)

# Criação do Dataframe
dfscores_mi = pd.DataFrame(bestfeatures_mi)
dfcolumns_mi = pd.DataFrame(tfidf_df.iloc[:,:-1].columns)
featureScores_mi = pd.concat([dfcolumns_mi,dfscores_mi],axis=1)
featureScores_mi.columns = ['Atributo','Score']

tfidf_mi_df = tfidf_df.loc[:,featureScores_mi.sort_values('Score',ascending = False)[:5000]['Atributo']]
y = y.reset_index(drop=True)
tfidf_mi_df.insert(len(tfidf_mi_df.columns), 'target', y)
tfidf_mi_df

Unnamed: 0,.,",",!,bolsonaro,:,. .,-,*,. . .,brasil,...,jean wyllys,", valores",auditoria,. lembre,seres,resumo,mudanças,renan,entra,target
0,0.027419,0.068797,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.080295,0.033578,0.000000,0.000000,0.000000,0.106164,0.000000,0.000000,0.060376,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.048188,0.020152,0.000000,0.000000,0.029519,0.000000,0.000000,0.067064,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.051560,0.021562,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4583,0.089408,0.148222,0.000000,0.000000,0.000000,0.000000,0.031976,0.006666,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4584,0.133230,0.000000,0.000000,0.000000,0.048969,0.105693,0.000000,0.111251,0.060108,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4585,0.101339,0.000000,0.000000,0.055408,0.000000,0.133988,0.000000,0.000000,0.076200,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4586,0.077470,0.016199,0.000000,0.042357,0.047457,0.000000,0.000000,0.161724,0.000000,0.029637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


# Modelos

## Comparação de modelos

### Mutual Information

In [16]:
from pycaret.classification import *

baseline_tfidf_01 = setup(data = tfidf_mi_df, target = 'target',use_gpu=True, numeric_features=tfidf_mi_df.columns.tolist()[:-1],
                         data_split_stratify=True)#,remove_perfect_collinearity=False) 

Unnamed: 0,Description,Value
0,session_id,2573
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(4588, 5001)"
5,Missing Values,False
6,Numeric Features,5000
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [17]:
best_model_tfidf = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.758,0.8321,0.6723,0.7579,0.7119,0.5046,0.508,3.523
svm,SVM - Linear Kernel,0.7412,0.0,0.6478,0.743,0.6875,0.4693,0.4763,1.67
rf,Random Forest Classifier,0.7487,0.8234,0.6072,0.7798,0.6823,0.4798,0.4904,2.219
ridge,Ridge Classifier,0.7325,0.0,0.5974,0.7533,0.6646,0.4472,0.4568,0.748
qda,Quadratic Discriminant Analysis,0.5338,0.5765,0.9629,0.4879,0.6476,0.1396,0.2327,6.511
nb,Naive Bayes,0.6758,0.6819,0.6506,0.6315,0.6406,0.3455,0.346,0.408
gbc,Gradient Boosting Classifier,0.7169,0.7882,0.5337,0.7601,0.6256,0.4093,0.4268,19.054
dt,Decision Tree Classifier,0.6711,0.665,0.6099,0.6365,0.6227,0.3315,0.3319,3.346
ada,Ada Boost Classifier,0.6889,0.7403,0.5505,0.6895,0.6111,0.3573,0.3645,4.72
lda,Linear Discriminant Analysis,0.6204,0.6194,0.6107,0.5687,0.5885,0.2371,0.2379,20.18


### Chi2

In [18]:
from pycaret.classification import *

baseline_tfidf_02 = setup(data = tfidf_chi_df, target = 'target',use_gpu=True, numeric_features=tfidf_chi_df.columns.tolist()[:-1],
                         data_split_stratify=True)#,remove_perfect_collinearity=False) 

Unnamed: 0,Description,Value
0,session_id,8420
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(4588, 10001)"
5,Missing Values,False
6,Numeric Features,10000
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [19]:
best_model_tfidf = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.8773,0.8778,0.8768,0.8522,0.864,0.7523,0.753,0.414
svm,SVM - Linear Kernel,0.7901,0.0,0.6387,0.8577,0.727,0.5635,0.5837,1.587
ridge,Ridge Classifier,0.7817,0.0,0.6289,0.8407,0.7185,0.5465,0.5625,0.751
et,Extra Trees Classifier,0.7786,0.8748,0.6079,0.8525,0.7092,0.5385,0.5589,3.343
rf,Random Forest Classifier,0.7646,0.8625,0.5658,0.8569,0.6809,0.5066,0.5345,2.217
qda,Quadratic Discriminant Analysis,0.6001,0.6303,0.902,0.5304,0.6675,0.2437,0.3032,7.106
gbc,Gradient Boosting Classifier,0.7303,0.7975,0.5623,0.7701,0.6494,0.4389,0.4536,16.776
dt,Decision Tree Classifier,0.696,0.6881,0.6163,0.6738,0.6427,0.3793,0.3814,2.235
ada,Ada Boost Classifier,0.6992,0.7455,0.556,0.7054,0.6211,0.3777,0.3855,4.519
lr,Logistic Regression,0.7023,0.8035,0.4677,0.7734,0.5819,0.3723,0.4012,0.864


In [20]:
SEED = 42

In [21]:
tuned_linearSVM = tune_model(linear_svm, optimize = 'F1', custom_grid = {'C':[0.521],'penalty':['l2'],'dual':[True],
                                                                         'intercept_scaling':[22],'class_weight':['balanced']},
                            search_algorithm='grid')#,n_iter=1000)

NameError: name 'linear_svm' is not defined

In [None]:
print(linear_svm)

In [None]:
print(tuned_linearSVM)

In [None]:
from sklearn.naive_bayes import ComplementNB

cNB = create_model(ComplementNB())

In [None]:
tuned_cNB = tune_model(cNB, optimize = 'F1', custom_grid = {'alpha':[0.668],'norm':[False]},
                            search_algorithm='grid')#,n_iter=1000)

In [None]:
print(cNB)

In [None]:
print(tuned_cNB)

In [None]:
from sklearn.naive_bayes import MultinomialNB

mNB = create_model(MultinomialNB())

In [None]:
tuned_mNB = tune_model(mNB, optimize = 'F1', custom_grid = {'alpha':[0.074]},
                            search_algorithm='grid')#,n_iter=1000)

In [None]:
print(mNB)

In [None]:
print(tuned_mNB)

In [None]:
lr = create_model('lr',n_jobs=-1,random_state=SEED)

In [None]:
tuned_lr = tune_model(lr, optimize = 'F1', custom_grid = {'solver':['liblinear'],'penalty':['l2'],'C':[3.83]},
                            search_algorithm='grid')#,n_iter=1000)

In [None]:
print(lr)

In [None]:
print(tuned_lr)

In [None]:
ridge = create_model('ridge',random_state=None)

In [None]:
tuned_ridge = tune_model(ridge, optimize = 'F1', custom_grid = {'aplpha':np.arange(1,101,1),'class_weight':[None,'balanced']},
                            search_algorithm='random',n_iter=2000)

In [None]:
et = create_model('et',random_state=SEED)

In [None]:
tuned_et = tune_model(et, optimize = 'F1', custom_grid = {'n_estimators':np.arange(10,1000,10),'criterion':['gini','entropy'],
                                                          'max_depth':np.arange(10,1000,10), 'min_samples_split':np.arange(2,200,2),
                                                         'min_samples_leaf':np.arange(1,100,1),'max_features':['auto','sqrt','log2'],
                                                         'max_leaf_nodes':np.arange(10,10000,10),'min_impurity_decrease':np.arange(0,10,0.1),
                                                         'bootstrap':[True,False],'oob_score':[True,False],'warm_start':[True,False],
                                                         'class_weight':[None,'balanced','balanced_subsample'],'ccp_alpha':np.arange(0,10,0.1),
                                                         'max_samples':np.arange(321,3211,100)},
                            search_algorithm='random',n_iter=2000)