In [3]:
import pandas as pd
import numpy as np
import requests,nltk, re, json
from string import ascii_lowercase, punctuation
from unicodedata import normalize

from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, NMF
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

import gc 
import pickle

In [4]:
df_noticias = pd.read_csv('noticias_all.csv')
df_noticias.shape

(16314, 2)

In [5]:
df_noticias.head()

Unnamed: 0,noticia,target
0,Carona com Deyverson: atacante do Palmeiras ab...,1
1,River x Boca: final da Libertadores em Madri t...,1
2,E-Brasileirão 2018\r\n\r\nAssista ao vivo à fa...,1
3,Inauguração de estátua de Renato Gaúcho no Grê...,1
4,Zagallo visita novo CT do Flamengo antes de in...,1


In [6]:
def remover_acentuacao(texto):
    return normalize('NFKD', texto).encode('ASCII', 'ignore').decode()


stopwords = requests.get('https://gist.githubusercontent.com/alopes/5358189/raw/2107d809cca6b83ce3d8e04dbd9463283025284f/stopwords.txt').text
stopwords = [remover_acentuacao(w) for w in stopwords.split()]
stopwords += ['ser', 'pode']

with open('app/data/stopwords.json', 'w') as file:
    json.dump(stopwords, file)

In [145]:
stemmer = nltk.stem.RSLPStemmer()

def processar_texto(texto):
    if texto is None or not texto:
        return ''

    # Trasnformação do texto em minúsculo e remoção de termo
    texto = str(texto).lower()

    # Removendo acentuação
    texto = remover_acentuacao(texto)

    # Removendo Pontuação, stopwords, palavras com número e aplicando stemming
    texto = ' '.join([c for c in nltk.word_tokenize(texto) 
                      if (c not in punctuation) 
                      and (c not in stopwords) 
                      and not (re.match(r'.*[\d_].*', c)) 
                      and len(c) > 2
                     ])

    return texto



def vetorizar_texto(textos, vectorizer=None):
    
    if not vectorizer:
        vectorizer = TfidfVectorizer(
            ngram_range=(1, 1),
            min_df=0.05
        )
        
        vectorizer = vectorizer.fit(textos)
    
    
    tfidf_matrix = vectorizer.transform(textos)    
    df_words = pd.DataFrame(tfidf_matrix.todense(), columns=vectorizer.get_feature_names())
    
    return df_words, vectorizer



def valida(model, x_train, y_train, x_test, y_test):
    
    grid = GridSearchCV(
        estimator=model['model'],
        param_grid=model['param'],
        scoring='roc_auc',
        n_jobs=-1
    )
    
    grid = grid.fit(x_train, y_train)
    predict = grid.predict(x_test)
    
    return grid, metrics.roc_auc_score(y_test, predict), predict

### Processamento do texto

In [8]:
df_noticias['doc'] = df_noticias['noticia'].apply(processar_texto)

In [19]:
df_words, vectorizer = vetorizar_texto(df_noticias['doc'])

In [20]:
df_words.shape

(16314, 264)

In [21]:
#nmf = NMF(n_components=100, random_state=1)
#nmf_w = nmf.fit_transform(df_words)

In [23]:
df_words.head()

Unnamed: 0,abert,acess,acontec,acord,afirm,agor,aind,ajud,alem,algum,...,vag,vai,val,vej,vem,venc,vend,vez,vitor,volt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.48356,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058348
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.068283,0.0,0.0,0.079516,...,0.0,0.088978,0.0,0.0,0.0,0.0,0.0,0.0,0.103538,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.106171,...,0.0,0.118804,0.0,0.13459,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
x_train, x_test, y_train, y_test = train_test_split(
    df_words, 
    df_noticias['target'], 
    test_size=0.33, 
    random_state=1
)

In [150]:
models = {'xgb':{}, 'mlp':{}}

models['xgb']['model'] = XGBClassifier(
    random_state=1,
    n_jobs=-1
)

models['xgb']['param'] = {
    'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8],
    'gamma':[0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'booster':['gbtree', 'gblinear', 'dart']
}


models['mlp']['model'] = MLPClassifier(
    random_state = 1
)

models['mlp']['param'] = {
    'hidden_layer_sizes': [(10, 20 , 30), (30, 20 , 10), (20, 30), (30, 20)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
}


### XGBoost

In [56]:
grid, score, pred = valida(models['xgb'], x_train, y_train, x_test, y_test)

  if diff:


In [58]:
grid.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.1, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=4,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [59]:
grid.best_params_

{'booster': 'gbtree', 'gamma': 0.1, 'max_depth': 5, 'n_estimators': 4}

In [60]:
grid.best_score_

0.9691954022988506

In [61]:
score

0.9591406803244842

In [65]:
pred

array([1, 1, 0, ..., 0, 0, 1], dtype=int64)

### MLP Classifier

In [70]:
grid, score, pred = valida(models['mlp'], x_train, y_train, x_test, y_test)

In [71]:
grid.best_estimator_

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 20, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [77]:
grid.grid_scores_



[mean: 0.98805, std: 0.00362, params: {'activation': 'identity', 'hidden_layer_sizes': (10, 20, 30), 'solver': 'lbfgs'},
 mean: 0.98881, std: 0.00250, params: {'activation': 'identity', 'hidden_layer_sizes': (10, 20, 30), 'solver': 'sgd'},
 mean: 0.98728, std: 0.00244, params: {'activation': 'identity', 'hidden_layer_sizes': (10, 20, 30), 'solver': 'adam'},
 mean: 0.98789, std: 0.00277, params: {'activation': 'identity', 'hidden_layer_sizes': (30, 20, 10), 'solver': 'lbfgs'},
 mean: 0.98743, std: 0.00207, params: {'activation': 'identity', 'hidden_layer_sizes': (30, 20, 10), 'solver': 'sgd'},
 mean: 0.98713, std: 0.00358, params: {'activation': 'identity', 'hidden_layer_sizes': (30, 20, 10), 'solver': 'adam'},
 mean: 0.98774, std: 0.00319, params: {'activation': 'identity', 'hidden_layer_sizes': (20, 30), 'solver': 'lbfgs'},
 mean: 0.98636, std: 0.00169, params: {'activation': 'identity', 'hidden_layer_sizes': (20, 30), 'solver': 'sgd'},
 mean: 0.98667, std: 0.00358, params: {'activati

In [73]:
grid.best_score_

0.9897318007662835

In [74]:
score

0.9863201214620978

In [75]:
predict_proba = grid.best_estimator_.predict_proba(x_test)

In [76]:
predict_proba

array([[0.00626   , 0.99374   ],
       [0.00630827, 0.99369173],
       [0.9983204 , 0.0016796 ],
       ...,
       [0.99832031, 0.00167969],
       [0.99829651, 0.00170349],
       [0.00630133, 0.99369867]])

## Sem aplicar Stemming

### Processamento do texto

In [80]:
df_noticias['doc2'] = df_noticias['noticia'].apply(processar_texto)

In [146]:
df_words, vectorizer = vetorizar_texto(df_noticias['doc2'])

In [147]:
df_words.shape

(16314, 492)

In [148]:
x_train, x_test, y_train, y_test = train_test_split(
    df_words, 
    df_noticias['target'], 
    test_size=0.33, 
    random_state=1
)

### XGBoost

In [225]:
grid, score, pred = valida(models['xgb'], x_train, y_train, x_test, y_test)

  if diff:


In [226]:
grid.best_estimator_

XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.4, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=8,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [227]:
grid.best_params_

{'booster': 'gblinear', 'gamma': 0.4, 'max_depth': 3, 'n_estimators': 8}

In [228]:
grid.best_score_

0.9984901134619247

In [229]:
scores = grid.grid_scores_
scores.sort(key=lambda x: x[1], reverse=True)
scores[:40]



[mean: 0.99849, std: 0.00006, params: {'booster': 'gblinear', 'gamma': 0.4, 'max_depth': 3, 'n_estimators': 8},
 mean: 0.99849, std: 0.00006, params: {'booster': 'gblinear', 'gamma': 0.1, 'max_depth': 3, 'n_estimators': 8},
 mean: 0.99849, std: 0.00006, params: {'booster': 'gblinear', 'gamma': 0.2, 'max_depth': 3, 'n_estimators': 8},
 mean: 0.99849, std: 0.00006, params: {'booster': 'gblinear', 'gamma': 0.2, 'max_depth': 4, 'n_estimators': 8},
 mean: 0.99849, std: 0.00006, params: {'booster': 'gblinear', 'gamma': 0.2, 'max_depth': 6, 'n_estimators': 8},
 mean: 0.99849, std: 0.00006, params: {'booster': 'gblinear', 'gamma': 0.5, 'max_depth': 3, 'n_estimators': 8},
 mean: 0.99849, std: 0.00006, params: {'booster': 'gblinear', 'gamma': 0.1, 'max_depth': 7, 'n_estimators': 8},
 mean: 0.99849, std: 0.00006, params: {'booster': 'gblinear', 'gamma': 0.3, 'max_depth': 6, 'n_estimators': 8},
 mean: 0.99849, std: 0.00006, params: {'booster': 'gblinear', 'gamma': 0, 'max_depth': 8, 'n_estimators'

In [230]:
score

0.5

In [231]:
pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

### MLP Classifier

In [149]:
grid, score, pred = valida(models['mlp'], x_train, y_train, x_test, y_test)

In [151]:
grid.best_estimator_

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [152]:
grid.best_params_

{'activation': 'relu', 'hidden_layer_sizes': (20, 30), 'solver': 'adam'}

In [153]:
scores = grid.grid_scores_
scores.sort(key=lambda x: x[1], reverse=True)
scores[:20]



[mean: 0.99973, std: 0.00007, params: {'activation': 'relu', 'hidden_layer_sizes': (20, 30), 'solver': 'adam'},
 mean: 0.99972, std: 0.00008, params: {'activation': 'relu', 'hidden_layer_sizes': (30, 20), 'solver': 'adam'},
 mean: 0.99970, std: 0.00006, params: {'activation': 'relu', 'hidden_layer_sizes': (30, 20, 10), 'solver': 'adam'},
 mean: 0.99970, std: 0.00005, params: {'activation': 'relu', 'hidden_layer_sizes': (10, 20, 30), 'solver': 'adam'},
 mean: 0.99970, std: 0.00006, params: {'activation': 'logistic', 'hidden_layer_sizes': (20, 30), 'solver': 'adam'},
 mean: 0.99969, std: 0.00006, params: {'activation': 'logistic', 'hidden_layer_sizes': (30, 20), 'solver': 'adam'},
 mean: 0.99969, std: 0.00004, params: {'activation': 'logistic', 'hidden_layer_sizes': (10, 20, 30), 'solver': 'adam'},
 mean: 0.99969, std: 0.00006, params: {'activation': 'logistic', 'hidden_layer_sizes': (30, 20, 10), 'solver': 'adam'},
 mean: 0.99968, std: 0.00008, params: {'activation': 'tanh', 'hidden_lay

In [154]:
grid.best_score_

0.9997282346559676

In [155]:
score

0.9914073494414334

In [156]:
pred

array([1, 1, 0, ..., 0, 0, 0], dtype=int64)

In [169]:
predict_proba = grid.best_estimator_.predict_proba(x_test)

In [170]:
predict_proba

array([[1.87062571e-05, 9.99981294e-01],
       [6.72742732e-05, 9.99932726e-01],
       [9.99999345e-01, 6.54623952e-07],
       ...,
       [9.99999904e-01, 9.60787392e-08],
       [9.99388052e-01, 6.11947637e-04],
       [1.00000000e+00, 9.42578874e-11]])

In [182]:
predict_proba.tolist()[-1]

[0.9999999999057421, 9.425788739839954e-11]

In [220]:
float_formatter = lambda x: "%.7f" % x

In [224]:
float_formatter(predict_proba[-2][1])

'0.0006119'