In [60]:
import pickle, json, nltk, re
import pandas as pd
from string import ascii_lowercase, punctuation
from unicodedata import normalize


In [178]:
# Load the pickled vectorizer that vetorizar_textos uses to transform texts.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files that come from a trusted source.
with open('app/data/vectorizer', 'rb') as file:
    vectorizer = pickle.load(file)

In [179]:
# Load the pickled model whose predict / predict_proba methods are called by
# predict_notices.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files that come from a trusted source.
with open('app/data/model', 'rb') as file:
    model_mpl = pickle.load(file)

In [180]:
# Load the stop-word collection that processar_texto uses to filter tokens.
with open('app/data/stopwords.json', 'r') as file:
    stopwords = json.load(file)

In [8]:
# Read app/data/words.csv into a module-level frame. In the cells visible here
# it is only inspected through df_words.shape.
df_words = pd.read_csv('app/data/words.csv')

In [189]:

def remover_acentuacao(texto):
    """Strip accents from ``texto`` and return ASCII-only text.

    The string is decomposed with Unicode NFKD normalisation, which splits an
    accented letter into its base letter plus combining marks. Every character
    that cannot be encoded as ASCII is then dropped.
    """
    decomposto = normalize('NFKD', texto)
    apenas_ascii = decomposto.encode('ASCII', 'ignore')
    return apenas_ascii.decode()


def processar_texto(texto):
    """Normalise a raw text into a space-separated string of filtered tokens.

    Steps: lower-case the text, strip accents with ``remover_acentuacao``,
    tokenise with ``nltk.word_tokenize`` and keep only tokens that are not
    punctuation, are not in ``stopwords``, contain no digit or underscore and
    are longer than two characters. No stemming is applied.

    Args:
        texto: Raw text. Non-string values are converted with ``str``.

    Returns:
        The cleaned text, or ``''`` when ``texto`` is ``None``, NaN or any
        other empty/falsy value.
    """
    # A missing cell read by pandas arrives as float NaN, which is truthy.
    # Without this guard it would be stringified into the literal token 'nan'.
    if texto is None or (isinstance(texto, float) and texto != texto):
        return ''
    if not texto:
        return ''

    # Lower-case the text.
    texto = str(texto).lower()

    # Remove accents.
    texto = remover_acentuacao(texto)

    # Drop punctuation, stop words, tokens containing digits/underscores and
    # tokens of one or two characters.
    # NOTE(review): `punctuation` is a string, so `c not in punctuation` is a
    # substring test; a multi-character token such as '...' is not removed by
    # it. Left unchanged so the output stays aligned with whatever
    # preprocessing the pickled vectorizer was fitted on -- confirm.
    tokens = [c for c in nltk.word_tokenize(texto)
              if (c not in punctuation)
              and (c not in stopwords)
              and not (re.match(r'.*[\d_].*', c))
              and len(c) > 2
              ]

    return ' '.join(tokens)



def vetorizar_textos(textos, vetorizador=None):
    """Transform preprocessed texts into a dense feature DataFrame.

    Args:
        textos: Iterable of already-cleaned text strings.
        vetorizador: Optional fitted vectorizer exposing ``transform`` and
            ``get_feature_names_out`` (or the older ``get_feature_names``).
            Defaults to the module-level ``vectorizer`` loaded from disk.

    Returns:
        A ``pandas.DataFrame`` with one row per input text and one column per
        feature name reported by the vectorizer.
    """
    if vetorizador is None:
        vetorizador = vectorizer

    matriz = vetorizador.transform(textos)

    # scikit-learn 1.2 removed get_feature_names(); prefer the replacement and
    # fall back to the old name for vectorizers from older releases.
    if hasattr(vetorizador, 'get_feature_names_out'):
        colunas = vetorizador.get_feature_names_out()
    else:
        colunas = vetorizador.get_feature_names()

    # toarray() gives a plain ndarray rather than the np.matrix that todense()
    # returns; the resulting frame holds the same values.
    return pd.DataFrame(matriz.toarray(), columns=colunas)



def predict_notices(notices):
    """Classify one or more news texts with the loaded model.

    Args:
        notices: A single string, a ``pandas.Series`` of strings, or any
            other value ``pandas.DataFrame`` accepts as single-column data.

    Returns:
        A 3-tuple ``(predict, predict_proba, features)``: the outputs of
        ``model_mpl.predict`` and ``model_mpl.predict_proba``, followed by the
        feature DataFrame produced by ``vetorizar_textos``.
    """
    # Normalise the input to a plain list. A Series is flattened so that its
    # index labels do not interfere with the DataFrame construction below.
    if isinstance(notices, str):
        notices = [notices]
    elif isinstance(notices, pd.Series):
        notices = list(notices)

    # Clean every text, then vectorise the cleaned texts.
    textos_limpos = pd.DataFrame(notices, columns=['n'])['n'].apply(processar_texto)
    features = vetorizar_textos(textos_limpos)

    classes = model_mpl.predict(features)
    probabilidades = model_mpl.predict_proba(features)

    return classes, probabilidades, features

def obter_porcentagem(predict_proba):
    """Round the first two probabilities of every row to three decimals.

    Args:
        predict_proba: Iterable of indexable rows. Only positions 0 and 1 of
            each row are read.

    Returns:
        A list of ``(float, float)`` tuples, one per input row. Values are the
        rounded inputs on their original scale; they are not multiplied by 100.
    """
    def _arredondar(valor):
        # Rounding goes through the fixed three-decimal text form.
        return float('%.3f' % valor)

    porcentagens = []
    for linha in predict_proba:
        porcentagens.append((_arredondar(linha[0]), _arredondar(linha[1])))
    return porcentagens

In [90]:
# Load the spreadsheet of sample texts; its 'noticias' column is what gets
# passed to predict_notices in the following cells.
noticias_teste = pd.read_excel('app/data/teste.xlsx')

In [190]:
# Run the full pipeline on the sample texts. The third return value (the
# feature frame) is bound to `_`.
pred, proba, _ = predict_notices(noticias_teste['noticias'])

In [191]:
# This is the module-level df_words read from app/data/words.csv; the variable
# of the same name inside vetorizar_textos is local to that function.
df_words.shape

(16314, 492)

In [192]:
# Show the first two probabilities of each text, rounded to three decimals.
obter_porcentagem(proba)

[(0.051, 0.949),
 (0.189, 0.811),
 (0.955, 0.045),
 (0.001, 0.999),
 (0.0, 1.0),
 (0.0, 1.0)]

In [181]:
# Run the full pipeline on the sample texts. The third return value (the
# feature frame) is bound to `_` and inspected in the next cell.
pred, proba, _ = predict_notices(noticias_teste['noticias'])

In [182]:
# `_` was bound by the tuple unpacking in the previous cell to the third value
# returned by predict_notices, so this is (number of texts, number of features).
_.shape

(6, 33747)

In [183]:
# Display the value returned by model_mpl.predict for the sample texts.
pred

array([1, 1, 0, 1, 1, 1], dtype=int64)

In [184]:
# NOTE(review): the saved output below shows tuples of strings, but
# obter_porcentagem as currently defined returns floats. The output was
# presumably produced by an earlier version of the function; re-run to refresh.
obter_porcentagem(proba)

[('0.051', '0.949'),
 ('0.189', '0.811'),
 ('0.955', '0.045'),
 ('0.001', '0.999'),
 ('0.000', '1.000'),
 ('0.000', '1.000')]

In [99]:
# Load the news dataset; its 'noticia' column is sampled in the next cell.
df_noticias = pd.read_csv('app/data/noticias_all.csv')

In [125]:
# Predict for six rows selected by index label (.loc) from the 'noticia' column.
pred, proba, _= predict_notices(df_noticias.loc[[1, 2, 1000, 7500, 9500, 10500]]['noticia'])

In [126]:
# Display the value returned by model_mpl.predict for the six selected rows.
pred

array([1, 1, 1, 0, 0, 0], dtype=int64)

In [123]:
# NOTE(review): the saved output below is a Series of cleaned text named 'n',
# whereas predict_notices as currently defined returns a feature DataFrame as
# its third value. The output was presumably produced by an earlier version of
# the function; re-run to refresh.
_

0    agentes divisao homicidios policia civil ruas ...
1    presidente eleito jair bolsonaro psl veio paul...
2    parte carga carnes desviada pms alvos operacao...
3    jorge sampaoli treinador argentina copa russia...
4    brasil segue fazendo historia neve nesta quint...
5    intermedio agora executivo futebol renato sant...
Name: n, dtype: object