In [1]:
import pandas as pd
import numpy as np
import requests, nltk, re, json, pickle
from string import ascii_lowercase, punctuation
from unicodedata import normalize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from sklearn.neural_network import MLPClassifier


In [2]:
df_noticias = pd.read_csv('noticias_all.csv')
df_noticias.shape

(16314, 2)

In [3]:
def remover_acentuacao(texto):
    return normalize('NFKD', texto).encode('ASCII', 'ignore').decode()


stopwords = requests.get('https://gist.githubusercontent.com/alopes/5358189/raw/2107d809cca6b83ce3d8e04dbd9463283025284f/stopwords.txt').text
stopwords = [remover_acentuacao(w) for w in stopwords.split()]
stopwords += ['ser', 'pode']

with open('app/data/stopwords.json', 'w') as file:
    json.dump(stopwords, file)

In [4]:

def processar_texto(texto):
    if texto is None or not texto:
        return ''

    # Trasnformação do texto em minúsculo e remoção de termo
    texto = str(texto).lower()

    # Removendo acentuação
    texto = remover_acentuacao(texto)

    # Removendo Pontuação, stopwords, palavras com número e aplicando stemming
    texto = ' '.join([c for c in nltk.word_tokenize(texto) 
                      if (c not in punctuation) 
                      and (c not in stopwords) 
                      and not (re.match(r'.*[\d_].*', c)) 
                      and len(c) > 2
                     ])

    return texto



def vetorizar_texto(textos, vectorizer=None):
    
    if not vectorizer:
        vectorizer = TfidfVectorizer(
            min_df=0.0002
        )
        
        vectorizer = vectorizer.fit(textos)
    
    
    tfidf_matrix = vectorizer.transform(textos)    
    df_words = pd.DataFrame(tfidf_matrix.todense(), columns=vectorizer.get_feature_names())
    
    return df_words, vectorizer

### Processamento do Texto

In [5]:
df_noticias['doc'] = df_noticias['noticia'].apply(processar_texto)

In [6]:
df_words, vectorizer = vetorizar_texto(df_noticias['doc'])
df_words.shape

(16314, 33747)

In [7]:
model = MLPClassifier(
    hidden_layer_sizes=(20, 30), 
    activation='relu', 
    solver='adam'
)

In [8]:
model.fit(df_words, df_noticias['target'])

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [9]:
with open('app/data/vectorizer', 'wb') as file:
    pickle.dump(vectorizer, file)

In [10]:
with open('app/data/model', 'wb') as file:
    pickle.dump(model, file)

In [65]:
df_words.to_csv('app/data/words.csv', index=False)