In [1]:
from __future__ import print_function
from time import time

import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt

import numpy as np
import sklearn
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [2]:
# Read bag_words.csv (semicolon-separated), using its first column as the row index.
bag_df = pd.read_csv('bag_words.csv', index_col=0, sep=';')

In [3]:
# Relabel the three data columns (the frame is modified in place).
bag_df.columns = ['link', 'categoria', 'texto']

In [4]:
# Preview the first rows to check the column labels and the loaded content.
bag_df.head()

Unnamed: 0,link,categoria,texto
0,http://g1.globo.com/economia/agronegocios/agro...,agro,criação peixes cativeiro brasil expansão país ...
1,http://g1.globo.com/economia/negocios/noticia/...,agro,vale anunciou manhã desta segundafeira venda a...
2,http://g1.globo.com/economia/agronegocios/noti...,agro,acordo ibge abate somou milhões cabeças maior ...
3,http://g1.globo.com/sp/piracicaba-regiao/notic...,agro,universidade paulo usp piracicabasp anunciou i...
4,http://g1.globo.com/economia/midia-e-marketing...,agro,mcdonalds saladas compostas vegetais orgânicos...


In [5]:
# Column dtypes and non-null counts. In the recorded output only 'texto' has
# missing values (694 non-null out of 705 rows).
bag_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 705 entries, 0 to 704
Data columns (total 3 columns):
link         705 non-null object
categoria    705 non-null object
texto        694 non-null object
dtypes: object(3)
memory usage: 22.0+ KB


In [6]:
# Summary statistics. In the recorded output there are fewer unique links (696)
# and texts (687) than rows, i.e. the data contains some duplicated entries.
bag_df.describe()

Unnamed: 0,link,categoria,texto
count,705,705,694
unique,696,6,687
top,http://ciencia.estadao.com.br/blogs/herton-esc...,ciencia-e-saude,desafio manequim pessoas parkinson quer consci...
freq,2,146,2


In [7]:
# Drop every row that has a missing value in any column.
bag_df_clean = bag_df.dropna()

# Fixed seed so the hold-out split is identical on every Restart & Run All;
# without random_state the rows are shuffled differently on each execution.
RANDOM_STATE = 42
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    bag_df_clean.texto, bag_df_clean.categoria, random_state=RANDOM_STATE
)

In [41]:
# Baseline: 5-fold cross-validated score of every classifier/vectorizer pairing,
# each built with its default hyper-parameters.
classifier_classes = (RandomForestClassifier, LogisticRegression, MultinomialNB)
vectorizer_classes = (CountVectorizer, TfidfVectorizer)

for Model in classifier_classes:
    for Vect in vectorizer_classes:
        # Vectorizer and classifier are chained so the vocabulary is re-fitted
        # on the training part of every fold.
        pipe = Pipeline([('vect', Vect()), ('model', Model())])
        scores = cross_val_score(
            pipe, bag_df_clean.texto, bag_df_clean.categoria, cv=5
        )
        # Report the mean score and twice the standard deviation across folds.
        summary = (Model.__name__, Vect.__name__, scores.mean(), scores.std() * 2)
        print("Accuracy %s with %s: %0.2f (+/- %0.2f)" % summary)

Accuracy RandomForestClassifier with CountVectorizer: 0.66 (+/- 0.06)
Accuracy RandomForestClassifier with TfidfVectorizer: 0.67 (+/- 0.08)
Accuracy LogisticRegression with CountVectorizer: 0.79 (+/- 0.11)
Accuracy LogisticRegression with TfidfVectorizer: 0.76 (+/- 0.12)
Accuracy MultinomialNB with CountVectorizer: 0.80 (+/- 0.15)
Accuracy MultinomialNB with TfidfVectorizer: 0.73 (+/- 0.10)


In [8]:
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', indexed by candidate (the notebook passes
        ``GridSearchCV.cv_results_``).
    n_top : int, default 3
        Highest rank to print; ranks 1 through ``n_top`` are reported.

    Every candidate tied on a rank is printed, so more than ``n_top`` entries
    may appear. Nothing is returned.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(ranks == rank):
            entry = [
                "Model with rank: {0}".format(rank),
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results['mean_test_score'][idx],
                    results['std_test_score'][idx]),
                "Parameters: {0}".format(results['params'][idx]),
                "",  # blank separator line after each candidate
            ]
            print("\n".join(entry))

In [9]:
# Hyper-parameter grids keyed by the lower-cased class name of each pipeline
# step. Keys use the '<step>__<param>' form, where the step is 'model'
# (classifier) or 'vect' (vectorizer) as named in the Pipeline.
parameters = {
    'randomforestclassifier': {
        'model__n_estimators': (10, 50, 100, 150),
        'model__criterion': ('gini', 'entropy'),
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is
        # deprecated and rejected by recent scikit-learn releases.
        'model__max_features': ('sqrt', 'log2'),
        'model__max_depth': (None, 10, 100),
    },
    'logisticregression': {
        # NOTE(review): per the scikit-learn docs, 'newton-cg', 'lbfgs' and
        # 'sag' support only the l2 penalty, so the l1 combinations with those
        # solvers cannot be fitted. Splitting this into per-solver grids needs
        # a list-of-dicts param_grid -- confirm how the search should treat them.
        'model__penalty': ('l1', 'l2'),
        'model__C': (0.1, 0.5, 1.0, 2.0),
        'model__solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag'),
    },
    'multinomialnb': {
        'model__alpha': (0.5, 1.0),
        'model__fit_prior': (True, False),
    },
    'countvectorizer': {
        'vect__analyzer': ('word', 'char_wb'),
        'vect__max_df': (0.6, 0.8, 1.0),
        # Integer 1 (the default) prunes nothing, exactly like the former
        # integer 0, which recent scikit-learn parameter validation rejects.
        'vect__min_df': (1, 0.2, 0.4),
        'vect__max_features': (100, 1000, 5000),
    },
    'tfidfvectorizer': {
        'vect__analyzer': ('word',),
        'vect__max_features': (100, 1000, 5000),
        'vect__max_df': (0.6, 0.8, 1.0),
        # See the note on min_df in the 'countvectorizer' grid: 1 == "no pruning".
        'vect__min_df': (1, 0.2, 0.4),
        'vect__norm': ('l1', 'l2'),
        'vect__use_idf': (True, False),
        'vect__smooth_idf': (True, False),
    }
}

In [10]:
# Exhaustive grid search for every classifier/vectorizer pairing.
# The combined grids are large (thousands of candidates for some pairings),
# so this cell can take a long time to finish.
for Model in (RandomForestClassifier, LogisticRegression, MultinomialNB):
    for Vect in (CountVectorizer, TfidfVectorizer):
        vect = Vect()
        model = Model()
        m_name = Model.__name__.lower()
        v_name = Vect.__name__.lower()
        # Merge into a fresh dict. Calling update() on parameters[m_name]
        # itself would permanently add the vectorizer keys to the shared grid,
        # so on a re-run of this cell Tfidf-only options (e.g. 'vect__norm')
        # would be passed to CountVectorizer and the search would fail.
        params = dict(parameters[m_name])
        params.update(parameters[v_name])

        pipe = Pipeline([('vect', vect), ('model', model)])
        grid_search = GridSearchCV(pipe, param_grid=params, n_jobs=1)
        start = time()
        grid_search.fit(bag_df_clean.texto, bag_df_clean.categoria)

        print("GridSearchCV with %s and %s took %.2f seconds for %d candidate parameter settings."
          % (Model.__name__, Vect.__name__, time() - start, len(grid_search.cv_results_['params'])))
        report(grid_search.cv_results_)
        print("")

KeyboardInterrupt: 