# Modelo

In [1]:
nomemodelo = 'Random Forest'

## Caminhos

In [2]:
import os

#Folder Inicial
path = os.getcwd()

#Subpastas
pathin = path + '\\Entrada\\'
pathfixo = path + '\\Fixo\\'
pathout = path + '\\Saida\\'
pathparcial = path + '\\Parcial\\'
pathaux = path + '\\Auxiliar\\'

## Pacotes

In [3]:
import dill
import pickle
import pandas as pd
pd.set_option('max_colwidth', 3000)

import numpy as np

from time import gmtime, strftime

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import lightgbm as lgb

from sklearn.externals import joblib

from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.utils import parallel_backend

In [4]:
import sklearn
print(sklearn.__version__)

0.20.3


## Lendo a Base

In [5]:
file = 'Treino'
treino = pd.read_pickle(pathparcial + 'Arquivo1 ' + file + '.pkl')

file = 'Teste'
teste = pd.read_pickle(pathparcial + 'Arquivo1 ' + file + '.pkl')

print(treino.shape)
print(teste.shape)

(10877, 11)
(3626, 11)


In [6]:
#Excluindo variaveis que nao serao usadas no modelo
y = 'respostafinal'
Xtr = treino[['texto', 'length', 'words', 'avg_word_length', 'min_word_length', 'max_word_length']]
colunas = list(Xtr)

with open(pathaux + 'Variaveis Modelo ' + nomemodelo + '.pickle', 'wb') as f:
    dill.dump((colunas, y), f)

Xte = teste[colunas]

print(Xtr.shape)
print(Xte.shape)

(10877, 6)
(3626, 6)


## Modelos

In [7]:
seed = 123

parameters = {
    'transformer__tfidf__max_features': [1000],
    #'transformer__tfidf__ngram_range': [(1, 1), (1, 2)],
    'transformer__tfidf__ngram_range': [(1, 1)],
    'transformer__tfidf__max_df': [.6],
    'transformer__tfidf__min_df': [20],
    'clf__n_estimators': [100, 300, 500, 700],
    'clf__min_samples_leaf': [5, 10],
    'clf__class_weight': ['balanced', None],
    'clf__random_state': [seed]
}

pipeline = Pipeline([
    ('transformer', ColumnTransformer([('tfidf', TfidfVectorizer(analyzer = 'word'), 'texto')], remainder = 'passthrough')),
    ('clf', RandomForestClassifier() )
])

cv = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 1, random_state = seed)
gs = GridSearchCV(pipeline, parameters, cv = cv, scoring = 'f1_macro', n_jobs = 3, verbose = 1, refit = True)

In [8]:
#Rodando GridSearch
print(strftime('%Y-%m-%d %H:%M:%S', gmtime()))

with parallel_backend('multiprocessing'):
    gs.fit(Xtr, treino[y])

joblib.dump(gs, pathaux + 'Modelo ' + nomemodelo + '.pkl')

print(strftime('%Y-%m-%d %H:%M:%S', gmtime()))

2019-05-12 01:32:33
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=3)]: Using backend MultiprocessingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:  1.9min finished


2019-05-12 01:34:36


In [9]:
gs.best_score_

0.42562697847331893

In [10]:
gs.best_params_

{'clf__class_weight': 'balanced',
 'clf__min_samples_leaf': 5,
 'clf__n_estimators': 300,
 'clf__random_state': 123,
 'transformer__tfidf__max_df': 0.6,
 'transformer__tfidf__max_features': 1000,
 'transformer__tfidf__min_df': 20,
 'transformer__tfidf__ngram_range': (1, 1)}

In [11]:
cvresults = pd.DataFrame(gs.cv_results_)[[
 'mean_test_score',
 'mean_train_score',                                      
 'param_clf__n_estimators',
 'param_clf__min_samples_leaf',
 'param_clf__class_weight',                                          
 'param_clf__class_weight',
 'param_transformer__tfidf__max_df',
 'param_transformer__tfidf__max_features',
 'param_transformer__tfidf__min_df',
 'param_transformer__tfidf__ngram_range',
 'std_test_score',
 'std_train_score']]
ha = list(cvresults)
ha = [w.replace('param_clf__', '') for w in ha]
ha = [w.replace('param_transformer__', '') for w in ha]
cvresults.columns = ha
cvresults.to_excel(pathout + 'Modelo ' + nomemodelo + ' Resultados GridSearch.xlsx', encoding = 'latin1', index = False)



In [12]:
cvresults

Unnamed: 0,mean_test_score,mean_train_score,n_estimators,min_samples_leaf,class_weight,class_weight.1,tfidf__max_df,tfidf__max_features,tfidf__min_df,tfidf__ngram_range,std_test_score,std_train_score
0,0.42495,0.475376,100,5,balanced,balanced,0.6,1000,20,"(1, 1)",0.017404,0.007575
1,0.425627,0.476397,300,5,balanced,balanced,0.6,1000,20,"(1, 1)",0.018132,0.007969
2,0.424411,0.476042,500,5,balanced,balanced,0.6,1000,20,"(1, 1)",0.016893,0.006994
3,0.424938,0.475624,700,5,balanced,balanced,0.6,1000,20,"(1, 1)",0.016532,0.00747
4,0.417724,0.449498,100,10,balanced,balanced,0.6,1000,20,"(1, 1)",0.015969,0.00573
5,0.419975,0.449625,300,10,balanced,balanced,0.6,1000,20,"(1, 1)",0.017581,0.00626
6,0.420953,0.450862,500,10,balanced,balanced,0.6,1000,20,"(1, 1)",0.016981,0.005501
7,0.421257,0.450734,700,10,balanced,balanced,0.6,1000,20,"(1, 1)",0.017268,0.005583
8,0.380754,0.403327,100,5,,,0.6,1000,20,"(1, 1)",0.007002,0.011779
9,0.378083,0.40233,300,5,,,0.6,1000,20,"(1, 1)",0.008852,0.011143


## Variaveis Importantes

In [13]:
def feature_names_tfidf(X, column_tfidf, **params_tfidf):
    tfidf = TfidfVectorizer(**params_tfidf)
    tfidf.fit(X[column_tfidf])
    tfidf_feature_names = {'tdidf__' + x for x in tfidf.get_feature_names()}
    feature_names = list(tfidf_feature_names) + list(X.columns.drop(column_tfidf))
    return feature_names

In [14]:
featurenames = feature_names_tfidf(X = Xtr, column_tfidf = 'texto',
    max_df = list(gs.best_params_.values())[list(gs.best_params_).index('transformer__tfidf__max_df')],
    min_df = list(gs.best_params_.values())[list(gs.best_params_).index('transformer__tfidf__min_df')],
    max_features = list(gs.best_params_.values())[list(gs.best_params_).index('transformer__tfidf__max_features')],
    ngram_range = list(gs.best_params_.values())[list(gs.best_params_).index('transformer__tfidf__ngram_range')]
)

In [15]:
def Feature_Importance(mod, nomemodelo, X, y, featurenames):
    many = ['Linear SVM', 'Logistica', 'Naive Bayes']

    k = mod.best_estimator_.named_steps['clf']

    if nomemodelo in many:
        #Logistica, SVM, Naive Bayes
        Features = pd.DataFrame(k.coef_.tolist())
        Features.columns = featurenames
        if len(y.unique()) > 2:
            Features.index = sorted(y.unique())
    else:
        #Arvores
        Features = pd.DataFrame({'features': featurenames, 'value': k.feature_importances_.tolist()})
        
    Features.to_excel(pathout + 'Features ' + nomemodelo + '.xlsx', encoding = 'latin1', index = True)
    
Feature_Importance(mod = gs, nomemodelo = nomemodelo, X = Xtr, y = treino[y], featurenames = featurenames)

## Preditos

In [16]:
predtr = gs.best_estimator_.predict(Xtr)
predte = gs.best_estimator_.predict(Xte)

In [17]:
name = 'Treino'
fim = treino[['ido', 'id', y]]
fim['pred'] = predtr

fim['Acertou'] = np.where(fim[y] == fim['pred'], 1, 0)
fim.to_excel(pathout + 'Modelo ' + nomemodelo + ' Pred ' + name + '.xlsx', encoding = 'latin1', index = False)

del fim

name = 'Teste'
fim = teste[['ido', 'id', y]]
fim['pred'] = predte

fim['Acertou'] = np.where(fim[y] == fim['pred'], 1, 0)
fim.to_excel(pathout + 'Modelo ' + nomemodelo + ' Pred ' + name + '.xlsx', encoding = 'latin1', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc