# Modelo

In [1]:
nomemodelo = 'Linear SVM'

## Caminhos

In [2]:
import os

#Folder Inicial
path = os.getcwd()

#Subpastas
pathin = path + '\\Entrada\\'
pathfixo = path + '\\Fixo\\'
pathout = path + '\\Saida\\'
pathparcial = path + '\\Parcial\\'
pathaux = path + '\\Auxiliar\\'

## Pacotes

In [3]:
import dill
import pickle
import pandas as pd
pd.set_option('max_colwidth', 3000)

import numpy as np

from time import gmtime, strftime

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import lightgbm as lgb

from sklearn.externals import joblib

from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.utils import parallel_backend

In [4]:
import sklearn
print(sklearn.__version__)

0.20.3


## Lendo a Base

In [5]:
file = 'Treino'
treino = pd.read_pickle(pathparcial + 'Arquivo0 ' + file + '.pkl')

file = 'Teste'
teste = pd.read_pickle(pathparcial + 'Arquivo0 ' + file + '.pkl')

print(treino.shape)
print(teste.shape)

(750, 58)
(250, 58)


In [6]:
#Excluindo variaveis que nao serao usadas no modelo
y = 'default'
id = 'id'

Xtr = treino.drop([y, id], axis = 1)
colunas = list(Xtr)

with open(pathaux + 'Variaveis Modelo ' + nomemodelo + '.pickle', 'wb') as f:
    dill.dump((colunas, y), f)

Xte = teste[colunas]

print(Xtr.shape)
print(Xte.shape)

(750, 56)
(250, 56)


## Modelos

In [7]:
nomemodelo = 'Linear SVM'

seed = 123

parameters = {
    #'clf__max_iter': [2000],
    'C': [.01, .1, 1, 10],
    'penalty': ['l2'],
    'fit_intercept': [True],
    'class_weight': ['balanced', None],
    'random_state': [seed]
}

estimator = LinearSVC()

cv = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 1, random_state = seed)

gs = GridSearchCV(estimator = estimator, param_grid = parameters, cv = cv, scoring = 'f1_macro', 
                  n_jobs = 3, verbose = 1, refit = True)

In [8]:
#Rodando GridSearch
print(strftime('%Y-%m-%d %H:%M:%S', gmtime()))

with parallel_backend('multiprocessing'):
    gs.fit(Xtr, treino[y])

joblib.dump(gs, pathaux + 'Modelo ' + nomemodelo + '.pkl')

print(strftime('%Y-%m-%d %H:%M:%S', gmtime()))

2019-05-12 01:40:50
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=3)]: Using backend MultiprocessingBackend with 3 concurrent workers.


2019-05-12 01:40:59


[Parallel(n_jobs=3)]: Done  24 out of  24 | elapsed:    8.5s finished


In [9]:
gs.best_score_

0.6998301880506952

In [10]:
gs.best_params_

{'C': 0.1,
 'class_weight': 'balanced',
 'fit_intercept': True,
 'penalty': 'l2',
 'random_state': 123}

In [11]:
cvresults = pd.DataFrame(gs.cv_results_)[[
 'mean_test_score',
 'mean_train_score',
 'param_C',
 'param_penalty',
 'param_fit_intercept',
 'param_class_weight',
 'std_test_score',
 'std_train_score']]
ha = list(cvresults)
ha = [w.replace('param_', '') for w in ha]
cvresults.columns = ha
cvresults.to_excel(pathout + 'Modelo ' + nomemodelo + ' Resultados GridSearch.xlsx', encoding = 'latin1', index = False)



In [12]:
cvresults

Unnamed: 0,mean_test_score,mean_train_score,C,penalty,fit_intercept,class_weight,std_test_score,std_train_score
0,0.695586,0.727624,0.01,l2,True,balanced,0.019451,0.013017
1,0.647596,0.694623,0.01,l2,True,,0.024704,0.018519
2,0.69983,0.740016,0.1,l2,True,balanced,0.019543,0.00759
3,0.654361,0.727531,0.1,l2,True,,0.004032,0.014526
4,0.690007,0.739208,1.0,l2,True,balanced,0.018238,0.010595
5,0.654064,0.729398,1.0,l2,True,,0.003503,0.016292
6,0.692674,0.745382,10.0,l2,True,balanced,0.019279,0.007251
7,0.666119,0.72235,10.0,l2,True,,0.011068,0.013394


## Variaveis Importantes

In [13]:
def Feature_Importance(mod, nomemodelo, X, y):
    many = ['Linear SVM', 'Logistica', 'Naive Bayes']

    k = mod.best_estimator_
    featurenames = list(X)
    
    if nomemodelo in many:
        #Logistica, SVM, Naive Bayes
        Features = pd.DataFrame(k.coef_.tolist())
        Features.columns = featurenames
        if len(y.unique()) > 2:
            Features.index = sorted(y.unique())
    else:
        #Arvores
        Features = pd.DataFrame({'features': featurenames, 'value': k.feature_importances_.tolist()})
        
    Features.to_excel(pathout + 'Features ' + nomemodelo + '.xlsx', encoding = 'latin1', index = True)
    
Feature_Importance(mod = gs, nomemodelo = nomemodelo, X = Xtr, y = treino[y])

## Preditos

In [14]:
predtr = gs.best_estimator_.predict(Xtr)
predte = gs.best_estimator_.predict(Xte)

In [15]:
name = 'Treino'
fim = treino[[id, y]]
fim['pred'] = predtr

fim['Acertou'] = np.where(fim[y] == fim['pred'], 1, 0)
fim.to_excel(pathout + 'Modelo ' + nomemodelo + ' Pred ' + name + '.xlsx', encoding = 'latin1', index = False)

del fim

name = 'Teste'
fim = teste[[id, y]]
fim['pred'] = predte

fim['Acertou'] = np.where(fim[y] == fim['pred'], 1, 0)
fim.to_excel(pathout + 'Modelo ' + nomemodelo + ' Pred ' + name + '.xlsx', encoding = 'latin1', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc