# Modelo

In [1]:
nomemodelo = 'Light GBM'

## Caminhos

In [2]:
import os

#Folder Inicial
path = os.getcwd()

#Subpastas
pathin = path + '\\Entrada\\'
pathfixo = path + '\\Fixo\\'
pathout = path + '\\Saida\\'
pathparcial = path + '\\Parcial\\'
pathaux = path + '\\Auxiliar\\'

## Pacotes

In [3]:
import dill
import pickle
import pandas as pd
pd.set_option('max_colwidth', 3000)

import numpy as np

from time import gmtime, strftime

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import lightgbm as lgb

from sklearn.externals import joblib

from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.utils import parallel_backend

In [4]:
import sklearn
print(sklearn.__version__)

0.20.3


## Lendo a Base

In [5]:
file = 'Treino'
treino = pd.read_pickle(pathparcial + 'Arquivo0 ' + file + '.pkl')

file = 'Teste'
teste = pd.read_pickle(pathparcial + 'Arquivo0 ' + file + '.pkl')

print(treino.shape)
print(teste.shape)

(750, 58)
(250, 58)


In [6]:
#Excluindo variaveis que nao serao usadas no modelo
y = 'default'
id = 'id'

Xtr = treino.drop([y, id], axis = 1)
colunas = list(Xtr)

with open(pathaux + 'Variaveis Modelo ' + nomemodelo + '.pickle', 'wb') as f:
    dill.dump((colunas, y), f)

Xte = teste[colunas]

print(Xtr.shape)
print(Xte.shape)

(750, 56)
(250, 56)


## Modelos

In [7]:
nomemodelo = 'Light GBM'

seed = 123

parameters = {
    'max_depth': [4, 5],
    'bagging_fraction': [.7],
    'min_data_leaf': [20],
    'feature_fraction': [.7,],
    'boosting_type': ['gbdt'],
    'num_boost_round': [300, 500, 700],
    'learning_rate': [.01],
    'min_child_smaples': [20],
    'class_weight': ['balanced', None],
    'random_state': [seed]
}

estimator = lgb.LGBMClassifier(seed = seed)

cv = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 1, random_state = seed)

gs = GridSearchCV(estimator = estimator, param_grid = parameters, cv = cv, scoring = 'f1_macro', 
                  n_jobs = 3, verbose = 1, refit = True)

In [8]:
#Rodando GridSearch
print(strftime('%Y-%m-%d %H:%M:%S', gmtime()))

with parallel_backend('multiprocessing'):
    gs.fit(Xtr, treino[y])

joblib.dump(gs, pathaux + 'Modelo ' + nomemodelo + '.pkl')

print(strftime('%Y-%m-%d %H:%M:%S', gmtime()))

2019-05-12 01:40:46
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=3)]: Using backend MultiprocessingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  36 out of  36 | elapsed:   13.2s finished


2019-05-12 01:41:00


In [9]:
gs.best_score_

0.7037090728864359

In [10]:
gs.best_params_

{'bagging_fraction': 0.7,
 'boosting_type': 'gbdt',
 'class_weight': None,
 'feature_fraction': 0.7,
 'learning_rate': 0.01,
 'max_depth': 5,
 'min_child_smaples': 20,
 'min_data_leaf': 20,
 'num_boost_round': 500,
 'random_state': 123}

In [11]:
cvresults = pd.DataFrame(gs.cv_results_)[[
 'mean_test_score',
 'mean_train_score',
 'param_max_depth',
 'param_bagging_fraction',
 'param_min_data_leaf',
 'param_feature_fraction',
 'param_boosting_type',
 'param_num_boost_round',
 'param_learning_rate',
 'param_min_child_smaples',
 'param_class_weight',
 'std_test_score',
 'std_train_score']]
ha = list(cvresults)
ha = [w.replace('param_', '') for w in ha]
cvresults.columns = ha
cvresults.to_excel(pathout + 'Modelo ' + nomemodelo + ' Resultados GridSearch.xlsx', encoding = 'latin1', index = False)



In [12]:
cvresults

Unnamed: 0,mean_test_score,mean_train_score,max_depth,bagging_fraction,min_data_leaf,feature_fraction,boosting_type,num_boost_round,learning_rate,min_child_smaples,class_weight,std_test_score,std_train_score
0,0.665104,0.791616,4,0.7,20,0.7,gbdt,300,0.01,20,balanced,0.009623,0.016967
1,0.687258,0.838268,4,0.7,20,0.7,gbdt,500,0.01,20,balanced,0.016595,0.004936
2,0.697053,0.865641,4,0.7,20,0.7,gbdt,700,0.01,20,balanced,0.021541,0.00653
3,0.673695,0.822748,5,0.7,20,0.7,gbdt,300,0.01,20,balanced,0.015081,0.018366
4,0.699821,0.866127,5,0.7,20,0.7,gbdt,500,0.01,20,balanced,0.016228,0.000694
5,0.702708,0.899119,5,0.7,20,0.7,gbdt,700,0.01,20,balanced,0.010361,0.009486
6,0.669963,0.791664,4,0.7,20,0.7,gbdt,300,0.01,20,,0.003336,0.002636
7,0.69715,0.837987,4,0.7,20,0.7,gbdt,500,0.01,20,,0.019199,0.013664
8,0.68499,0.86421,4,0.7,20,0.7,gbdt,700,0.01,20,,0.027419,0.018628
9,0.687023,0.8165,5,0.7,20,0.7,gbdt,300,0.01,20,,0.011972,0.013548


## Variaveis Importantes

In [13]:
def Feature_Importance(mod, nomemodelo, X, y):
    many = ['Linear SVM', 'Logistica', 'Naive Bayes']

    k = mod.best_estimator_
    featurenames = list(X)
    
    if nomemodelo in many:
        #Logistica, SVM, Naive Bayes
        Features = pd.DataFrame(k.coef_.tolist())
        Features.columns = featurenames
        if len(y.unique()) > 2:
            Features.index = sorted(y.unique())
    else:
        #Arvores
        Features = pd.DataFrame({'features': featurenames, 'value': k.feature_importances_.tolist()})
        
    Features.to_excel(pathout + 'Features ' + nomemodelo + '.xlsx', encoding = 'latin1', index = True)
    
Feature_Importance(mod = gs, nomemodelo = nomemodelo, X = Xtr, y = treino[y])

## Preditos

In [14]:
predtr = gs.best_estimator_.predict(Xtr)
predte = gs.best_estimator_.predict(Xte)

In [15]:
name = 'Treino'
fim = treino[[id, y]]
fim['pred'] = predtr

fim['Acertou'] = np.where(fim[y] == fim['pred'], 1, 0)
fim.to_excel(pathout + 'Modelo ' + nomemodelo + ' Pred ' + name + '.xlsx', encoding = 'latin1', index = False)

del fim

name = 'Teste'
fim = teste[[id, y]]
fim['pred'] = predte

fim['Acertou'] = np.where(fim[y] == fim['pred'], 1, 0)
fim.to_excel(pathout + 'Modelo ' + nomemodelo + ' Pred ' + name + '.xlsx', encoding = 'latin1', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc