# Modelo

In [1]:
# Display name of the model under evaluation; it is embedded in the names of
# the model/variable files that are loaded and of the Excel reports written.
nomemodelo = 'Light GBM'

## Caminhos

In [2]:
import os

# Base folder: the current working directory
path = os.getcwd()

# Sub-folders. os.path.join picks the separator of the running platform, and
# the trailing '' keeps the closing separator so that file names can still be
# appended with plain string concatenation (e.g. pathout + 'file.xlsx').
pathin = os.path.join(path, 'Entrada', '')
pathfixo = os.path.join(path, 'Fixo', '')
pathout = os.path.join(path, 'Saida', '')
pathparcial = os.path.join(path, 'Parcial', '')
pathaux = os.path.join(path, 'Auxiliar', '')

## Pacotes

In [3]:
import dill
import pickle
import pandas as pd
pd.set_option('max_colwidth', 3000)

import numpy as np

from time import gmtime, strftime

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import lightgbm as lgb

from sklearn.externals import joblib

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.utils import parallel_backend

In [4]:
# Print the installed scikit-learn version so the environment that produced
# the results is recorded together with the notebook output.
import sklearn
print(sklearn.__version__)

0.20.3


## Lendo Arquivos

In [5]:
# Train and test partitions, read as pickled DataFrames from the Parcial folder
file = 'Treino'
treino = pd.read_pickle(''.join([pathparcial, 'Arquivo0 ', file, '.pkl']))

file = 'Teste'
teste = pd.read_pickle(''.join([pathparcial, 'Arquivo0 ', file, '.pkl']))

print(treino.shape, teste.shape, sep='\n')

# NOTE: joblib/dill/pickle execute arbitrary code while loading; only open
# files from the Auxiliar folder that were produced by a trusted run.
mod = joblib.load(''.join([pathaux, 'Modelo ', nomemodelo, '.pkl']))

with open(''.join([pathaux, 'Variaveis Modelo ', nomemodelo, '.pickle']), 'rb') as f:
    colunas = dill.load(f)

# First stored entry: predictor selection; second entry: target column name
varx, y = list(colunas)[:2]

(750, 58)
(250, 58)


## Variáveis Importantes

In [6]:
def Feature_Importance(mod, nomemodelo, X, y):
    """Export the feature weights of the selected estimator to Excel.

    Parameters
    ----------
    mod : fitted search object
        Must expose ``best_estimator_``.
    nomemodelo : str
        Model name. 'Linear SVM', 'Logistica' and 'Naive Bayes' are read
        through ``coef_``; any other name is read through
        ``feature_importances_``. Also used in the output file name.
    X : iterable of feature names
        Iterating it must yield one name per feature (e.g. a DataFrame,
        whose iteration yields its column names).
    y : pandas.Series
        Target values; only used to label the coefficient rows when there
        are more than two distinct classes.

    Returns
    -------
    None
        The table is written to
        ``pathout + 'Features 2 ' + nomemodelo + '.xlsx'`` (``pathout`` is a
        module-level variable).
    """
    coefficient_models = ('Linear SVM', 'Logistica', 'Naive Bayes')

    estimator = mod.best_estimator_
    feature_names = list(X)

    if nomemodelo in coefficient_models:
        # Linear models: one row of coefficients per class (a single row
        # for binary problems), one column per feature.
        table = pd.DataFrame(estimator.coef_.tolist())
        table.columns = feature_names
        classes = sorted(y.unique())
        if len(classes) > 2:
            table.index = classes
    else:
        # Tree-based models: one importance value per feature.
        table = pd.DataFrame({
            'features': feature_names,
            'value': estimator.feature_importances_.tolist(),
        })

    # The `encoding` keyword was dropped: it is not needed for .xlsx
    # writers and no longer exists in pandas >= 2.0.
    table.to_excel(pathout + 'Features 2 ' + nomemodelo + '.xlsx', index=True)
    
# Export the feature weights using the training predictors and target
Feature_Importance(
    mod=mod,
    nomemodelo=nomemodelo,
    X=treino[varx],
    y=treino[y],
)

## Preditos

In [7]:
def _observed_labels(real, predicted):
    """Return the sorted distinct values found in either of the two series."""
    combined = real.unique().tolist() + predicted.unique().tolist()
    return sorted(pd.Series(combined).unique())


# Predictions of the selected estimator, stored next to the data
treino['pred'] = mod.best_estimator_.predict(treino[varx])
teste['pred'] = mod.best_estimator_.predict(teste[varx])

# Predicted / real values for the training set
predtreino, ytreino = treino['pred'], treino[y]

# Predicted / real values for the test set
predteste, yteste = teste['pred'], teste[y]

# Classes present in the training target
label = sorted(ytreino.unique())

# Classes present in the target or in the predictions, per data set
labeltreino = _observed_labels(ytreino, predtreino)
labelteste = _observed_labels(yteste, predteste)

print(label, labeltreino, labelteste, sep='\n')

[0, 1]
[0, 1]
[0, 1]


## Resultados

In [8]:
def results(yreal, ypred, name, lab):
    """Build a table of classification metrics with one row per class.

    Parameters
    ----------
    yreal : array-like
        True target values.
    ypred : array-like
        Predicted target values.
    name : str
        Suffix appended to every metric column (e.g. 'Treino', 'Teste').
    lab : sequence
        Class labels, one per row. Must have as many entries as the
        per-class metric arrays returned by scikit-learn.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'Accuracy', per-class 'Precision'/'Recall'/'F1',
        then the 'Macro' and 'Weighted' averages of the same three metrics
        (each followed by ``name``), and finally 'label'. Accuracy and the
        averaged metrics are single numbers repeated on every row.
    """
    metrics = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )

    # float() accepts both NumPy scalars and plain Python floats, so it
    # works whichever of the two the installed scikit-learn returns.
    columns = {
        'Accuracy ' + name: float(accuracy_score(y_true=yreal, y_pred=ypred)),
    }

    # Per-class values
    for title, scorer in metrics:
        per_class = scorer(y_true=yreal, y_pred=ypred, average=None)
        columns[title + ' ' + name] = per_class.tolist()

    # Averaged values. The original filled these columns with the per-class
    # values again instead of the macro / weighted averages.
    for suffix, average in (('Macro', 'macro'), ('Weighted', 'weighted')):
        for title, scorer in metrics:
            averaged = scorer(y_true=yreal, y_pred=ypred, average=average)
            columns[title + ' ' + suffix + ' ' + name] = float(averaged)

    table = pd.DataFrame(columns)
    table['label'] = lab

    return table

# Training metrics, labelled with the classes observed in the training data
results_treino = results(yreal=ytreino, ypred=predtreino, name='Treino', lab=labeltreino)
# Test metrics: labelled with the classes observed in the TEST data, because
# the per-class metrics are computed over the test target and predictions
results_teste = results(yreal=yteste, ypred=predteste, name='Teste', lab=labelteste)

### Juntando Resultados de Treino e Teste

In [9]:
# NOTE: from here on the name `results` refers to the merged table and no
# longer to the results() function defined earlier.
# Merging on the label values (passed as arrays) produces a 'key_0' column
# plus 'label_x'/'label_y'; only the key is kept and renamed to 'label'.
results = results_treino.merge(
    results_teste,
    left_on=results_treino['label'],
    right_on=results_teste['label'],
)
results = results.drop(['label_x', 'label_y'], axis=1)
results.rename(columns={'key_0': 'label'}, inplace=True)

# One metric per row, one class per column
results = results.T

# The `encoding` keyword was dropped: it is not needed for .xlsx writers and
# no longer exists in pandas >= 2.0.
results.to_excel(pathout + 'Resultados Treino Teste ' + nomemodelo + '.xlsx', index=True)

## Matriz de Confusão

In [10]:
def matrizdeconfusao(yreal, ypred, label, name):
    """Build, export and return an annotated confusion matrix.

    Parameters
    ----------
    yreal : array-like
        True target values.
    ypred : array-like
        Predicted target values.
    label : sequence
        Class labels, in the order used for the rows and columns.
    name : str
        Data set name, appended to the output file name.

    Returns
    -------
    pandas.DataFrame
        One row per real class and one column per predicted class (zero
        counts shown as NaN), plus an 'NReal' column (row totals), an
        'NPred' row (column totals), a 'Precision' row and a 'Recall'
        column. The same table is written to
        ``pathout + 'Resultados MC ' + nomemodelo + ' ' + name + '.xlsx'``
        (``pathout`` and ``nomemodelo`` are module-level variables).
    """
    label = list(label)

    # Passing `labels` keeps the metric arrays and the matrix aligned with
    # `label` even when a class is missing from one of the inputs.
    precision = precision_score(y_true=yreal, y_pred=ypred, labels=label, average=None).tolist()
    recall = recall_score(y_true=yreal, y_pred=ypred, labels=label, average=None).tolist()

    # Confusion matrix: rows are real classes, columns are predicted classes
    cm = confusion_matrix(y_true=yreal, y_pred=ypred, labels=label)
    cm = pd.DataFrame(data=cm, index=label, columns=label)

    # Show empty cells instead of zeros
    cm = cm.replace(0, np.nan)

    # Total of real observations per class
    cm['NReal'] = cm.sum(axis=1)

    # Total of predictions per class (and the grand total under 'NReal')
    predicted_totals = pd.DataFrame(cm.sum(axis=0)).transpose()
    predicted_totals.index = ['NPred']

    # Precision per predicted class; it has no value under 'NReal'
    precision_row = pd.DataFrame([precision], index=['Precision'], columns=label)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    cm = pd.concat([cm, predicted_totals, precision_row], sort=False)

    # Recall per real class. Every class is already a row of `cm`, so a left
    # join yields the same cells as the previous outer merge while keeping
    # the row order (classes, NPred, Precision) and avoiding a sort of the
    # mixed-type index.
    recall_column = pd.DataFrame({'Recall': recall}, index=label)
    cm = cm.join(recall_column, how='left')

    # The `encoding` keyword was dropped: it is not needed for .xlsx
    # writers and no longer exists in pandas >= 2.0.
    cm.to_excel(pathout + 'Resultados MC ' + nomemodelo + ' ' + name + '.xlsx', index=True)

    return cm

# Confusion matrix for the training set
cmtr = matrizdeconfusao(
    yreal=ytreino,
    ypred=predtreino,
    label=labeltreino,
    name='Treino',
)
# Confusion matrix for the test set
cmte = matrizdeconfusao(
    yreal=yteste,
    ypred=predteste,
    label=labelteste,
    name='Teste',
)

  result = result.union(other)
  return this.join(other, how=how, return_indexers=return_indexers)


In [11]:
# Display the annotated confusion matrix of the training set
cmtr

Unnamed: 0,0,1,NReal,Recall
0,502.0,23.0,525.0,0.95619
1,71.0,154.0,225.0,0.684444
NPred,573.0,177.0,750.0,
Precision,0.876091,0.870056,,


In [12]:
# Display the annotated confusion matrix of the test set
cmte

Unnamed: 0,0,1,NReal,Recall
0,157.0,18.0,175.0,0.897143
1,44.0,31.0,75.0,0.413333
NPred,201.0,49.0,250.0,
Precision,0.781095,0.632653,,
