https://www.youtube.com/watch?v=eRtwENJae0c

In [83]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

import os

## **Carregando os Dados**

In [5]:
dados = sns.load_dataset('iris')
dados.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
dados['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

## **Separando as Bases**

In [19]:
X = dados.drop(columns=['species'], axis=1)
y = dados['species']

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

## **Criando o Objeto com o Classificador LightGBM**

In [22]:
classificador_lgbm = lgb.LGBMClassifier(learning_rate=0.2, max_depth=2)
type(classificador_lgbm)

lightgbm.sklearn.LGBMClassifier

In [None]:
from sklearn.model_selection import cross_val_score

cv_lgbm = cross_val_score(classificador_lgbm, Xtrain, ytrain).mean() * 100

In [40]:
print(cv_lgbm)
# 94.58498023715414

94.58498023715414


## **Alterando para Bagging - RandomForest**

In [None]:
# bagging_fraction=0.80 -> pegando 80% da Base de treinamento em cada árvore (em votação)
# bagging_freq=1 -> a cada interação irá pegar novos dados destes 80%
classsificador_lgbm_rf = lgb.LGBMClassifier(boosting_type='rf', bagging_freq=1, bagging_fraction=0.8)

cv_lgbm_rf = cross_val_score(classsificador_lgbm_rf, Xtrain, ytrain).mean() * 100

In [41]:
print(cv_lgbm_rf)
# 92.80632411067194

92.80632411067194


In [None]:
# dart = Dropouts Meet Multiple Additive Regression Trees
# goss = Gradient-based On-Side Sampling
# rf   = Random Forest

classsificador_lgbm_rf_dart = lgb.LGBMClassifier(boosting_type='dart')

cv_lgbm_dart = cross_val_score(classsificador_lgbm_rf_dart, Xtrain, ytrain).mean() * 100

In [39]:
print(cv_lgbm_dart)
# 94.62450592885375

94.62450592885375


## **Testar de acordo com o Melhor Classificador**

In [None]:
melhor_cv = max(cv_lgbm, cv_lgbm_rf, cv_lgbm_dart)

if (melhor_cv == cv_lgbm):
    classificador_final = classificador_lgbm
elif (melhor_cv == cv_lgbm_rf):
    classificador_final = classsificador_lgbm_rf
else:
    classificador_final = classsificador_lgbm_rf_dart
    
classificador_final.fit(Xtrain, ytrain)

predicoes = classificador_final.predict(Xtest)

In [38]:
print(predicoes[: 10])

['versicolor' 'setosa' 'virginica' 'versicolor' 'versicolor' 'setosa'
 'versicolor' 'virginica' 'versicolor' 'versicolor']


In [37]:
# Calculando o Número de Acertos
print('Acertos:', (predicoes == ytest).sum() / len(ytest) * 100)

Acertos: 100.0


## **Vamos para Regressão ?**

Vamos tentar prever o consumo dos carros

In [42]:
dados = sns.load_dataset('mpg')
dados.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [43]:
dados.shape

(398, 9)

In [62]:
dados.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [44]:
# mpg = Miles Per Galon -> Kilômetros por Litro
X = dados.drop(columns=['mpg', 'origin', 'name'], axis=1)
y = dados['mpg']

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

In [51]:
metrics._scorer._SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'matthews_corrcoef', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_root_mean_squared_log_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'd2_absolute_error_score', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'positive_likelihood_ratio', 'neg_negative_likelihood_ratio', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_w

In [None]:
regressor_lgbm = lgb.LGBMRegressor()

cv_lgbm = cross_val_score(regressor_lgbm, Xtrain, ytrain, scoring='neg_root_mean_squared_error').mean()

In [77]:
print(cv_lgbm)

-3.0602968789895


In [None]:
regressor_lgbm_dart = lgb.LGBMRegressor(boosting_type='dart')
cv_lgbm_dart = cross_val_score(regressor_lgbm_dart, Xtrain, ytrain, scoring='neg_root_mean_squared_error').mean()

In [79]:
print(cv_lgbm_dart)

-3.8958190536698334


## **Melhor Modelo**

In [None]:
regressor_lgbm.fit(Xtrain, ytrain, eval_metric='root_mean_squared_error')

predicoes = regressor_lgbm.predict(Xtest)

In [86]:
predicoes[:10]

array([32.62089047, 30.47819635, 21.1453571 , 15.63407173, 13.12103143,
       24.96860052, 25.72020663, 12.72353355, 17.82671692, 19.06759737])

In [87]:
ytest[:10]

198    33.0
396    28.0
33     19.0
208    13.0
93     14.0
84     27.0
373    24.0
94     13.0
222    17.0
126    21.0
Name: mpg, dtype: float64

In [90]:

mse = (mean_squared_error(ytest, predicoes))

In [92]:
import math

rmse = math.sqrt(mse)
print(rmse)

2.526606943873821
