# **Gradient Boosting com XGBoost**

## Bibliotecas

In [1]:
# Libs
import math
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Base de Dados

In [2]:
# Data: seaborn's example iris dataset
df_iris = sns.load_dataset('iris')
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
df_iris.shape

(150, 5)

In [4]:
df_iris['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [5]:
# Replace the species names with integer codes; `rotulos` keeps the original
# names in code order so predictions can be mapped back to labels.
# NOTE(review): the column is overwritten in place, so re-running this cell
# factorizes the integer codes and `rotulos` no longer holds the species names.
catg, rotulos = pd.factorize(df_iris['species'])
df_iris['species'] = catg
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Separando a base de dados

In [6]:
# Target (y) is the encoded species; features (X) are every other column
y_iris = df_iris['species']
X_iris = df_iris.drop(columns=['species'])

In [7]:
# Hold out a test split with the default proportions and a fixed seed
X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(
    X_iris,
    y_iris,
    random_state=42,
)

## Criando o Classificador XGBoost

In [8]:
classificador_xgb = xgb.XGBClassifier()
type(classificador_xgb)

xgboost.sklearn.XGBClassifier

In [9]:
def ResultadoCV(classificador, X, y, **kwargs):
    """Return the mean cross-validation score of an estimator.

    Args:
        classificador: Estimator passed to ``cross_val_score``.
        X: Feature data passed to ``cross_val_score``.
        y: Target data passed to ``cross_val_score``.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``cross_val_score`` (for example ``scoring``).

    Returns:
        The mean of the scores returned by ``cross_val_score``.
    """
    pontuacoes = cross_val_score(classificador, X, y, **kwargs)
    return pontuacoes.mean()

In [10]:
# Result: mean cross-validation score of the default classifier on the training split
resultado_cv = ResultadoCV(classificador_xgb, X_iris_train, y_iris_train)
resultado_cv

0.9280632411067196

## Pequena tunagem!

In [11]:
# The classifier before fitting: no data has been seen yet, so `objective`
# still shows the value the estimator was constructed with.
clf = xgb.XGBClassifier()
clf.objective

'binary:logistic'

In [12]:
# Training: fit a fresh classifier on the three-class training split and
# inspect `objective` again to see what it is after fitting.
clf = xgb.XGBClassifier().fit(X_iris_train, y_iris_train)
clf.objective

'multi:softprob'

In [13]:
classificador_xgb_tunado = xgb.XGBClassifier(max_depth=2)
ResultadoCV(classificador_xgb_tunado, X_iris_train, y_iris_train)

0.9193675889328062

## Tipos de Booster

In [14]:
classificador_xgb_dart = xgb.XGBClassifier(booster='dart')
ResultadoCV(classificador_xgb_dart, X_iris_train, y_iris_train)

0.9280632411067196

In [15]:
# Linear booster instead of tree boosters.
# NOTE(review): XGBoost documents gblinear's default `shotgun` updater as
# nondeterministic; if this score must be reproducible across runs, consider
# updater='coord_descent' — confirm against the installed xgboost version.
classificador_xgb_linear = xgb.XGBClassifier(booster='gblinear')
ResultadoCV(classificador_xgb_linear, X_iris_train, y_iris_train)

0.9462450592885375

In [16]:
# Readability alias: this binds a second name to the same estimator object
# (no copy), so fitting `classificador_campeao` also fits `classificador_xgb_linear`.
classificador_campeao = classificador_xgb_linear

In [17]:
# Train the selected model on the training split
classificador_campeao.fit(X_iris_train, y_iris_train)

In [18]:
predicoes_iris = classificador_campeao.predict(X_iris_test)
predicoes_iris[:10]

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1], dtype=int64)

In [19]:
# Count the correct predictions on the test split
(predicoes_iris == y_iris_test).sum()

38

In [20]:
# Hit rate (accuracy) on the test split, expressed as a percentage
total_observacoes = len(y_iris_test)
acertos = (predicoes_iris == y_iris_test).sum()
taxa_acerto = acertos * 100 / total_observacoes
print(taxa_acerto)

100.0


## Regressão com XGBoost

In [21]:
# Data: seaborn's example mpg dataset
df_mpg = sns.load_dataset('mpg')
df_mpg.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [22]:
df_mpg.shape

(398, 9)

In [23]:
df_mpg.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [24]:
# X and y: the target `mpg` and the text columns `origin` and `name` are
# dropped, leaving only the numeric predictors.
# NOTE(review): `horsepower` has 392 non-null values out of 398 rows (see the
# describe() output above), so X_mpg carries 6 NaNs. Tree boosters handle
# missing values natively, but the XGBoost FAQ says gblinear treats them as
# zeros — confirm before comparing the linear booster against the tree ones.
X_mpg = df_mpg.drop(['mpg', 'origin', 'name'], axis=1)
y_mpg = df_mpg['mpg']
X_mpg

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year
0,8,307.0,130.0,3504,12.0,70
1,8,350.0,165.0,3693,11.5,70
2,8,318.0,150.0,3436,11.0,70
3,8,304.0,150.0,3433,12.0,70
4,8,302.0,140.0,3449,10.5,70
...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82
394,4,97.0,52.0,2130,24.6,82
395,4,135.0,84.0,2295,11.6,82
396,4,120.0,79.0,2625,18.6,82


In [25]:
# Hold out a test split (default proportions, fixed seed) and check the shapes
X_mpg_train, X_mpg_test, y_mpg_train, y_mpg_test = train_test_split(
    X_mpg,
    y_mpg,
    random_state=42,
)
tuple(parte.shape for parte in (X_mpg_train, X_mpg_test, y_mpg_train, y_mpg_test))

((298, 6), (100, 6), (298,), (100,))

### Treinando

In [26]:
clf = xgb.XGBRegressor()
clf.objective

'reg:squarederror'

In [27]:
# Baseline regressor with default hyper-parameters.
# The scorer reports the negated RMSE, so values closer to zero are better.
regressor_xgb = xgb.XGBRegressor()
ResultadoCV(
    regressor_xgb,
    X_mpg_train,
    y_mpg_train,
    scoring='neg_root_mean_squared_error',
)

-3.165568384433206

In [28]:
# Shallower trees (max_depth=2), scored with the same negated-RMSE metric
regressor_xgb_tunado = xgb.XGBRegressor(max_depth=2)
ResultadoCV(
    regressor_xgb_tunado,
    X_mpg_train,
    y_mpg_train,
    scoring='neg_root_mean_squared_error',
)

-3.2382354328910514

In [29]:
# Linear booster, scored with the same negated-RMSE metric
regressor_xgb_linear = xgb.XGBRegressor(booster='gblinear')
ResultadoCV(
    regressor_xgb_linear,
    X_mpg_train,
    y_mpg_train,
    scoring='neg_root_mean_squared_error',
)

-4.272957476255639

In [30]:
# Champion regressor: the default tree booster had the best (least negative)
# cross-validated RMSE of the three candidates, so a regressor with default
# hyper-parameters is fitted on the training split.
# NOTE(review): no eval_set is passed to fit(), so eval_metric="rmse" presumably
# has no effect on the fitted model — confirm against the xgboost docs.
regresso_campeao = xgb.XGBRegressor(eval_metric="rmse")
regresso_campeao.fit(X_mpg_train, y_mpg_train)

In [31]:
predicoes_mpg = regresso_campeao.predict(X_mpg_test)
predicoes_mpg[:10]

array([29.7056  , 30.87353 , 21.187384, 15.274967, 13.182645, 26.038195,
       28.141481, 12.008284, 17.807064, 17.626806], dtype=float32)

In [32]:
# Mean squared error of the predictions on the held-out test split
mse = mean_squared_error(y_mpg_test, predicoes_mpg)
print(mse)

8.002868120652916


In [33]:
# RMSE is the square root of the MSE, which puts the error back in the
# same units as the target (mpg)
rmse = math.sqrt(mse)
rmse

2.8289340962017686