In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn import datasets
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [2]:
# VAMOS DE REGRESSÃO.
# Prever qual o consumo

df_mpg = sns.load_dataset('mpg')

df_mpg.head(10)

#df_mpg.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
5,15.0,8,429.0,198.0,4341,10.0,70,usa,ford galaxie 500
6,14.0,8,454.0,220.0,4354,9.0,70,usa,chevrolet impala
7,14.0,8,440.0,215.0,4312,8.5,70,usa,plymouth fury iii
8,14.0,8,455.0,225.0,4425,10.0,70,usa,pontiac catalina
9,15.0,8,390.0,190.0,3850,8.5,70,usa,amc ambassador dpl


In [3]:
df_mpg.shape

(398, 9)

In [4]:
# Tranformando categoricos em numeros
LabelEncoder2 = LabelEncoder()

transform2 = df_mpg.select_dtypes(include="object")

for i in transform2:
  if i != "class":
    df_mpg[i] = LabelEncoder2.fit_transform(df_mpg[i])

In [5]:
previsoes = df_mpg.iloc[:, 0:9].values
classe = df_mpg.iloc[:, 0].values

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
  previsoes,
  classe,
  random_state= 42
)

In [7]:
# Criando o regressor

regressor_xgb = xgb.XGBRFRegressor()
regressor_xgb

In [8]:
# O SCORING do sklearn tenta maximixar, quanti maior melhor, mas no caso de um erro, quanto maior, pior. Daí negativamos com neg o root_mean_squared_error. E quando vc quer o minimizador, é só fazer ao contrário

# O Root Mean Squared Error (RMSE), ou erro quadrático médio raiz, é uma métrica de avaliação de modelos de regressão. Ele mede o quanto os valores previstos pelo modelo estão distantes dos valores reais. O RMSE é útil para entender o desempenho de um modelo, pois representa o erro médio das previsões em relação aos valores reais, em suas unidades originais.

cross_val = cross_val_score(regressor_xgb, X_train, y_train, scoring='neg_root_mean_squared_error').mean()

cross_val

-0.36044985352500514

In [9]:
regressor_xgb = xgb.XGBRegressor(max_depth=2)

cross_val_score(regressor_xgb, X_train, y_train, scoring='neg_root_mean_squared_error').mean()

-0.3805082443921361

In [10]:
regressor_xgb_gblinear = xgb.XGBRegressor(booster='gblinear')

cross_val_score(regressor_xgb_gblinear, X_train, y_train, scoring='neg_root_mean_squared_error').mean()

-1.5464582110972858

In [11]:
regressor_xgb_dart = xgb.XGBRegressor(booster='dart')

cross_val_score(regressor_xgb_dart, X_train, y_train, scoring='neg_root_mean_squared_error').mean()

-0.3126683087675536

In [12]:
# O que teve melhor desempenho foi o regressor padrão.
# O eval_metric é o evaluation root_mean_squarred_error
# Passar eval_metric='rmsle' (Root Mean Squared Logarithmic Error) como parâmetro ao XGBRegressor indica que o modelo utilizará essa métrica de avaliação para medir o erro entre as previsões e os valores reais.

regressor_melhor = xgb.XGBRegressor(eval_metric='rmsle')
regressor_melhor.fit(X_train, y_train )

In [13]:
predict = regressor_melhor.predict(X_test)

predict[:10]

array([33.18965 , 28.022264, 18.990038, 13.006941, 13.977067, 27.001446,
       24.052969, 12.987987, 16.910759, 21.031271], dtype=float32)

In [14]:
y_test[:10]

array([33., 28., 19., 13., 14., 27., 24., 13., 17., 21.])

In [15]:
df_mpg.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,2,49
1,15.0,8,350.0,165.0,3693,11.5,70,2,36
2,18.0,8,318.0,150.0,3436,11.0,70,2,231
3,16.0,8,304.0,150.0,3433,12.0,70,2,14
4,17.0,8,302.0,140.0,3449,10.5,70,2,161
5,15.0,8,429.0,198.0,4341,10.0,70,2,141
6,14.0,8,454.0,220.0,4354,9.0,70,2,54
7,14.0,8,440.0,215.0,4312,8.5,70,2,223
8,14.0,8,455.0,225.0,4425,10.0,70,2,241
9,15.0,8,390.0,190.0,3850,8.5,70,2,1


In [16]:
# Calculando o erro das previsões com os valores reais

from sklearn.metrics import mean_squared_error
import math

mse = mean_squared_error(y_test, predict)
display(mse)

rmse = math.sqrt(mse)
rmse

0.022963681672768858

0.15153772359636677

In [17]:
df_mpg02 = pd.read_excel('Book1.xlsx')

df_mpg02

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,10.0,8,100,500,200,12,70,usa,chevet
1,10.0,8,100,500,200,12,70,usa,chevet
2,10.0,8,100,800,200,11,70,usa,asdf
3,10.0,8,100,800,200,12,70,usa,asdf
4,10.0,8,398,800,700,13,70,usa,asdf
5,10.0,8,450,800,700,13,70,usa,xcvb
6,10.0,8,460,250,700,15,70,usa,xcvb
7,10.0,8,490,250,700,11,70,usa,xcvb
8,20.0,8,800,250,700,14,70,usa,camaro
9,20.0,8,800,250,700,11,70,usa,camaro


In [18]:
LabelEncoder03 = LabelEncoder()

colunas = df_mpg02.select_dtypes(include='object')

for i in colunas.columns:
  if i != 'class':
    df_mpg02[i] = LabelEncoder03.fit_transform(df_mpg02[i])



In [19]:


regressor_melhor.predict(df_mpg02)

array([10.077501, 10.077501, 10.077501, 10.077501, 10.06398 , 10.062695,
       10.059215, 10.066149, 19.979881, 19.98052 , 20.035267, 20.024487,
       20.022062, 19.95509 , 29.897255, 29.836817, 29.83383 , 29.790865,
       29.87832 , 46.58368 ], dtype=float32)