In [49]:
%store -r df_final

In [50]:
!pip install xgboost
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import *
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler
import numpy as np



In [51]:
from sklearn.model_selection import StratifiedShuffleSplit

X = df_final
y = df_final['sku_encoded']

# Criar um objeto StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)

# Obter os índices para a divisão
for indice_a, indice_b in sss.split(X, y):
    dados_a = df_final.iloc[indice_a]
    dados_b = df_final.iloc[indice_b]


In [52]:
featuresModels = ['ano', 'semana_do_ano', 'unit_price',
       'sku_height', 'sku_width', 'sku_length', 'sku_weight',
       'avg_website_visits_last_week', 'supplier_delivery_time',
       'shipment_type_crossdocking', 'shipment_type_próprio',
       'product_department_Cama e Banho', 'product_department_Decoração',
       'product_department_Gamer', 'product_department_Infantil',
       'product_department_Keva', 'product_department_Móveis',
       'origin_country_Importado', 'origin_country_Nacional',
       'process_costing_no', 'process_costing_yes',
       'flag_bundle_SKU vendido em conjunto ou sozinho',
       'flag_bundle_SKU vendido somente sozinho', 'has_stock_não',
       'has_stock_tem', 'color_encoded', 'sku_encoded',
       'anchor_category_encoded', 'product_category_encoded',
       'dollar_quotation']

In [53]:
dados_a.reset_index(inplace=True, drop=True)

In [54]:
#Criando um dataframe a partir do df_final, agora readaptado para as features selecionadas
df_parameters = dados_a.drop(columns=df_final.columns.difference(featuresModels))

# Separando as variáveis independentes e dependentes (X e Y)
X_a = df_parameters
y_a = dados_a['items_sold']

# Dividindo o conjunto de dados em conjuntos de treinamento e teste
X_a_train, X_a_test, y_a_train, y_a_test = train_test_split(X_a, y_a, test_size=0.2, random_state=95)


In [55]:
# Instanciando o modelo com os hiperparâmetros definidos
xgb_reg = XGBRegressor(random_state=42)
xgb_reg.fit(X_a_train, y_a_train)

rf = RandomForestRegressor(random_state=42)
rf.fit(X_a_train, y_a_train)

gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_a_train, y_a_train)


In [56]:
xgb_pred = xgb_reg.predict(X_a_test)
rf_pred = rf.predict(X_a_test)
gbr_pred = gbr.predict(X_a_test)

In [57]:
# Combinando as predições
X_test_meta = np.column_stack((xgb_pred, rf_pred, gbr_pred))

# Treinando o modelo meta
meta_model = RandomForestRegressor()
meta_model.fit(X_test_meta, y_a_test)

In [58]:
scorer = make_scorer(mean_squared_error, greater_is_better=False)
rfe = RFECV(meta_model, step=1, cv=5, scoring=scorer, n_jobs=-1)
rfe = rfe.fit(X_test_meta, y_a_test)
print("Feature ranking: ", rfe.ranking_)

Feature ranking:  [1 1 1]


In [59]:
mask = rfe.get_support()

# Convertendo a mascara para um array de numpy para indexação
mask = np.array(mask)

# Usando indexação booleana para pegar as features selecionadas
best_features = [feature for i, feature in enumerate(featuresModels) if i < len(mask) and mask[i]]

print("All features: ", X.shape[1])
print(featuresModels)

print("Selected best: ", len(best_features))
print(best_features)

All features:  32
['ano', 'semana_do_ano', 'unit_price', 'sku_height', 'sku_width', 'sku_length', 'sku_weight', 'avg_website_visits_last_week', 'supplier_delivery_time', 'shipment_type_crossdocking', 'shipment_type_próprio', 'product_department_Cama e Banho', 'product_department_Decoração', 'product_department_Gamer', 'product_department_Infantil', 'product_department_Keva', 'product_department_Móveis', 'origin_country_Importado', 'origin_country_Nacional', 'process_costing_no', 'process_costing_yes', 'flag_bundle_SKU vendido em conjunto ou sozinho', 'flag_bundle_SKU vendido somente sozinho', 'has_stock_não', 'has_stock_tem', 'color_encoded', 'sku_encoded', 'anchor_category_encoded', 'product_category_encoded', 'dollar_quotation']
Selected best:  3
['ano', 'semana_do_ano', 'unit_price']


In [60]:
featuresModels = best_features

In [61]:
# Criando a grid de parâmetros a serem testados
param_grid = {
    'n_estimators': [100,150,200],
    'max_depth': [7,10,15],
}

# Definindo o "score" para avaliarmos qual será a melhor combinação de hiperparâmetros para o modelo
scorer = make_scorer(mean_squared_error, greater_is_better=False)

In [62]:
# Realizando o grid search (remover n_jobs = -1 caso não queira usar todos os cores do computador)
grid_search = GridSearchCV(estimator=meta_model, param_grid=param_grid, cv=5,scoring=scorer, n_jobs=-1, refit=True)

grid_search.fit(X_test_meta, y_a_test)

print("Melhores Hiperparâmetros: ", grid_search.best_params_)

# Conseguindo os melhores estimadores
best_meta_model = grid_search.best_estimator_

print('Best Model', best_meta_model)

Melhores Hiperparâmetros:  {'max_depth': 7, 'n_estimators': 200}
Best Model RandomForestRegressor(max_depth=7, n_estimators=200)


In [70]:
# Combinando as predições
X_test_meta = np.column_stack((xgb_pred, rf_pred, gbr_pred))

# Treinando o modelo meta
meta_model = best_meta_model
meta_model.fit(X_test_meta, y_a_test)

In [71]:
df_parameters = dados_b.drop(columns=df_final.columns.difference(featuresModels))

X_b = df_parameters
y_b = dados_b['items_sold']

In [72]:
xgb_pred_b = xgb_reg.predict(X_b)
rf_pred_b = rf.predict(X_b)
gbr_pred_b = gbr.predict(X_b)

ValueError: feature_names mismatch: ['ano', 'semana_do_ano', 'unit_price', 'sku_height', 'sku_width', 'sku_length', 'sku_weight', 'avg_website_visits_last_week', 'supplier_delivery_time', 'shipment_type_crossdocking', 'shipment_type_próprio', 'product_department_Cama e Banho', 'product_department_Decoração', 'product_department_Gamer', 'product_department_Infantil', 'product_department_Keva', 'product_department_Móveis', 'origin_country_Importado', 'origin_country_Nacional', 'process_costing_no', 'process_costing_yes', 'flag_bundle_SKU vendido em conjunto ou sozinho', 'flag_bundle_SKU vendido somente sozinho', 'has_stock_não', 'has_stock_tem', 'color_encoded', 'sku_encoded', 'anchor_category_encoded', 'product_category_encoded', 'dollar_quotation'] ['ano', 'semana_do_ano', 'unit_price']
expected origin_country_Importado, product_department_Gamer, product_category_encoded, product_department_Infantil, flag_bundle_SKU vendido em conjunto ou sozinho, sku_width, product_department_Móveis, origin_country_Nacional, process_costing_no, has_stock_não, sku_encoded, product_department_Cama e Banho, anchor_category_encoded, shipment_type_crossdocking, sku_height, avg_website_visits_last_week, has_stock_tem, product_department_Keva, flag_bundle_SKU vendido somente sozinho, color_encoded, sku_weight, supplier_delivery_time, product_department_Decoração, shipment_type_próprio, dollar_quotation, process_costing_yes, sku_length in input data

In [None]:
X_new_meta = np.column_stack((xgb_pred_b, rf_pred_b, gbr_pred_b))

y_new_pred = meta_model.predict(X_new_meta)


In [None]:
from sklearn.metrics import mean_squared_error, mean_squared_error, r2_score

#Métrica do Erro Quadrático Médio
mse = mean_squared_error(y_b, y_new_pred)
print("Mean Squared Error:", mse)

#Métrica da Raiz Quadrada do Erro Médio
RMSE = np.sqrt(mse)
print('RMSE:', RMSE)

#Métrica do Coeficiente de Determinação R² score
r2 = r2_score(y_b, y_new_pred)
print("R2 score:", r2)

Mean Squared Error: 33.96893054995411
RMSE: 5.828287102567453
R2 score: 0.6621255276796304
