<a href="https://colab.research.google.com/github/MBrandao07/Previsao_Demandas_Market_Show/blob/main/Codigos/3_Previs%C3%A3o_Demanda_Modelo_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 4- Modelagem - Modelo Baseline

### Instalando o catboost

In [None]:
!pip install catboost



## Funções

In [None]:
# Build a per-column metadata summary of a DataFrame
def generate_metadata(dataframe):
    """
    Build a DataFrame describing every column of the given DataFrame.

    :param dataframe: DataFrame whose columns will be summarised.
    :return: DataFrame with one row per column of ``dataframe`` and the fields
             ``nome_variavel`` (column name), ``tipo`` (dtype), ``qt_nulos``
             (number of nulls), ``percent_nulos`` (percentage of nulls, rounded
             to 2 decimals) and ``cardinalidade`` (``nunique()`` of the column),
             sorted by ``percent_nulos`` in descending order.
    """

    # Count the nulls once and reuse the result for both null-related fields
    null_counts = dataframe.isnull().sum()

    # Basic metadata, one entry per column
    metadata = pd.DataFrame({
        'nome_variavel': dataframe.columns,
        'tipo': dataframe.dtypes,
        'qt_nulos': null_counts,
        'percent_nulos': round((null_counts / len(dataframe)) * 100, 2),
        'cardinalidade': dataframe.nunique(),
    })

    # mergesort is stable: columns with the same null percentage keep the
    # order they have in the input frame, so the report is deterministic
    metadata = metadata.sort_values(by='percent_nulos', ascending=False, kind='mergesort')
    metadata = metadata.reset_index(drop=True)

    return metadata

### Importando as bibliotecas

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import time
import lightgbm as lgb
import catboost as cb
import joblib
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')

### Conectando ao drive

In [None]:
# Mount Google Drive at /content/drive (the CSVs and .pkl files used below are read from that path)

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Lendo os datasets

In [None]:
# Drive folder that holds the project's CSV datasets; kept in one place so the
# location only has to be changed once
DATA_DIR = '/content/drive/MyDrive/2 - Projetos Portfolio/Previsão de demanda/Base de dados'

# Training and validation datasets
train = pd.read_csv(f'{DATA_DIR}/train_data_original.csv')
val = pd.read_csv(f'{DATA_DIR}/val_data_original.csv')

In [None]:
train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,oil_price,city,state,type_stores,cluster,transactions,type_holidays,locale,description,transferred,target
0,650430,2014-01-02,1,AUTOMOTIVE,3.0,0,95.14,Quito,Pichincha,D,13,1875.0,Normal Day,Normal Day,Normal Day,Normal Day,31.0
1,652212,2014-01-03,1,AUTOMOTIVE,0.0,0,93.66,Quito,Pichincha,D,13,1817.0,Normal Day,Normal Day,Normal Day,Normal Day,35.0
2,653994,2014-01-04,1,AUTOMOTIVE,0.0,0,93.12,Quito,Pichincha,D,13,1294.0,Normal Day,Normal Day,Normal Day,Normal Day,35.0
3,655776,2014-01-05,1,AUTOMOTIVE,1.0,0,93.12,Quito,Pichincha,D,13,489.0,Normal Day,Normal Day,Normal Day,Normal Day,36.0
4,657558,2014-01-06,1,AUTOMOTIVE,6.0,0,93.12,Quito,Pichincha,D,13,1754.0,Normal Day,Normal Day,Normal Day,Normal Day,39.0


In [None]:
val.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,oil_price,city,state,type_stores,cluster,transactions,type_holidays,locale,description,transferred,target
0,2596374,2017-01-01,1,AUTOMOTIVE,0.0,0,52.36,Quito,Pichincha,D,13,1921.445466,Normal Day,Normal Day,Normal Day,Normal Day,45.0
1,2598156,2017-01-02,1,AUTOMOTIVE,5.0,0,52.36,Quito,Pichincha,D,13,516.0,Normal Day,Normal Day,Normal Day,Normal Day,42.0
2,2599938,2017-01-03,1,AUTOMOTIVE,4.0,0,52.36,Quito,Pichincha,D,13,1946.0,Normal Day,Normal Day,Normal Day,Normal Day,41.0
3,2601720,2017-01-04,1,AUTOMOTIVE,1.0,0,53.26,Quito,Pichincha,D,13,1905.0,Normal Day,Normal Day,Normal Day,Normal Day,42.0
4,2603502,2017-01-05,1,AUTOMOTIVE,2.0,0,53.77,Quito,Pichincha,D,13,1807.0,Normal Day,Normal Day,Normal Day,Normal Day,42.0


In [None]:
generate_metadata(train)

Unnamed: 0,nome_variavel,tipo,qt_nulos,percent_nulos,cardinalidade
0,id,int64,0,0.0,180180
1,date,object,0,0.0,1092
2,store_nbr,int64,0,0.0,5
3,family,object,0,0.0,33
4,sales,float64,0,0.0,44453
5,onpromotion,int64,0,0.0,223
6,oil_price,float64,0,0.0,676
7,city,object,0,0.0,2
8,state,object,0,0.0,2
9,type_stores,object,0,0.0,1


In [None]:
generate_metadata(val)

Unnamed: 0,nome_variavel,tipo,qt_nulos,percent_nulos,cardinalidade
0,id,int64,0,0.0,22440
1,date,object,0,0.0,136
2,store_nbr,int64,0,0.0,5
3,family,object,0,0.0,33
4,sales,float64,0,0.0,7494
5,onpromotion,int64,0,0.0,163
6,oil_price,float64,0,0.0,83
7,city,object,0,0.0,2
8,state,object,0,0.0,2
9,type_stores,object,0,0.0,1


## Avaliando o modelo atual

In [None]:
# Load the persisted pipeline of the current model.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so this file
# must come from a trusted source.
pipeline_vigente = joblib.load('/content/drive/MyDrive/2 - Projetos Portfolio/Previsão de demanda/Base de dados/pipeline_model.pkl')

# Features are every column except the identifier, the date and the target
features = [col for col in train.columns if col not in ['id','date','target']]
target = 'target'

# Split the feature names into numeric and categorical by dtype
numeric_features = train[features].select_dtypes(include=['int64', 'float64']).columns
categorical_features = train[features].select_dtypes(include=['object']).columns

# Cast the categorical columns to str in BOTH datasets
# (the original cell cast `train` twice and never touched `val`)
train.loc[:,categorical_features] = train[categorical_features].astype(str)
val.loc[:,categorical_features] = val[categorical_features].astype(str)

X_train = train[features]
y_train = train[target]
X_test = val[features]
y_test = val[target]

# Predictions and errors on the training data
train_predictions = pipeline_vigente.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
train_mae = mean_absolute_error(y_train, train_predictions)

# Predictions and errors on the validation data
test_predictions = pipeline_vigente.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
test_mae = mean_absolute_error(y_test, test_predictions)

model_name = 'Modelo Vigente'
results = [(model_name, train_rmse, test_rmse, train_mae, test_mae)]

# Single-row table with the metrics of the current model
results_df1_vigente = pd.DataFrame(results, columns=['Model', 'Train RMSE', 'Test RMSE', 'Train MAE', 'Test MAE'])

results_df1_vigente

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,Model,Train RMSE,Test RMSE,Train MAE,Test MAE
0,Modelo Vigente,11089.268817,12486.56653,3387.497013,3333.040899


## Treinando o novo modelo baseline

Nessa etapa treinaremos diversos modelos diferentes com diferentes períodos da base.

### Treinando os modelos com o maior histórico possível

In [None]:
# Fit on the whole `train` frame and score on `val`
X_train, y_train = train[features], train[target]
X_test, y_test = val[features], val[target]

# Preprocessing: standardise the numeric columns, one-hot encode the categorical ones
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Candidate models, keyed by the name shown in the results table
models = dict(
    LinearRegression=LinearRegression(),
    LightGBM=lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1),
    CatBoost=cb.CatBoostRegressor(iterations=100, depth=6, learning_rate=0.1, loss_function='RMSE', verbose=0),
    GradientBoosting=GradientBoostingRegressor(n_estimators=100, random_state=42),
    XGBoost=XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
)

# Fit every candidate model and collect its train/test error metrics
def evaluate_models(models, X_train, y_train, X_test, y_test):
    """
    Fit each model behind the shared preprocessing step and score it.

    Every estimator is wrapped in a Pipeline whose first step is the
    module-level ``preprocessor`` (looked up when the function is called),
    fitted on the training data and scored with RMSE and MAE on both the
    training and the test data. Progress, training time and the metrics are
    printed for each model.

    :param models: dict mapping a model name to an estimator.
    :param X_train: training features.
    :param y_train: training target.
    :param X_test: test features.
    :param y_test: test target.
    :return: list of tuples
             ``(model_name, train_rmse, test_rmse, train_mae, test_mae)``,
             one per model, in the iteration order of ``models``.
    """

    def _rmse_mae(fitted, X, y):
        # RMSE and MAE of `fitted` on (X, y), computed from a single predict call
        predicted = fitted.predict(X)
        return np.sqrt(mean_squared_error(y, predicted)), mean_absolute_error(y, predicted)

    scores = []
    for name, estimator in models.items():
        candidate = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('model', estimator)])

        print(f'Treinando o algoritmo: {name}')
        started = time.time()
        candidate.fit(X_train, y_train)
        elapsed = time.time() - started
        print(f'Treino finalizado. Tempo gasto no treinamento: {elapsed:.2f} segundos')

        # Train metrics first, then test metrics
        rmse_train, mae_train = _rmse_mae(candidate, X_train, y_train)
        rmse_test, mae_test = _rmse_mae(candidate, X_test, y_test)

        scores.append((name, rmse_train, rmse_test, mae_train, mae_test))
        print(f"RMSE no treino: {rmse_train} \n"
              f"RMSE no teste: {rmse_test} \n"
              f"MAE no treino: {mae_train} \n"
              f"MAE no teste: {mae_test} \n"
              f"{70*'-'}")

    return scores

# Run the benchmark for the full-history setup
results = evaluate_models(models, X_train, y_train, X_test, y_test)

# Tabulate the metrics, one row per model
metric_columns = ['Model', 'Train RMSE', 'Test RMSE', 'Train MAE', 'Test MAE']
results_df1 = pd.DataFrame(results, columns=metric_columns)

results_df1

Treinando o algoritmo: LinearRegression
Treino finalizado. Tempo gasto no treinamento: 1.13 segundos
RMSE no treino: 6227.965295199649 
RMSE no teste: 5614.8491399809855 
MAE no treino: 2027.9796218510955 
MAE no teste: 2009.0232289492599 
----------------------------------------------------------------------
Treinando o algoritmo: LightGBM
Treino finalizado. Tempo gasto no treinamento: 2.86 segundos




RMSE no treino: 2594.3366095389156 
RMSE no teste: 3395.2101823212734 
MAE no treino: 766.5914557714469 
MAE no teste: 952.2983044909193 
----------------------------------------------------------------------
Treinando o algoritmo: CatBoost
Treino finalizado. Tempo gasto no treinamento: 3.29 segundos
RMSE no treino: 3214.907866507169 
RMSE no teste: 2949.3354982021833 
MAE no treino: 964.385948878856 
MAE no teste: 964.7229474758066 
----------------------------------------------------------------------
Treinando o algoritmo: GradientBoosting
Treino finalizado. Tempo gasto no treinamento: 25.39 segundos
RMSE no treino: 3588.3010033230325 
RMSE no teste: 3150.652909418388 
MAE no treino: 1090.5116363582047 
MAE no teste: 1075.6454473963315 
----------------------------------------------------------------------
Treinando o algoritmo: XGBoost
Treino finalizado. Tempo gasto no treinamento: 1.91 segundos
RMSE no treino: 1802.1106538870313 
RMSE no teste: 3687.1047436240424 
MAE no treino: 5

Unnamed: 0,Model,Train RMSE,Test RMSE,Train MAE,Test MAE
0,LinearRegression,6227.965295,5614.84914,2027.979622,2009.023229
1,LightGBM,2594.33661,3395.210182,766.591456,952.298304
2,CatBoost,3214.907867,2949.335498,964.385949,964.722947
3,GradientBoosting,3588.301003,3150.652909,1090.511636,1075.645447
4,XGBoost,1802.110654,3687.104744,559.494301,957.923348


Utilizando todo o histórico, podemos notar que o modelo que apresentou o melhor desempenho foi o CatBoost, pois obteve o menor RMSE no dataset de teste (o MAE de teste ficou muito próximo entre LightGBM, XGBoost e CatBoost, com o LightGBM ligeiramente à frente).

### Treinando os modelos com histórico a partir de Agosto/2015

In [None]:
# Define the features and the target
features = [col for col in train.columns if col not in ['id','date','target']]
target = 'target'

# Keep only the training history from August/2015 onwards
train_aux = train[train.date >= '2015-08-01'].copy()

# Split the feature names into numeric and categorical by dtype
numeric_features = train_aux[features].select_dtypes(include=['int64', 'float64']).columns
categorical_features = train_aux[features].select_dtypes(include=['object']).columns

# Cast the categorical columns to str in both the training window and `val`
# (the original cell cast `train_aux` twice and never touched `val`)
train_aux.loc[:,categorical_features] = train_aux[categorical_features].astype(str)
val.loc[:,categorical_features] = val[categorical_features].astype(str)

X_train = train_aux[features]
y_train = train_aux[target]
X_test = val[features]
y_test = val[target]

# Preprocessing: standardise the numeric columns, one-hot encode the categorical ones
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fresh instances of the candidate models
models = {
    'LinearRegression': LinearRegression(),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1),
    'CatBoost': cb.CatBoostRegressor(iterations=100, depth=6, learning_rate=0.1, loss_function='RMSE', verbose=0),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
}

# evaluate_models is defined once, in the full-history section above, and is
# reused here instead of being redefined. It looks up the module-level
# `preprocessor` when called, so it uses the transformer built in this cell.
results = evaluate_models(models, X_train, y_train, X_test, y_test)

# Tabulate the metrics, one row per model
results_df2 = pd.DataFrame(results, columns=['Model', 'Train RMSE', 'Test RMSE', 'Train MAE', 'Test MAE'])

results_df2

Treinando o algoritmo: LinearRegression
Treino finalizado. Tempo gasto no treinamento: 0.53 segundos
RMSE no treino: 7233.67116318171 
RMSE no teste: 5776.234714358248 
MAE no treino: 2420.663548260456 
MAE no teste: 2363.147449902674 
----------------------------------------------------------------------
Treinando o algoritmo: LightGBM
Treino finalizado. Tempo gasto no treinamento: 1.26 segundos




RMSE no treino: 2351.4944039586885 
RMSE no teste: 2735.830075696854 
MAE no treino: 699.0339828249677 
MAE no teste: 850.6510977219974 
----------------------------------------------------------------------
Treinando o algoritmo: CatBoost
Treino finalizado. Tempo gasto no treinamento: 6.35 segundos
RMSE no treino: 2879.532349687366 
RMSE no teste: 2633.653199879704 
MAE no treino: 913.2994275069191 
MAE no teste: 930.0960299294978 
----------------------------------------------------------------------
Treinando o algoritmo: GradientBoosting
Treino finalizado. Tempo gasto no treinamento: 11.98 segundos
RMSE no treino: 3176.376819870026 
RMSE no teste: 2675.0555633459485 
MAE no treino: 998.8144507276083 
MAE no teste: 968.5444481452926 
----------------------------------------------------------------------
Treinando o algoritmo: XGBoost
Treino finalizado. Tempo gasto no treinamento: 0.95 segundos
RMSE no treino: 1298.6084996633426 
RMSE no teste: 2343.82544461025 
MAE no treino: 453.48

Unnamed: 0,Model,Train RMSE,Test RMSE,Train MAE,Test MAE
0,LinearRegression,7233.671163,5776.234714,2420.663548,2363.14745
1,LightGBM,2351.494404,2735.830076,699.033983,850.651098
2,CatBoost,2879.53235,2633.6532,913.299428,930.09603
3,GradientBoosting,3176.37682,2675.055563,998.814451,968.544448
4,XGBoost,1298.6085,2343.825445,453.487509,791.923877


Utilizando um histórico mais recente, sem as grandes oscilações iniciais, o modelo XGBoost apresentou um desempenho melhor que os demais.

### Treinando os modelos com histórico a partir de Janeiro/2016

In [None]:
# Define the features and the target
features = [col for col in train.columns if col not in ['id','date','target']]
target = 'target'

# Keep only the training history from January/2016 onwards
train_aux = train[train.date >= '2016-01-01'].copy()

# Split the feature names into numeric and categorical by dtype
numeric_features = train_aux[features].select_dtypes(include=['int64', 'float64']).columns
categorical_features = train_aux[features].select_dtypes(include=['object']).columns

# Cast the categorical columns to str in both the training window and `val`
# (the original cell cast `train_aux` twice and never touched `val`)
train_aux.loc[:,categorical_features] = train_aux[categorical_features].astype(str)
val.loc[:,categorical_features] = val[categorical_features].astype(str)

X_train = train_aux[features]
y_train = train_aux[target]
X_test = val[features]
y_test = val[target]

# Preprocessing: standardise the numeric columns, one-hot encode the categorical ones
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fresh instances of the candidate models
models = {
    'LinearRegression': LinearRegression(),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1),
    'CatBoost': cb.CatBoostRegressor(iterations=100, depth=6, learning_rate=0.1, loss_function='RMSE', verbose=0),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
}

# evaluate_models is defined once, in the full-history section above, and is
# reused here instead of being redefined. It looks up the module-level
# `preprocessor` when called, so it uses the transformer built in this cell.
results = evaluate_models(models, X_train, y_train, X_test, y_test)

# Tabulate the metrics, one row per model
results_df3 = pd.DataFrame(results, columns=['Model', 'Train RMSE', 'Test RMSE', 'Train MAE', 'Test MAE'])

results_df3

Treinando o algoritmo: LinearRegression
Treino finalizado. Tempo gasto no treinamento: 0.70 segundos
RMSE no treino: 7659.376773930165 
RMSE no teste: 6069.931622102596 
MAE no treino: 2578.402182591449 
MAE no teste: 2529.5472536937646 
----------------------------------------------------------------------
Treinando o algoritmo: LightGBM
Treino finalizado. Tempo gasto no treinamento: 1.21 segundos




RMSE no treino: 2347.227728877587 
RMSE no teste: 2853.7085593665984 
MAE no treino: 659.0039986647597 
MAE no teste: 876.1913035182887 
----------------------------------------------------------------------
Treinando o algoritmo: CatBoost
Treino finalizado. Tempo gasto no treinamento: 2.46 segundos
RMSE no treino: 2921.3070204413325 
RMSE no teste: 2748.5306704441796 
MAE no treino: 911.7102436950555 
MAE no teste: 965.220091541464 
----------------------------------------------------------------------
Treinando o algoritmo: GradientBoosting
Treino finalizado. Tempo gasto no treinamento: 8.25 segundos
RMSE no treino: 3219.7592657091514 
RMSE no teste: 2761.1433131731155 
MAE no treino: 990.0483938369968 
MAE no teste: 1000.163632149644 
----------------------------------------------------------------------
Treinando o algoritmo: XGBoost
Treino finalizado. Tempo gasto no treinamento: 1.37 segundos
RMSE no treino: 1146.4273161422423 
RMSE no teste: 2383.39876767808 
MAE no treino: 393.7

Unnamed: 0,Model,Train RMSE,Test RMSE,Train MAE,Test MAE
0,LinearRegression,7659.376774,6069.931622,2578.402183,2529.547254
1,LightGBM,2347.227729,2853.708559,659.003999,876.191304
2,CatBoost,2921.30702,2748.53067,911.710244,965.220092
3,GradientBoosting,3219.759266,2761.143313,990.048394,1000.163632
4,XGBoost,1146.427316,2383.398768,393.789249,806.660082


Utilizando este período também obtemos como melhor modelo o XGBoost.

### Comparando o resultado de todos os testes

Modelo vigente

In [None]:
results_df1_vigente

Unnamed: 0,Model,Train RMSE,Test RMSE,Train MAE,Test MAE
0,Modelo Vigente,11089.268817,12486.56653,3387.497013,3333.040899


Teste 1

In [None]:
results_df1

Unnamed: 0,Model,Train RMSE,Test RMSE,Train MAE,Test MAE
0,LinearRegression,6227.965295,5614.84914,2027.979622,2009.023229
1,LightGBM,2594.33661,3395.210182,766.591456,952.298304
2,CatBoost,3214.907867,2949.335498,964.385949,964.722947
3,GradientBoosting,3588.301003,3150.652909,1090.511636,1075.645447
4,XGBoost,1802.110654,3687.104744,559.494301,957.923348


Teste 2

In [None]:
results_df2

Unnamed: 0,Model,Train RMSE,Test RMSE,Train MAE,Test MAE
0,LinearRegression,7233.671163,5776.234714,2420.663548,2363.14745
1,LightGBM,2351.494404,2735.830076,699.033983,850.651098
2,CatBoost,2879.53235,2633.6532,913.299428,930.09603
3,GradientBoosting,3176.37682,2675.055563,998.814451,968.544448
4,XGBoost,1298.6085,2343.825445,453.487509,791.923877


Teste 3

In [None]:
results_df3

Unnamed: 0,Model,Train RMSE,Test RMSE,Train MAE,Test MAE
0,LinearRegression,7659.376774,6069.931622,2578.402183,2529.547254
1,LightGBM,2347.227729,2853.708559,659.003999,876.191304
2,CatBoost,2921.30702,2748.53067,911.710244,965.220092
3,GradientBoosting,3219.759266,2761.143313,990.048394,1000.163632
4,XGBoost,1146.427316,2383.398768,393.789249,806.660082


O XGBoost foi consideravelmente melhor em 2 dos 3 testes, então será o modelo utilizado.

Além disso, ele também superou os resultados obtidos com o modelo vigente.

## Salvando o melhor modelo

In [None]:
# Refit the chosen model on the window where it actually performed best.
# XGBoost only won when trained on recent history (tests 2 and 3). Trained on
# the full history (test 1) it had the highest test RMSE of the boosted models.
# Test 2 (history from 2015-08-01) gave the lowest test RMSE and MAE of all
# runs, so the final model is trained on that window, not on the whole `train`.
train_best = train[train.date >= '2015-08-01']

X_train = train_best[features]
y_train = train_best[target]
X_test = val[features]
y_test = val[target]

# Preprocessing: standardise the numeric columns, one-hot encode the categorical ones
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Configuration of the best model (same hyperparameters used in the tests)
best_model = XGBRegressor(n_estimators=100, random_state=42, verbosity=0)

pipeline_best_model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('model', best_model)])

# Fit the final pipeline
pipeline_best_model.fit(X_train, y_train)

# Persist the fitted pipeline to a .pkl file
joblib.dump(pipeline_best_model, '/content/drive/MyDrive/2 - Projetos Portfolio/Previsão de demanda/Base de dados/pipeline_best_model.pkl')

['/content/drive/MyDrive/2 - Projetos Portfolio/Previsão de demanda/Base de dados/pipeline_best_model.pkl']

In [None]:
# Check the performance of the saved pipeline.
# The original cell never computed train metrics for this model: it printed the
# train_rmse / train_mae (and model_name) left over from the "Modelo Vigente"
# cell. It also scored the in-memory pipeline rather than the file on disk.

model_name = 'XGBoost'

# Reload the file written by the previous cell so the persisted artifact is
# what gets evaluated
pipeline_saved = joblib.load('/content/drive/MyDrive/2 - Projetos Portfolio/Previsão de demanda/Base de dados/pipeline_best_model.pkl')

# Predictions and errors on the data the pipeline was fitted on (X_train / y_train
# as defined in the previous cell)
train_predictions = pipeline_saved.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
train_mae = mean_absolute_error(y_train, train_predictions)

# Predictions and errors on the validation data
test_predictions = pipeline_saved.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
test_mae = mean_absolute_error(y_test, test_predictions)

results = [(model_name, train_rmse, test_rmse, train_mae, test_mae)]
print(f"RMSE no treino: {train_rmse} \n"
      f"RMSE no teste: {test_rmse} \n"
      f"MAE no treino: {train_mae} \n"
      f"MAE no teste: {test_mae} \n"
      f"{70*'-'}")

RMSE no treino: 11089.268817420618 
RMSE no teste: 3687.1047436240424 
MAE no treino: 3387.497012672355 
MAE no teste: 957.9233483869137 
----------------------------------------------------------------------
