<a href="https://colab.research.google.com/github/Murilosarto/SQL-Study/blob/main/Forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Importação da Biblioteca
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split,TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import holidays
import plotly.graph_objects as go
from lightgbm import LGBMRegressor
from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet
import time
from sklearn.ensemble import RandomForestRegressor
import itertools
from skforecast.recursive import ForecasterRecursive
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import TimeSeriesFold

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
df = pd.read_csv('/content/Check.csv',sep=';')

In [3]:
df = df[df['NAID'] != 'BRU70']

In [5]:
df['Canal'] = df.apply(lambda x: 'Digital' if x['NAID'] == 'BRA02' else 'Own Stores' if
                       x['NAID'] == 'BR996' else 'Retail',axis=1)

In [9]:
df['Corredor'] = df['CORRIDOR'].apply(lambda x: 'OTHERS' if x != 'AR' and x != 'CO' and x != 'BO' and
                                          x != 'MZ' and x != 'US' and x != 'PT' and
                                          x != 'CL' and x != 'PY' and x != 'PE' and x != 'HT' else x)

In [6]:
df = df.groupby(['DTEE'])['TRANSACTIONS_TOTAL'].sum().reset_index()

In [6]:
df['DTEE'] = df['DTEE'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y'))

In [8]:
df['Year'] = df['DTEE'].dt.year
df['Month'] = df['DTEE'].dt.month
df['Day'] = df['DTEE'].dt.day
df['DayOfWeek'] = df['DTEE'].dt.weekday

In [7]:
br_holidays = holidays.Brazil(years=[2020,2022,2023,2024,2025])

In [9]:
br_holidays = holidays.Brazil(years=[2020,2022,2023,2024,2025])

df['IsHoliday'] = df['DTEE'].apply(lambda x: 1 if x in br_holidays else 0)
df['NewYearImpact'] = df['DTEE'].apply(lambda x: 1 if x.strftime('%m-%d') in ['12-31','01-01'] else 0)
df['IsWeekday'] = df['DayOfWeek'].apply(lambda x: 1 if x<5 else 0)

df = pd.get_dummies(df, columns=['Month','Day', 'DayOfWeek'],drop_first=True)

In [10]:
lags = [1,7,30]
for lag in lags:
  df[f'Lag_{lag}'] = df['TRANSACTIONS_TOTAL'].shift(lag)

In [11]:
df['MovingAvg_7'] = df['TRANSACTIONS_TOTAL'].rolling(window=7).mean()
df['MovingAvg_30'] = df['TRANSACTIONS_TOTAL'].rolling(window=30).mean()

In [12]:
df.dropna(inplace=True)

In [13]:
x = df.drop(columns=['TRANSACTIONS_TOTAL','DTEE'])
y = df['TRANSACTIONS_TOTAL']

In [14]:
train_size = int(len(df)*0.8)
x_train,x_test = x.iloc[:train_size],x.iloc[train_size:]
y_train,y_test = y.iloc[:train_size],y.iloc[train_size:]

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [16]:
models = {
    'RandomForest': RandomForestRegressor(n_estimators=200,random_state=42),
    'XGBoost': XGBRegressor(n_estimators=200,learning_rate=0.1,objective='reg:squarederror')
}

In [17]:
for name,model in models.items():
  model.fit(x_train,y_train)
  predictions = model.predict(x_test)

  rmse = np.sqrt(mean_squared_error(y_test,predictions))
  mae = mean_absolute_error(y_test,predictions)

In [18]:
best_model = XGBRegressor(n_estimators=200,learning_rate=0.1,objective='reg:squarederror')
best_model.fit(x_train,y_train)

In [19]:
y_pred = best_model.predict(x_test)

In [20]:
df = df.sort_values('DTEE')

In [21]:
fig = go.Figure()
trace1 = go.Scatter(x=df['DTEE'].iloc[train_size:], y=y_test, name="test", mode="lines")
trace2 = go.Scatter(x=df['DTEE'].iloc[train_size:], y=y_pred, name="prediction", mode="lines")
fig.add_trace(trace1)
fig.add_trace(trace2)
fig.update_layout(
    title="Real value vs predicted in test data",
    xaxis_title="Date time",
    yaxis_title="TRANSACTIONS",
    width=750,
    height=350,
    margin=dict(l=20, r=20, t=35, b=20),
    legend=dict(orientation="h", yanchor="top", y=1.1, xanchor="left", x=0.001)
)
fig.show(renderer='colab')

In [22]:
future_df = pd.date_range(start="2025-01-01",periods=365,freq='D')

future_df = pd.DataFrame({'Date':future_df})
future_df['Year'] = future_df['Date'].dt.year
future_df['Month'] = future_df['Date'].dt.month
future_df['Day'] = future_df['Date'].dt.day
future_df['DayOfWeek'] = future_df['Date'].dt.weekday
future_df['IsHoliday'] = future_df['Date'].apply(lambda x: 1 if x in br_holidays else 0)
future_df['IsNewYearOrEnd'] = future_df['Date'].apply(lambda x: 1 if x.strftime('%m-%d') in ['12-31','01-01'] else 0)
future_df['IsWeekday'] = future_df['DayOfWeek'].apply(lambda x: 1 if x<5 else 0)

In [23]:
future_df = pd.get_dummies(future_df, columns=['Month','Day', 'DayOfWeek'],drop_first=True)

In [24]:
for lag in lags:
  future_df[f'Lag_{lag}'] = y.iloc[-lag]

future_df['MovingAvg_7'] = y.rolling(window=7).mean().iloc[-1]
future_df['MovingAvg_30'] = y.rolling(window=30).mean().iloc[-1]

future_df.fillna(0,inplace=True)

x_future = future_df.drop(columns=['Date'])

x_future,_ = x_future.align(x_train,join='left',axis=1, fill_value=0)

In [25]:
x_future = x_future.reindex(columns=x_train.columns,fill_value=0)

In [26]:
missing_in_future = set(x_train.columns)-set(x_future.columns)
extra_in_future = set(x_future.columns)-set(x_train.columns)

print(f'Missing in future: {missing_in_future}')
print(f'Extra in future: {extra_in_future}')

Missing in future: set()
Extra in future: set()


In [27]:
future_df['Predicted_transactions']=best_model.predict(x_future)

In [28]:
future_df['Predicted_transactions'].sum()

2158733.8

In [29]:
future_df.to_excel('/content/Check.xlsx')

In [59]:
import pandas as pd
import numpy as np
import itertools
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import plotly.graph_objects as go
# Criar um DataFrame para armazenar todas as previsões
all_predictions = []

# Listar todos os canais e corredores únicos
canais = df['Canal'].unique()
corredores = df['Corredor'].unique()

# Iterar sobre cada combinação de Canal e Corredor
for canal, corredor in itertools.product(canais, corredores):
   print(f"TREINANDO MODELO PARA: Canal={canal} | Corredor={corredor}")

   # Filtrar os dados
   df_filtered = df[(df['Canal'] == canal) & (df['Corredor'] == corredor)].copy()

   # Converter a coluna de data para datetime
   df_filtered['Date'] = df_filtered['DTEE']
   df_filtered.sort_values('Date', inplace=True)

   df_filtered = df_filtered.groupby(['Date'])['TRANSACTIONS_TOTAL'].sum().reset_index()

   # Criar variáveis temporais
   df_filtered['Year'] = df_filtered['Date'].dt.year
   df_filtered['Month'] = df_filtered['Date'].dt.month
   df_filtered['Day'] = df_filtered['Date'].dt.day
   df_filtered['DayOfWeek'] = df_filtered['Date'].dt.weekday
   df_filtered['IsWeekend'] = (df_filtered['DayOfWeek'] >= 5).astype(int)
   df_filtered['IsHoliday'] = df_filtered['Date'].apply(lambda x: 1 if x in br_holidays else 0)
   df_filtered['NewYearImpact'] = df_filtered['Date'].apply(lambda x: 1 if x.strftime('%m-%d') in ['12-31','01-01'] else 0)
   df_filtered['IsWeekday'] = df_filtered['DayOfWeek'].apply(lambda x: 1 if x<5 else 0)

   # Criar dummies para variáveis categóricas
   df_filtered = pd.get_dummies(df_filtered, columns=['Month', 'Day', 'DayOfWeek'], drop_first=True)

   # Criar lags
   lags = [1, 7, 14, 21, 30]  # Você pode ajustar conforme necessário
   for lag in lags:
       df_filtered[f'Lag_{lag}'] = df_filtered['TRANSACTIONS_TOTAL'].shift(lag)

   # Criar médias móveis
   df_filtered['MovingAvg_7'] = df_filtered['TRANSACTIONS_TOTAL'].rolling(window=7).mean()
   df_filtered['MovingAvg_30'] = df_filtered['TRANSACTIONS_TOTAL'].rolling(window=30).mean()

   # Remover valores nulos gerados pelos lags e médias móveis
   df_filtered.dropna(inplace=True)

   # Separar variáveis explicativas e alvo
   X = df_filtered.drop(columns=['TRANSACTIONS_TOTAL', 'Date'])
   y = df_filtered['TRANSACTIONS_TOTAL']

   # Separar em treino e teste
   train_size = int(len(df_filtered) * 0.8)
   X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
   y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

   # Criar e treinar o modelo XGBoost
   model = XGBRegressor(n_estimators=200, learning_rate=0.1, objective='reg:squarederror')
   model.fit(X_train, y_train)

   # Criar DataFrame de previsão futura
   future_dates = pd.date_range(start='2025-01-01', periods=365, freq='D')
   future_df = pd.DataFrame({'Date': future_dates})
   future_df['Canal'] = canal
   future_df['Corredor'] = corredor

   # Criar variáveis temporais para previsão futura
   future_df['Year'] = future_df['Date'].dt.year
   future_df['Month'] = future_df['Date'].dt.month
   future_df['Day'] = future_df['Date'].dt.day
   future_df['DayOfWeek'] = future_df['Date'].dt.weekday
   future_df['IsWeekend'] = (future_df['DayOfWeek'] >= 5).astype(int)
   future_df['IsHoliday'] = future_df['Date'].apply(lambda x: 1 if x in br_holidays else 0)
   future_df['NewYearImpact'] = future_df['Date'].apply(lambda x: 1 if x.strftime('%m-%d') in ['12-31','01-01'] else 0)
   future_df['IsWeekday'] = future_df['DayOfWeek'].apply(lambda x: 1 if x<5 else 0)

   # Criar dummies para o conjunto futuro
   future_df = pd.get_dummies(future_df, columns=['Month', 'Day', 'DayOfWeek'], drop_first=True)

   # Criar lags usando os últimos valores conhecidos
   for lag in lags:
       future_df[f'Lag_{lag}'] = y.iloc[-lag] if len(y) >= lag else y.mean()

   # Criar médias móveis
   future_df['MovingAvg_7'] = y.rolling(window=7).mean().iloc[-1]
   future_df['MovingAvg_30'] = y.rolling(window=30).mean().iloc[-1]

   # Garantir que não haja NaNs
   future_df.fillna(0, inplace=True)

   check = future_df.reset_index()
   check = check[['index','Date']]

   # Alinhar colunas do futuro com treino para evitar erro de features
   future_df = future_df.reindex(columns=X_train.columns, fill_value=0)

   # Fazer a previsão
   future_df['Predicted_Transactions'] = model.predict(future_df)
   future_df.reset_index(inplace=True)
   future_df = future_df.merge(check,on='index',how='left')
   future_df['Canal'] = canal
   future_df['Corridor'] = corredor

   # Armazenar os resultados
   all_predictions.append(future_df)

# Unir todas as previsões em um único DataFrame final
final_predictions = pd.concat(all_predictions)

TREINANDO MODELO PARA: Canal=Own Stores | Corredor=OTHERS
TREINANDO MODELO PARA: Canal=Own Stores | Corredor=HT
TREINANDO MODELO PARA: Canal=Own Stores | Corredor=US
TREINANDO MODELO PARA: Canal=Own Stores | Corredor=CO
TREINANDO MODELO PARA: Canal=Own Stores | Corredor=PT
TREINANDO MODELO PARA: Canal=Own Stores | Corredor=PE
TREINANDO MODELO PARA: Canal=Own Stores | Corredor=BO
TREINANDO MODELO PARA: Canal=Own Stores | Corredor=CL
TREINANDO MODELO PARA: Canal=Own Stores | Corredor=AR
TREINANDO MODELO PARA: Canal=Own Stores | Corredor=PY
TREINANDO MODELO PARA: Canal=Own Stores | Corredor=MZ
TREINANDO MODELO PARA: Canal=Retail | Corredor=OTHERS
TREINANDO MODELO PARA: Canal=Retail | Corredor=HT
TREINANDO MODELO PARA: Canal=Retail | Corredor=US
TREINANDO MODELO PARA: Canal=Retail | Corredor=CO
TREINANDO MODELO PARA: Canal=Retail | Corredor=PT
TREINANDO MODELO PARA: Canal=Retail | Corredor=PE
TREINANDO MODELO PARA: Canal=Retail | Corredor=BO
TREINANDO MODELO PARA: Canal=Retail | Corredor=C

In [60]:
final_predictions.to_excel('/content/Check.xlsx')

In [61]:
final_predictions

Unnamed: 0,index,Year,IsWeekend,IsHoliday,NewYearImpact,IsWeekday,Month_2,Month_3,Month_4,Month_5,...,Lag_7,Lag_14,Lag_21,Lag_30,MovingAvg_7,MovingAvg_30,Predicted_Transactions,Date,Canal,Corridor
0,0,2025,0,1,1,1,False,False,False,False,...,165,165,177,193,201.714286,226.333333,115.696602,2025-01-01,Own Stores,OTHERS
1,1,2025,0,0,0,1,False,False,False,False,...,165,165,177,193,201.714286,226.333333,289.541534,2025-01-02,Own Stores,OTHERS
2,2,2025,0,0,0,1,False,False,False,False,...,165,165,177,193,201.714286,226.333333,287.117126,2025-01-03,Own Stores,OTHERS
3,3,2025,1,0,0,0,False,False,False,False,...,165,165,177,193,201.714286,226.333333,193.137909,2025-01-04,Own Stores,OTHERS
4,4,2025,1,0,0,0,False,False,False,False,...,165,165,177,193,201.714286,226.333333,76.809853,2025-01-05,Own Stores,OTHERS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,360,2025,1,0,0,0,False,False,False,False,...,186,171,187,217,177.285714,192.000000,162.357697,2025-12-27,Digital,MZ
361,361,2025,1,0,0,0,False,False,False,False,...,186,171,187,217,177.285714,192.000000,168.374100,2025-12-28,Digital,MZ
362,362,2025,0,0,0,1,False,False,False,False,...,186,171,187,217,177.285714,192.000000,221.590897,2025-12-29,Digital,MZ
363,363,2025,0,0,0,1,False,False,False,False,...,186,171,187,217,177.285714,192.000000,228.894394,2025-12-30,Digital,MZ
