<a href="https://colab.research.google.com/github/Murilosarto/SQL-Study/blob/main/Forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Importação da Biblioteca
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split,TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import holidays
import plotly.graph_objects as go

In [2]:
df = pd.read_csv('/content/Check.csv',sep=';')

In [3]:
df = df.groupby(['DTEE'])['TRANSACTIONS_TOTAL'].sum().reset_index()

In [4]:
df['DTEE'] = df['DTEE'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y'))

In [5]:
df['Year'] = df['DTEE'].dt.year
df['Month'] = df['DTEE'].dt.month
df['Day'] = df['DTEE'].dt.day
df['DayOfWeek'] = df['DTEE'].dt.weekday

In [6]:
br_holidays = holidays.Brazil(years=[2020,2022,2023,2024,2025])

df['IsHoliday'] = df['DTEE'].apply(lambda x: 1 if x in br_holidays else 0)
df['IsNewYearOrEnd'] = df['DTEE'].apply(lambda x: 1 if x.strftime('%m-%d') in ['12-31','01-01'] else 0)
df['IsWeekday'] = df['DayOfWeek'].apply(lambda x: 1 if x<5 else 0)

df = pd.get_dummies(df, columns=['Month','Day', 'DayOfWeek'],drop_first=True)

In [7]:
lags = [1,7,30]
for lag in lags:
  df[f'Lag_{lag}'] = df['TRANSACTIONS_TOTAL'].shift(lag)

In [8]:
df['MovingAvg_7'] = df['TRANSACTIONS_TOTAL'].rolling(window=7).mean()
df['MovingAvg_30'] = df['TRANSACTIONS_TOTAL'].rolling(window=30).mean()

In [9]:
df.dropna(inplace=True)

In [10]:
x = df.drop(columns=['TRANSACTIONS_TOTAL','DTEE'])
y = df['TRANSACTIONS_TOTAL']

In [11]:
train_size = int(len(df)*0.8)
x_train,x_test = x.iloc[:train_size],x.iloc[train_size:]
y_train,y_test = y.iloc[:train_size],y.iloc[train_size:]

In [12]:
from sklearn.ensemble import RandomForestRegressor

In [13]:
models = {
    'RandomForest': RandomForestRegressor(n_estimators=200,random_state=42),
    'XGBoost': XGBRegressor(n_estimators=200,learning_rate=0.1,objective='reg:squarederror')
}

In [14]:
for name,model in models.items():
  model.fit(x_train,y_train)
  predictions = model.predict(x_test)

  rmse = np.sqrt(mean_squared_error(y_test,predictions))
  mae = mean_absolute_error(y_test,predictions)

In [15]:
best_model = XGBRegressor(n_estimators=200,learning_rate=0.1,objective='reg:squarederror')
best_model.fit(x_train,y_train)

In [16]:
y_pred = best_model.predict(x_test)

In [17]:
df = df.sort_values('DTEE')

In [18]:
fig = go.Figure()
trace1 = go.Scatter(x=df['DTEE'].iloc[train_size:], y=y_test, name="test", mode="lines")
trace2 = go.Scatter(x=df['DTEE'].iloc[train_size:], y=y_pred, name="prediction", mode="lines")
fig.add_trace(trace1)
fig.add_trace(trace2)
fig.update_layout(
    title="Real value vs predicted in test data",
    xaxis_title="Date time",
    yaxis_title="TRANSACTIONS",
    width=750,
    height=350,
    margin=dict(l=20, r=20, t=35, b=20),
    legend=dict(orientation="h", yanchor="top", y=1.1, xanchor="left", x=0.001)
)
fig.show(renderer='colab')

In [19]:
future_df = pd.data_range(start=df['DTEE'].max()+pd.Timedelta(days=1),periods=30,freq='D')

future_df = pd.DataFrame({'Date':future_df})
future_df['Year'] = future_df['Date'].dt.year
future_df['Month'] = future_df['Date'].dt.month
future_df['Day'] = future_df['Date'].dt.day
future_df['DayOfWeek'] = future_df['Date'].dt.weekday
future_df['IsHoliday'] = future_df['Date'].apply(lambda x: 1 if x in br_holidays else 0)
future_df['IsNewYearOrEnd'] = future_df['Date'].apply(lambda x: 1 if x.strftime('%m-%d') in ['12-31','01-01'] else 0)
future_df['IsWeekday'] = future_df['Date'].apply(lambda x: 1 if x<5 else 0)

future_df = pd.get_dummies(future_df, columns=['Month','Day', 'DayOfWeek'],drop_first=True)

AttributeError: module 'pandas' has no attribute 'data_range'