In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import STL
from statsmodels.datasets import co2
from datadata import utils
from sklearn.model_selection import train_test_split
import numpy as np
import xgboost as xgb
from prevision import Options,XGBOOST_TYPE,addDates
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Daily sales: the CSV's 'Date' column becomes the index, renamed to 'date'
# so that all three sources share the same join key.
df_sales = pd.read_csv(
    "prevision/data/daily_sales.csv",
    parse_dates=['Date'],
    index_col='Date',
).rename_axis('date')

# Weather: indexed by 'time' in the CSV (renamed to 'date'); the leftover
# 'Unnamed: 0' column is discarded.
df_meteo = (
    pd.read_csv("prevision/data/meteo.csv", parse_dates=['time'], index_col='time')
    .rename_axis('date')
    .drop(columns='Unnamed: 0')
)

# Affluence data, already keyed by 'date'.
df_predictHQ = pd.read_csv(
    "prevision/data/affluence.csv",
    parse_dates=['date'],
    index_col='date',
)

# Join the three sources on 'date', move 'date' back to a regular column and
# drop every row that has at least one missing value.
df_all = (
    df_sales
    .merge(df_meteo, on='date')
    .merge(df_predictHQ, on='date')
    .reset_index()
    .dropna()
)

In [None]:
# Pass the merged frame through `addDates` (imported from the `prevision` package).
# NOTE(review): `addDates` is defined outside this notebook; presumably it adds
# date-derived feature columns — confirm against its source.
# This cell reassigns `df_all` from itself, so running it twice feeds the
# already-processed frame back into `addDates`.
df_all=addDates(df_all)

In [None]:
# NOTE(review): `df`, `date2day`, `date2vacances` and `date2jourferie` are not
# defined or imported anywhere in the visible notebook, so this cell raises
# NameError on a fresh kernel unless they come from elsewhere. It looks like it
# may duplicate what `addDates` already does to `df_all` — confirm, then either
# remove this cell or rewire it to `df_all` with the proper imports.
df['day'] = df['date'].apply(date2day)
# One-hot encode the 'day' column
df = pd.get_dummies(df, columns=['day'])

# Holidays ("vacances"): add a 'vacance' column computed from the date
df['vacance'] = df['date'].apply(date2vacances)

# Public holidays ("jours fériés"): add a 'ferie' column computed from the date
df['ferie'] = df['date'].apply(date2jourferie)

In [None]:
# STL decomposition of log-sales.
#
# `df_all` carries a plain integer index (it was `reset_index()`-ed after the
# merge), so the series is re-indexed by its 'date' column here. That way the
# trend / seasonal / residual panels share the calendar axis with the raw
# sales panel instead of being plotted against row numbers.
# Zero sales are dropped because log(0) is undefined.
sales_data_non_zero = (
    df_all.set_index('date')['vente']
    .replace(0, np.nan)
    .dropna()
)
log_sales = np.log(sales_data_non_zero)

# period: 30*6 = 180 observations per seasonal cycle (about six months if the
# data is daily — TODO confirm). seasonal: length of the seasonal smoother,
# which STL requires to be odd.
stl = STL(log_sales, seasonal=13, period=30*6)
result = stl.fit()

# Access the components: trend, seasonal, and residual
trend = result.trend
seasonal = result.seasonal
residual = result.resid

# Plot the original time series and the components, one panel each.
panels = [
    (df_sales['vente'], 'Original Time Series'),
    (trend, 'Trend'),
    (seasonal, 'Seasonal'),
    (residual, 'Residual'),
]
fig, axes = plt.subplots(len(panels), 1, figsize=(12, 8), sharex=True)
for ax, (series, label) in zip(axes, panels):
    ax.plot(series, label=label)
    ax.legend()

fig.tight_layout()
plt.show()

# Split

In [None]:
def split_data_based_on_duration(data, duration_months):
    """Split ``data`` chronologically into a training and a validation part.

    The cutoff is the latest value of ``data['date']`` minus
    ``duration_months`` calendar months.

    Args:
        data: DataFrame with a datetime-like 'date' column.
        duration_months: number of trailing months to hold out for validation.

    Returns:
        ``(train, validation)``: rows dated on or before the cutoff, and rows
        dated strictly after it. Both keep the index labels of ``data``.
    """
    dates = data['date']
    cutoff = dates.max() - pd.DateOffset(months=duration_months)
    is_train = dates <= cutoff
    is_validation = dates > cutoff
    return data[is_train], data[is_validation]

def getX(x):
    """Return the feature matrix of ``x``.

    Every column except 'date' and 'vente' is kept; the result is the
    underlying array of the remaining columns.
    """
    features = x.drop(columns=['date', 'vente'])
    return features.values
def getY(y):
    """Return the target vector: the values of the 'vente' column of ``y``."""
    target = y['vente']
    return target.values

def plot_eval_result(eval_results):
    """Plot the per-round RMSE of the training and validation evaluation sets.

    Args:
        eval_results: nested mapping with 'validation_0' and 'validation_1'
            entries, each holding an 'rmse' sequence. 'validation_0' is
            drawn as the training curve and 'validation_1' as the validation
            curve.
    """
    rmse_train = eval_results['validation_0']['rmse']
    rmse_val = eval_results['validation_1']['rmse']

    # One point per boosting round, numbered from 1.
    rounds = range(1, len(rmse_train) + 1)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(rounds, rmse_train, label='Train Loss')
    ax.plot(rounds, rmse_val, label='Validation Loss')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('RMSE')
    ax.set_title('Training and Validation Loss')
    ax.legend()
    plt.show()

# Build three train/validation splits that hold out the most recent 12, 6 and
# 3 months respectively.
#
# Each part gets a fresh 0..n-1 index through a non-inplace `reset_index`:
# the parts returned by the split are boolean-filtered selections of
# `sales_data`, and resetting them in place mutates those selections (which
# can trigger pandas' SettingWithCopyWarning).
sales_data = df_all

train_data_1yr, validation_data_1yr = (
    part.reset_index(drop=True)
    for part in split_data_based_on_duration(sales_data, 12)
)
train_data_6mo, validation_data_6mo = (
    part.reset_index(drop=True)
    for part in split_data_based_on_duration(sales_data, 6)
)
train_data_3mo, validation_data_3mo = (
    part.reset_index(drop=True)
    for part in split_data_based_on_duration(sales_data, 3)
)

In [None]:
# Fit on everything up to the last 3 months; evaluate on the last 3 months.
X_train, Y_train = getX(train_data_3mo), getY(train_data_3mo)
X_test, Y_test = getX(validation_data_3mo), getY(validation_data_3mo)

# Candidate values for the random search.
# The float grids are written out explicitly: np.arange with a non-integer
# step is subject to rounding, so the number of elements (and whether the end
# point sneaks in) is not reliable, and the generated values carry noise such
# as 0.7999999999999999.
param_space = {
    'n_estimators': np.arange(50, 300, 50),  # Number of boosting rounds
    'max_depth': np.arange(3, 10),  # Maximum depth of the tree
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],  # Learning rate
    'subsample': [0.7, 0.8, 0.9],  # Subsample ratio
    'colsample_bytree': [0.7, 0.8, 0.9],  # Subsample ratio of columns
}

# NOTE(review): cv=3 means plain unshuffled K-fold, so some folds are trained
# on rows that come after the rows they are scored on. If the rows are in
# chronological order, consider sklearn's TimeSeriesSplit instead.
xgb_regressor = xgb.XGBRegressor(random_state=42)
random_search = RandomizedSearchCV(xgb_regressor, param_distributions=param_space,
                                   n_iter=50, scoring='neg_mean_squared_error', cv=3,
                                   random_state=42, verbose=1, n_jobs=-1)

random_search.fit(X_train, Y_train)
best_model = random_search.best_estimator_
val_preds = best_model.predict(X_test)
val_rmse = np.sqrt(mean_squared_error(Y_test, val_preds))
print('Validation RMSE:', val_rmse)
# Print the whole mapping; unpacking the dict with * would print only its keys.
print('best params\n', random_search.best_params_)

# Refit the selected model with evaluation sets attached so the per-round
# RMSE curves can be plotted. The metric is set on the estimator because
# passing `eval_metric` to fit() is deprecated and rejected by recent
# XGBoost releases.
best_model.set_params(eval_metric='rmse')
best_model.fit(X_train, Y_train,
               eval_set=[(X_train, Y_train), (X_test, Y_test)],
               verbose=0)
eval_results = best_model.evals_result()
res = best_model.predict(X_test)
plot_eval_result(eval_results)

In [None]:
# Overlay the actual hold-out sales and the model's predictions.
fig, ax = plt.subplots()
ax.plot(Y_test, label='réel')
ax.plot(res, label='prediction')
ax.legend()