In [13]:
from statsmodels.tsa.arima.model import ARIMA
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error , r2_score
import numpy as np

In [14]:
import pandas as pd

In [15]:
df = pd.read_parquet('subbrand_grouped.parquet')

In [16]:
df['DATE_FIELD'] = pd.to_datetime(df['DATE_FIELD'])
df['YEAR'] = df['DATE_FIELD'].dt.year
df['MONTH'] = df['DATE_FIELD'].dt.month
df['DAY'] = df['DATE_FIELD'].dt.day

In [17]:
min_sales_volume = 1000  
min_sales_years = 2     
max_zero_sales_percentage = 75

In [18]:
relevant_subbrands = []

In [19]:
for subbrand, subbrand_data in df.groupby('ITM_SBRND_CD'):
    total_sales_volume = subbrand_data['SALES'].sum()
    years_with_sales = subbrand_data['YEAR'].nunique()
    zero_sales_percentage = (subbrand_data['SALES'] == 0).mean() * 100
    
    if (total_sales_volume >= min_sales_volume and
        years_with_sales >= min_sales_years and
        zero_sales_percentage < max_zero_sales_percentage):
        relevant_subbrands.append(subbrand)


In [20]:
df = df[df['ITM_SBRND_CD'].isin(relevant_subbrands)]

In [21]:
X = df[['ITM_SBRND_CD', 'YEAR', 'MONTH', 'DAY']]
y = df['SALES']

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
models_arima = {}
models_xgboost = {}
predictions_arima = {}
predictions_xgboost = {}
evaluation_metrics_arima = {}
evaluation_metrics_xgboost = {}

In [24]:
import matplotlib.pyplot as plt

In [26]:
for group_name, group_data in df.groupby('ITM_SBRND_CD'):
    # Skip groups with fewer than 2 samples
    if len(group_data) < 2:
        continue


    group_data = group_data[['DATE_FIELD', 'SALES']]
    group_data.set_index('DATE_FIELD', inplace=True)


    train_size = int(0.8 * len(group_data))
    train, test = group_data.iloc[:train_size], group_data.iloc[train_size:]


    model_arima = ARIMA(train, order=(5, 1, 0))  # Example order, you may need to adjust
    model_arima_fit = model_arima.fit()


    pred_arima = model_arima_fit.forecast(steps=len(test)).values
    pred_arima = pred_arima[0]


    models_arima[group_name] = model_arima_fit
    predictions_arima[group_name] = {'Actual': test.values.flatten(), 'Predicted': pred_arima}


    model_xgboost = XGBRegressor(n_estimators=100, objective='reg:squarederror')  # Example parameters, you may need to adjust
    model_xgboost.fit(np.arange(len(train)).reshape(-1, 1), train.values.flatten())


    pred_xgboost = model_xgboost.predict(np.arange(len(train), len(train) + len(test)).reshape(-1, 1))

    models_xgboost[group_name] = model_xgboost
    predictions_xgboost[group_name] = {'Actual': test.values.flatten(), 'Predicted': pred_xgboost}

   
    mse_arima = mean_squared_error(test, pred_arima)
    rmse_arima = np.sqrt(mse_arima)
    r2_arima = r2_score(test, pred_arima)
    smape_arima = 100 * np.mean(2 * np.abs(pred_arima - test) / (np.abs(pred_arima) + np.abs(test)))

    evaluation_metrics_arima[group_name] = {'RMSE': rmse_arima, 'R2': r2_arima, 'SMAPE': smape_arima}

    mse_xgboost = mean_squared_error(test, pred_xgboost)
    rmse_xgboost = np.sqrt(mse_xgboost)
    r2_xgboost = r2_score(test, pred_xgboost)
    smape_xgboost = 100 * np.mean(2 * np.abs(pred_xgboost - test) / (np.abs(pred_xgboost) + np.abs(test)))

    evaluation_metrics_xgboost[group_name] = {'RMSE': rmse_xgboost, 'R2': r2_xgboost, 'SMAPE': smape_xgboost}

    print(f'SUBBRND_CODE: {group_name} - ARIMA - RMSE: {rmse_arima:.2f}, R2: {r2_arima:.4f}, SMAPE: {smape_arima:.2f}')
    print(f'SUBBRND_CODE: {group_name} - XGBoost - RMSE: {rmse_xgboost:.2f}, R2: {r2_xgboost:.4f}, SMAPE: {smape_xgboost:.2f}')


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


InvalidParameterError: The 'y_pred' parameter of mean_squared_error must be an array-like. Got 34561.11404396663 instead.

In [None]:
}