In [1]:
import pandas as pd
import numpy as np
import joblib
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error


In [3]:
train = pd.read_csv(r"D:\PROJECT_6\final_store_train_data.csv")
test = pd.read_csv(r"D:\PROJECT_6\final_store_test_data.csv")


train['Date'] = pd.to_datetime(train['Date'])

train = train.sort_values(['Store', 'Date'])

In [4]:
# Feature Engineering & Feature Extraction

# Date Features
train['Year'] = train['Date'].dt.year
train['Month'] = train['Date'].dt.month
train['Week'] = train['Date'].dt.isocalendar().week.astype(int)
train['Day'] = train['Date'].dt.day
train['DayOfWeek'] = train['Date'].dt.dayofweek
train['IsWeekend'] = train['DayOfWeek'].isin([5,6]).astype(int)
train['Quarter'] = train['Date'].dt.quarter
train['IsMonthStart'] = train['Date'].dt.is_month_start.astype(int)
train['IsMonthEnd'] = train['Date'].dt.is_month_end.astype(int)

In [5]:
#  Lag Features (for Sales)

train['Lag_1'] = train.groupby('Store')['Sales'].shift(1)
train['Lag_7'] = train.groupby('Store')['Sales'].shift(7)
train['Rolling_7'] = train.groupby('Store')['Sales'].shift(1).rolling(7).mean()

train = train.dropna()

In [6]:
#  Remove Closed Stores & Zero Sales

train = train[(train['Open'] == 1) & (train['Sales'] > 0)]

In [7]:
#  Remove Leakage Columns

# Customers not available in test â†’ remove
X = train.drop(['Sales', 'Date', 'Customers'], axis=1)
y = train['Sales']


In [8]:
#  Convert Categorical Variables

X = pd.get_dummies(X, drop_first=True)

In [9]:
#  Train Validation Split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
#  Standardization

standard_scaler = StandardScaler()

X_train_std = standard_scaler.fit_transform(X_train)
X_val_std = standard_scaler.transform(X_val)

#  Normalization (Min-Max Scaling)

minmax_scaler = MinMaxScaler()

X_train_scaled = minmax_scaler.fit_transform(X_train_std)
X_val_scaled = minmax_scaler.transform(X_val_std)


In [11]:
rf_model = RandomForestRegressor()

params = rf_model.get_params()

for key in sorted(params.keys()):
    print(f"{key:<25} : {params[key]}")

bootstrap                 : True
ccp_alpha                 : 0.0
criterion                 : squared_error
max_depth                 : None
max_features              : 1.0
max_leaf_nodes            : None
max_samples               : None
min_impurity_decrease     : 0.0
min_samples_leaf          : 1
min_samples_split         : 2
min_weight_fraction_leaf  : 0.0
monotonic_cst             : None
n_estimators              : 100
n_jobs                    : None
oob_score                 : False
random_state              : None
verbose                   : 0
warm_start                : False


In [12]:
# Train Model

rf_model = RandomForestRegressor(
    n_estimators=500,
    max_depth=30,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_scaled, y_train)

#  MAPE Function

def safe_mape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

# Predict

y_pred = rf_model.predict(X_val_scaled)

# Evalute

mape = safe_mape(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

print("FINAL RANDOM FOREST MODEL ")
print("MAPE :", round(mape, 2), "%")
print("RMSE :", round(rmse, 2))
print("R2 Score :", round(r2, 4))


FINAL RANDOM FOREST MODEL 
MAPE : 8.58 %
RMSE : 733.82
R2 Score : 0.918


In [13]:
print("Minimum Sales:", y_val.min())
print("Number of Zero Sales:", (y_val == 0).sum())


Minimum Sales: 297.0
Number of Zero Sales: 0


In [14]:
# Save trained model
joblib.dump(rf_model, "random_forest_model.pkl")

print("Model saved successfully!")


Model saved successfully!


In [15]:
#  Actual vs Predicted Comparison

results = pd.DataFrame({
    "Actual_Sales": y_val.values,
    "Predicted_Sales": y_pred
})

results["Error"] = results["Actual_Sales"] - results["Predicted_Sales"]
results["Absolute_Error"] = np.abs(results["Error"])
results["Percentage_Error_%"] = (
    results["Absolute_Error"] / results["Actual_Sales"]
) * 100

print("\nSAMPLE PREDICTIONS\n")
print(results.head(10))




SAMPLE PREDICTIONS

   Actual_Sales  Predicted_Sales        Error  Absolute_Error  \
0       10205.0     10100.728402   104.271598      104.271598   
1        6160.0      6747.030776  -587.030776      587.030776   
2       12220.0     13215.898923  -995.898923      995.898923   
3        3043.0      3388.562141  -345.562141      345.562141   
4        7882.0      9532.732605 -1650.732605     1650.732605   
5        4668.0      4976.850582  -308.850582      308.850582   
6        3738.0      4215.286741  -477.286741      477.286741   
7        6075.0      6170.510636   -95.510636       95.510636   
8        2298.0      2356.526144   -58.526144       58.526144   
9        9278.0      8963.916765   314.083235      314.083235   

   Percentage_Error_%  
0            1.021770  
1            9.529720  
2            8.149746  
3           11.355969  
4           20.943068  
5            6.616336  
6           12.768506  
7            1.572192  
8            2.546830  
9            3.385247  

In [16]:
feature_importance = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": rf_model.feature_importances_
})

feature_importance = feature_importance.sort_values(
    by="Importance",
    ascending=False
)

print("\nTOP 15 IMPORTANT FEATURES\n")
print(feature_importance.head(15))




TOP 15 IMPORTANT FEATURES

                      Feature  Importance
21                  Rolling_7    0.418457
19                      Lag_1    0.180073
9                       Promo    0.175336
20                      Lag_7    0.081765
14                        Day    0.027701
7                   DayOfWeek    0.026798
13                       Week    0.022012
1         CompetitionDistance    0.012611
0                       Store    0.009751
5             Promo2SinceWeek    0.006426
6             Promo2SinceYear    0.005152
3    CompetitionOpenSinceYear    0.005039
2   CompetitionOpenSinceMonth    0.004791
12                      Month    0.004453
11                       Year    0.003749


In [17]:
# Save feature importance
feature_importance.to_csv("rf_feature_importance.csv", index=False)
print("RF_feature_importance saved successfully")

RF_feature_importance saved successfully
