<a href="https://colab.research.google.com/github/Givi-Modebadze/ML_Final_Project/blob/main/experiments/LightGBM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

FOLDERNAME = 'ML_Final_Project/walmart-recruiting-store-sales-forecasting'
assert FOLDERNAME is not None, "[!] Enter the foldername."

import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

%cd /content/drive/My\ Drive/$FOLDERNAME/

Mounted at /content/drive
/content/drive/My Drive/ML_Final_Project/walmart-recruiting-store-sales-forecasting


In [2]:
import numpy as np
import pandas as pd

df = pd.read_csv('train.csv')
stores = pd.read_csv('stores.csv')
features = pd.read_csv('features.csv')
df = df.merge(stores, how='left', left_on='Store', right_on='Store')
df = df.merge(features, how='left', left_on=['Store','Date'], right_on=['Store','Date'])
df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday_x,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday_y
0,1,1,2010-02-05,24924.5,False,A,151315,42.31,2.572,,,,,,211.096358,8.106,False
1,1,1,2010-02-12,46039.49,True,A,151315,38.51,2.548,,,,,,211.24217,8.106,True
2,1,1,2010-02-19,41595.55,False,A,151315,39.93,2.514,,,,,,211.289143,8.106,False
3,1,1,2010-02-26,19403.54,False,A,151315,46.63,2.561,,,,,,211.319643,8.106,False
4,1,1,2010-03-05,21827.9,False,A,151315,46.5,2.625,,,,,,211.350143,8.106,False


In [3]:
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Week'] = df['Date'].dt.isocalendar().week
df['Quarter'] = df['Date'].dt.quarter
df['DayOfYear'] = df['Date'].dt.dayofyear

In [4]:
markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
for col in markdown_cols:
    df[col] = df[col].fillna(0)

In [5]:
df['CPI'] = df['CPI'].fillna(df['CPI'].median())
df['Unemployment'] = df['Unemployment'].fillna(df['Unemployment'].median())

In [6]:
df['IsHoliday_y'] = df['IsHoliday_y'].astype(int)

df['IsSuperbowl'] = df['Date'].dt.strftime('%Y-%m-%d').isin([
    '2010-02-12', '2011-02-11', '2012-02-10'
]).astype(int)

df['IsLaborDay'] = df['Date'].dt.strftime('%Y-%m-%d').isin([
    '2010-09-10', '2011-09-09', '2012-09-07'
]).astype(int)

df['IsThanksgiving'] = df['Date'].dt.strftime('%Y-%m-%d').isin([
    '2010-11-26', '2011-11-25', '2012-11-23'
]).astype(int)

df['IsChristmas'] = df['Date'].dt.strftime('%Y-%m-%d').isin([
    '2010-12-31', '2011-12-30', '2012-12-28'
]).astype(int)

In [7]:
df = df.sort_values(['Store', 'Dept', 'Date'])

In [8]:
df['Sales_Rolling_Mean_4'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(
    lambda x: x.rolling(4, min_periods=1).mean()
)
df['Sales_Rolling_Std_4'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(
    lambda x: x.rolling(4, min_periods=1).std()
)

In [9]:
from sklearn.preprocessing import LabelEncoder

le_type = LabelEncoder()
df['Type_encoded'] = le_type.fit_transform(df['Type'])

df['Store_Dept'] = df['Store'] * 100 + df['Dept']

In [10]:
dept_stats = df.groupby('Dept')['Weekly_Sales'].agg(['mean', 'std']).reset_index()
dept_stats.columns = ['Dept', 'Dept_Sales_Mean', 'Dept_Sales_Std']
df = df.merge(dept_stats, on='Dept', how='left')

store_stats = df.groupby('Store')['Weekly_Sales'].agg(['mean', 'std']).reset_index()
store_stats.columns = ['Store', 'Store_Sales_Mean', 'Store_Sales_Std']
df = df.merge(store_stats, on='Store', how='left')

In [11]:
feature_cols = [
    'Store', 'Dept', 'Size', 'Type_encoded', 'Store_Dept',
    'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'Year', 'Month', 'Week', 'Quarter', 'DayOfYear',
    'IsHoliday_y', 'IsSuperbowl', 'IsLaborDay', 'IsThanksgiving', 'IsChristmas',
    'Sales_Rolling_Mean_4', 'Sales_Rolling_Std_4',
    'Dept_Sales_Mean', 'Dept_Sales_Std',
    'Store_Sales_Mean', 'Store_Sales_Std'
]

In [12]:
X = df[feature_cols]
y = df['Weekly_Sales']

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=df['Store']
)

In [14]:
import lightgbm as lgb

params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1
}
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [15]:
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'val'],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

Training until validation scores don't improve for 50 rounds
[100]	train's l1: 1539.46	val's l1: 1548.63
[200]	train's l1: 1305.25	val's l1: 1324.87
[300]	train's l1: 1213.03	val's l1: 1240.39
[400]	train's l1: 1151.21	val's l1: 1186.14
[500]	train's l1: 1109.23	val's l1: 1150.09
[600]	train's l1: 1076.27	val's l1: 1122.36
[700]	train's l1: 1046.38	val's l1: 1098.4
[800]	train's l1: 1024.8	val's l1: 1082.56
[900]	train's l1: 1003.89	val's l1: 1067.19
[1000]	train's l1: 985.603	val's l1: 1053.08
Did not meet early stopping. Best iteration is:
[1000]	train's l1: 985.603	val's l1: 1053.08


In [16]:
from sklearn.metrics import mean_absolute_error

y_pred = model.predict(X_val, num_iteration=model.best_iteration)
mae = mean_absolute_error(y_val, y_pred)
print(f"Validation MAE: {mae:.2f}")

Validation MAE: 1053.08


In [17]:
holiday_mask = X_val['IsHoliday_y'] == 1
holiday_mae = mean_absolute_error(y_val[holiday_mask], y_pred[holiday_mask]) if holiday_mask.sum() > 0 else 0
non_holiday_mae = mean_absolute_error(y_val[~holiday_mask], y_pred[~holiday_mask])

wmae = (5 * holiday_mae * holiday_mask.sum() + non_holiday_mae * (~holiday_mask).sum()) / (5 * holiday_mask.sum() + (~holiday_mask).sum())
print(f"Approximate WMAE: {wmae:.2f}")

Approximate WMAE: 1112.89


In [18]:
test = pd.read_csv("test.csv")
stores = pd.read_csv("stores.csv")
features = pd.read_csv("features.csv")


test = test.merge(stores, on="Store", how="left")
test = test.merge(features, on=["Store", "Date"], how="left")

In [19]:
test['Date'] = pd.to_datetime(test['Date'])
test['Year'] = test['Date'].dt.year
test['Month'] = test['Date'].dt.month
test['Week'] = test['Date'].dt.isocalendar().week
test['Quarter'] = test['Date'].dt.quarter
test['DayOfYear'] = test['Date'].dt.dayofyear

markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
for col in markdown_cols:
    test[col] = test[col].fillna(0)

test['CPI'] = test['CPI'].fillna(df['CPI'].median())  # use training median
test['Unemployment'] = test['Unemployment'].fillna(df['Unemployment'].median())

test['IsHoliday_y'] = test['IsHoliday_y'].astype(int)

test['IsSuperbowl'] = test['Date'].dt.strftime('%Y-%m-%d').isin([
    '2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'
]).astype(int)

test['IsLaborDay'] = test['Date'].dt.strftime('%Y-%m-%d').isin([
    '2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'
]).astype(int)

test['IsThanksgiving'] = test['Date'].dt.strftime('%Y-%m-%d').isin([
    '2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'
]).astype(int)

test['IsChristmas'] = test['Date'].dt.strftime('%Y-%m-%d').isin([
    '2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'
]).astype(int)


In [20]:
test['Type_encoded'] = le_type.transform(test['Type'])

test['Store_Dept'] = test['Store'] * 100 + test['Dept']

test['Sales_Rolling_Mean_4'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(lambda x: x.rolling(4, 1).mean()).mean()
test['Sales_Rolling_Std_4'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(lambda x: x.rolling(4, 1).std()).mean()

test = test.merge(dept_stats, on='Dept', how='left')

test = test.merge(store_stats, on='Store', how='left')


In [21]:
X_submission = test[feature_cols]

y_submission = model.predict(X_submission, num_iteration=model.best_iteration)

In [22]:
submission = pd.DataFrame({
    "Id": test["Store"].astype(str) + "_" +
          test["Dept"].astype(str) + "_" +
          test["Date"].astype(str),
    "Weekly_Sales": y_submission
})

submission.to_csv("submission.csv", index=False)