In [1]:
import pandas as pd
# Load merged_df.csv from the processed-data folder.
# low_memory=False makes pandas parse the file in a single pass rather than in
# internal chunks, avoiding mixed-dtype inference at the cost of more RAM.
merged_df = pd.read_csv('../data/processed/merged_df.csv',low_memory=False)

In [2]:
# Bind a second name to the loaded frame. This is an alias, not a copy:
# train_df and merged_df refer to the same DataFrame object.
train_df=merged_df

# Reduce the size

In [3]:
# Fill missing values.
# Replace NaNs in 'sell_price' and 'revenue' with 0.
# The fill is applied to the DataFrame itself (per-column dict) instead of
# `train_df[col].fillna(..., inplace=True)`: that chained form acts on a
# temporary Series, raises a FutureWarning on pandas 2.x and silently leaves
# the frame unchanged once Copy-on-Write is enabled.
train_df.fillna({'sell_price': 0, 'revenue': 0}, inplace=True)

import pandas as pd

class MemoryReducer:
    """Shrink a DataFrame's memory footprint by tightening column dtypes.

    The reducer operates on a private copy of the frame it is given, so the
    caller's DataFrame is never modified.
    """

    # An object column is converted to ``category`` only when fewer than this
    # fraction of its values are distinct.
    _CATEGORY_RATIO = 0.5

    def __init__(self, df):
        """Keep a copy of ``df`` to optimise.

        Args:
            df: The pandas DataFrame whose memory usage should be reduced.
        """
        self.df = df.copy()

    def reduce_memory_usage(self):
        """Downcast numeric columns and categorise repetitive object columns.

        Integer and float columns are downcast with ``pd.to_numeric``; object
        columns become ``category`` when the share of distinct values is below
        50%. Memory usage before and after, and the relative saving, are
        printed.

        Returns:
            The optimised DataFrame (the reducer's internal copy).
        """
        bytes_per_mb = 1024 ** 2
        initial_memory = self.df.memory_usage(deep=True).sum() / bytes_per_mb
        print(f"Initial Memory Usage: {initial_memory:.2f} MB")

        num_total_values = len(self.df)

        for col in self.df.columns:
            col_type = self.df[col].dtype

            if col_type == object:
                # With zero rows the distinct-value ratio is undefined; leave
                # the column untouched instead of dividing by zero.
                if num_total_values == 0:
                    continue
                num_unique_values = len(self.df[col].unique())
                if num_unique_values / num_total_values < self._CATEGORY_RATIO:
                    self.df[col] = self.df[col].astype("category")
            # Real dtype checks instead of substring matching on the dtype
            # name, which would also match e.g. "interval[int64, right]".
            elif pd.api.types.is_integer_dtype(col_type):
                self.df[col] = pd.to_numeric(self.df[col], downcast="integer")
            elif pd.api.types.is_float_dtype(col_type):
                self.df[col] = pd.to_numeric(self.df[col], downcast="float")

        reduced_memory = self.df.memory_usage(deep=True).sum() / bytes_per_mb
        print(f"Reduced Memory Usage: {reduced_memory:.2f} MB")

        # Guard the percentage against a zero baseline.
        if initial_memory > 0:
            reduction_percentage = ((initial_memory - reduced_memory) / initial_memory) * 100
        else:
            reduction_percentage = 0.0
        print(f"Memory Reduced by: {reduction_percentage:.2f}%")

        return self.df



# Run the reducer on train_df. MemoryReducer works on its own copy of the
# frame, so the optimised result is returned as a separate DataFrame.
reducer = MemoryReducer(train_df)
reduced_df = reducer.reduce_memory_usage()


Initial Memory Usage: 28997.45 MB
Reduced Memory Usage: 1261.80 MB
Memory Reduced by: 95.65%


In [4]:
# Continue under the name df_train; this is the reduced frame itself, not a copy.
df_train=reduced_df

# Load and Explore Dataset

In [5]:
# Preview the first rows to sanity-check the columns and their values.
df_train.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,count,date,wm_yr_wk,event_name,event_type,sell_price,revenue,year,month
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,29,0,2011-01-29,11101,0,0,0.0,0.0,2011,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,29,0,2011-01-29,11101,0,0,0.0,0.0,2011,1
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,29,0,2011-01-29,11101,0,0,0.0,0.0,2011,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,29,0,2011-01-29,11101,0,0,0.0,0.0,2011,1
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,29,0,2011-01-29,11101,0,0,0.0,0.0,2011,1


In [6]:
# Inspect the frame's dimensions as a (rows, columns) tuple.
df_train.shape

(47107050, 16)

In [7]:
# Collect the distinct values of the 'year' column.
unique_year_values = df_train['year'].unique()

# unique_year_values is an array of the distinct values in the 'year' column.
print(unique_year_values)

[2011 2012 2013 2014 2015]


# Feature Engineering

In [8]:
#binary encoding
import category_encoders as ce


# Create a BinaryEncoder for the two event columns.
encoder = ce.BinaryEncoder(cols=['event_type', 'event_name'])

# Fit the encoder and transform the frame in one step; the result is a new
# frame in which the two event columns are replaced by encoded columns.
# NOTE(review): the df_train.info() output further down reports the resulting
# event_* columns as int64 - downcasting them may save memory; confirm.
df_encoded = encoder.fit_transform(df_train)

In [9]:
from joblib import dump
# Persist the encoded DataFrame to disk with joblib.
dump(df_encoded, '../models/df_encoded.joblib')

['../models/df_encoded.joblib']

In [10]:
# From here on df_train refers to the encoded frame (the same object as
# df_encoded, not a copy).
df_train=df_encoded

In [11]:
# Fill missing values.
# Replace NaNs in 'sell_price' and 'revenue' with 0.
# The fill is applied to the DataFrame itself (per-column dict) instead of
# `df_train[col].fillna(..., inplace=True)`: that chained form acts on a
# temporary Series, raises a FutureWarning on pandas 2.x and silently leaves
# the frame unchanged once Copy-on-Write is enabled.
df_train.fillna({'sell_price': 0, 'revenue': 0}, inplace=True)

In [12]:
# List the column labels after encoding.
df_train.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'day',
       'count', 'date', 'wm_yr_wk', 'event_name_0', 'event_name_1',
       'event_name_2', 'event_name_3', 'event_name_4', 'event_type_0',
       'event_type_1', 'event_type_2', 'sell_price', 'revenue', 'year',
       'month'],
      dtype='object')

In [13]:
# Remove the 'wm_yr_wk' and 'date' columns from df_train in place.
df_train.drop(columns=['wm_yr_wk', 'date'], inplace=True)

In [14]:
# Summarise column dtypes and total memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47107050 entries, 0 to 47107049
Data columns (total 20 columns):
 #   Column        Dtype   
---  ------        -----   
 0   id            category
 1   item_id       category
 2   dept_id       category
 3   cat_id        category
 4   store_id      category
 5   state_id      category
 6   day           int8    
 7   count         int16   
 8   event_name_0  int64   
 9   event_name_1  int64   
 10  event_name_2  int64   
 11  event_name_3  int64   
 12  event_name_4  int64   
 13  event_type_0  int64   
 14  event_type_1  int64   
 15  event_type_2  int64   
 16  sell_price    float32 
 17  revenue       float32 
 18  year          int16   
 19  month         int8    
dtypes: category(6), float32(2), int16(2), int64(8), int8(2)
memory usage: 3.8 GB


In [15]:
import numpy as np
# Lag features: for every 'id' group, the 'revenue' value found 1, 7, 14 and
# 30 rows earlier within that group.
# NOTE(review): shift() works on row order, so this presumably relies on the
# rows of each id already being in chronological order - confirm upstream.
lags = [1,7,14,30]

# Build the grouping once instead of once per lag (it does not depend on the
# loop variable). observed=True makes the handling of a categorical 'id'
# explicit and avoids relying on the deprecated default.
revenue_by_id = df_train.groupby("id", observed=True)["revenue"]
for lag in lags:
    # Stored as float16 to keep the new columns small, at reduced precision.
    df_train["lag_" + str(lag)] = revenue_by_id.shift(lag).astype(np.float16)

# Release the grouping helper; it is not needed after the loop.
del revenue_by_id


In summary, lag features expose past revenue to the model: for each `id`, `lag_k` holds the revenue recorded `k` rows (days) earlier, for k = 1, 7, 14 and 30. They let us explore whether recent revenue has any influence on the revenue made on the current day, i.e. whether there is a relationship between a store's recent revenue and today's revenue.

In [17]:
# Remove the 'id', 'state_id', 'dept_id' and 'sell_price' columns in place.
df_train.drop(columns=['id', 'state_id', 'dept_id', 'sell_price'], inplace=True)

In [18]:
# Re-check column dtypes and memory usage after the column drops.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47107050 entries, 0 to 47107049
Data columns (total 20 columns):
 #   Column        Dtype   
---  ------        -----   
 0   item_id       category
 1   cat_id        category
 2   store_id      category
 3   day           int8    
 4   count         int16   
 5   event_name_0  int64   
 6   event_name_1  int64   
 7   event_name_2  int64   
 8   event_name_3  int64   
 9   event_name_4  int64   
 10  event_type_0  int64   
 11  event_type_1  int64   
 12  event_type_2  int64   
 13  revenue       float32 
 14  year          int16   
 15  month         int8    
 16  lag_1         float16 
 17  lag_7         float16 
 18  lag_14        float16 
 19  lag_30        float16 
dtypes: category(3), float16(4), float32(1), int16(2), int64(8), int8(2)
memory usage: 3.8 GB


# Split dataset

In [19]:
# Keep only the rows whose 'year' is 2014 or 2015.
# Note: this rebinds train_df, a name used earlier in the notebook for the
# full merged frame.
train_df = df_train[df_train['year'].isin([2014, 2015])]

In [20]:
# Separate the feature matrix (every column except 'revenue') from the
# target ('revenue').
# NOTE(review): x still contains 'count'; if revenue is derived from
# count * sell_price this may leak the target into the features - confirm.
x = train_df.drop(columns=["revenue"])
y = train_df["revenue"]

In [21]:
#split dataset
from sklearn.model_selection import train_test_split
# Split the dataset into two sets: train (80%) and test (20%).
# random_state=42 makes the split reproducible.
# NOTE(review): the split is random rather than chronological; with lag
# features on time-ordered data this may be optimistic - confirm it is intended.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)


# ML

In [22]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error

# Initialize the LightGBM regressor with default settings
lgb_reg = lgb.LGBMRegressor()

# Hyperparameter grid: only the number of boosting rounds is tuned
param_dist = {
    "n_estimators": [250, 350, 450],
}

# 5-fold cross-validation with shuffling; random_state keeps the folds reproducible
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Columns LightGBM should treat as categorical features
categorical_features = ['item_id', 'cat_id', 'store_id']

# Exhaustive search over the grid, scored by (negative) mean absolute error;
# refit=True retrains the best configuration on the whole training set
grid = GridSearchCV(lgb_reg, param_dist, cv=cv, refit=True, scoring='neg_mean_absolute_error', verbose=2)

# categorical_feature is forwarded to the estimator's fit() for every fold
grid.fit(x_train, y_train, categorical_feature=categorical_features)

# Print the best hyperparameters found
print("Best hyperparameters:", grid.best_params_)

# Get the best model (already refitted on the full training set)
best_model = grid.best_estimator_

# Make predictions on the held-out test set
y_pred = best_model.predict(x_test)

# Calculate RMSE on the test set as the square root of the MSE.
# The `squared=False` shortcut of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is not used here.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("Test RMSE:", rmse)


Fitting 5 folds for each of 3 candidates, totalling 15 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.250003 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4191
[LightGBM] [Info] Number of data points in the train set: 9268960, number of used features: 19
[LightGBM] [Info] Start training from score 3.466460
[CV] END ...................................n_estimators=250; total time=  23.9s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.241902 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4187
[LightGBM] [Info] Number of data points in the train set: 9268960, number of used features: 19
[LightGBM] [Info] Start training from score 3.465685
[CV] END ..................................