# Prediction of store Sales

In [2]:
# import basic Libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [3]:
# get working directory
import os, sys
sys.path.append(os.path.abspath('..'))

In [4]:
from scripts.data_summary import summary_statistics

## Dataset

In [5]:
# load preprocessed data
data = pd.read_csv('../data/processed_data.csv', parse_dates=['Date'])

**Dataset overview**

The dataset contains 13 columns and 1017209 rows. Here's a summary of the key columns:

* **Store**: Store ID.
* **DayOfWeek:** Day of the week (1 = Monday, ..., 7 = Sunday).
* **Date:** Date of the record.
* **Sales:** Sales amount (target variable for prediction).
* **Customers:** Number of customers visiting the store.
* **Open:** Whether the store was open (1 = Open, 0 = Closed).
* **Promo:** Whether the store was running a promotion.
* **StateHoliday:** Indicates if the day was a state holiday.( a = public holiday, b = Easter holiday, c = Christmas, 0 = None)
* **SchoolHoliday:** Indicates if the day was a school holiday.
* **StoreType:** Type of store.
* **Assortment:** Level of assortment (a = basic, b = extra, c = extended.).
* **CompetitionDistance:** Distance to the nearest competitor.
* **Promo2:** Continuation of a promotion.

In [6]:
data.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2
0,1,2,2013-01-01,0,0,0,0,a,1,c,a,1270.0,0
1,1,3,2013-01-02,5530,668,1,0,0,1,c,a,1270.0,0
2,1,4,2013-01-03,4327,578,1,0,0,1,c,a,1270.0,0
3,1,5,2013-01-04,4486,619,1,0,0,1,c,a,1270.0,0
4,1,6,2013-01-05,4997,635,1,0,0,1,c,a,1270.0,0


In [7]:
# Data Summary overview
summary_statistics(data)

Shape of the Data
(1017209, 13)


data Summary
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 13 columns):
 #   Column               Non-Null Count    Dtype         
---  ------               --------------    -----         
 0   Store                1017209 non-null  int64         
 1   DayOfWeek            1017209 non-null  int64         
 2   Date                 1017209 non-null  datetime64[ns]
 3   Sales                1017209 non-null  int64         
 4   Customers            1017209 non-null  int64         
 5   Open                 1017209 non-null  int64         
 6   Promo                1017209 non-null  int64         
 7   StateHoliday         1017209 non-null  object        
 8   SchoolHoliday        1017209 non-null  int64         
 9   StoreType            1017209 non-null  object        
 10  Assortment           1017209 non-null  object        
 11  CompetitionDistance  1017209 non-null  float64       
 12  Promo2   

## Preprocessing

* Handle the Date column to extract features like weekdays, weekends, days to holidays, etc.

In [8]:
# Convert 'Date' column to datetime format
data['Date'] = pd.to_datetime(data['Date'])

# Extract weekday; monday = 0... sunday = 6
data['WeekDay'] = data['Date'].dt.weekday

# extract weekend from weekday; 1 if saturday or sunday
data['WeekEnd'] = data['WeekDay'].isin([5,6]).astype(int)

# extract Month
data['Month'] = data['Date'].dt.month

# Extract Year 
data['Year'] = data['Date'].dt.year

# Extract Day
data['Day'] = data['Date'].dt.day

# Extract Beginning of the month, mid-month, and end of the month
data['IsBeginning'] = (data['Day'] <= 10).astype(int)
data['IsMid'] = ((data['Day']>10) & (data['Day']<=20)).astype(int)
data['IsEnd'] = (data['Day']>20).astype(int)

In [9]:
# Drop Date column
data.drop('Date', axis =1, inplace=True)

* Convert object columns in to Categorical then to Numeric.

In [10]:
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category')

In [11]:
# Encode Categorical column
label_encoder = LabelEncoder()
for col in data.select_dtypes(include=['category']).columns:
    data[col] = label_encoder.fit_transform(data[col])

* Scale numerical columns

In [12]:
scaler = StandardScaler()
scaled_col = ['Customers', 'CompetitionDistance']
data[scaled_col] = scaler.fit_transform(data[scaled_col])

Check missing Value

In [13]:
data.isnull().sum()

Store                  0
DayOfWeek              0
Sales                  0
Customers              0
Open                   0
Promo                  0
StateHoliday           0
SchoolHoliday          0
StoreType              0
Assortment             0
CompetitionDistance    0
Promo2                 0
WeekDay                0
WeekEnd                0
Month                  0
Year                   0
Day                    0
IsBeginning            0
IsMid                  0
IsEnd                  0
dtype: int64

* Check processed data

In [14]:
data.head().T

Unnamed: 0,0,1,2,3,4
Store,1.0,1.0,1.0,1.0,1.0
DayOfWeek,2.0,3.0,4.0,5.0,6.0
Sales,0.0,5530.0,4327.0,4486.0,4997.0
Customers,-1.36333,0.07505,-0.118744,-0.03046,0.003992
Open,0.0,1.0,1.0,1.0,1.0
Promo,0.0,0.0,0.0,0.0,0.0
StateHoliday,1.0,0.0,0.0,0.0,0.0
SchoolHoliday,1.0,1.0,1.0,1.0,1.0
StoreType,2.0,2.0,2.0,2.0,2.0
Assortment,0.0,0.0,0.0,0.0,0.0


## Build Model

* Proceed to build machine learning models using sklearn pipelines with the preprocessed dataset. I'll create a pipeline with a `RandomForestRegressor`

In [15]:
# import basic libraries for modeling
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

### **Train a model on Validation Set**

In [17]:
# Separate feature and target
X = data.drop('Sales', axis=1)
y = data['Sales']

# Split the data into Train and Validation Set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create Pipline with RandomForestRegressor
pipline = Pipeline([('model', RandomForestRegressor(random_state=42))])

In [18]:
# Train The pipline
pipline.fit(X_train, y_train)

**Make a prediction for Validation set**

In [19]:
y_val_pred = pipline.predict(X_val)

**Evaluate the predicted validation value**

In [20]:
# Calculate evaluation metrics
val_evaluation = {
    'Metric': ['Mean Squared Error', 'Mean Absolute Error', 'R2 Score'],
    'Value': [
        mean_squared_error(y_val, y_val_pred),
        mean_absolute_error(y_val, y_val_pred),
        r2_score(y_val, y_val_pred)
    ]
}

evaluation_df = pd.DataFrame(val_evaluation)
evaluation_df


Unnamed: 0,Metric,Value
0,Mean Squared Error,196838.758228
1,Mean Absolute Error,270.808695
2,R2 Score,0.986784


**Interpretation:**
* The R² Score of 0.98678 indicates that the model explains 98.4% of the variance in store sales, which is excellent.
* The MAE of 270.8 suggests that, on average, the predictions are off by about 223 sales units.

### **Train model on testset**

In [21]:
# Load processed test Data
test_data = pd.read_csv('../data/processed_test_data.csv', parse_dates=['Date'])

In [22]:
test_data.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2
0,40233,1,6,2015-08-01,1.0,0,0,1,c,a,1270.0,0
1,39377,1,7,2015-08-02,0.0,0,0,1,c,a,1270.0,0
2,38521,1,1,2015-08-03,1.0,1,0,1,c,a,1270.0,0
3,37665,1,2,2015-08-04,1.0,1,0,1,c,a,1270.0,0
4,36809,1,3,2015-08-05,1.0,1,0,1,c,a,1270.0,0


**Note:** There is additional column 'ID' and missing column 'Customers' in test_data
* drop Id column 
* add Customers column in default value 0

In [37]:
test_processed_data = test_data.drop('Id', axis = 1)
test_processed_data.insert(2, 'Customers', 0)

In [39]:
def data_preoricessing(data):
    # Convert 'Date' column to datetime format
    data['Date'] = pd.to_datetime(data['Date'])

    # Extract weekday; monday = 0... sunday = 6
    data['WeekDay'] = data['Date'].dt.weekday

    # extract weekend from weekday; 1 if saturday or sunday
    data['WeekEnd'] = data['WeekDay'].isin([5,6]).astype(int)

    # extract Month
    data['Month'] = data['Date'].dt.month

    # Extract Year 
    data['Year'] = data['Date'].dt.year

    # Extract Day
    data['Day'] = data['Date'].dt.day

    # Extract Beginning of the month, mid-month, and end of the month
    data['IsBeginning'] = (data['Day'] <= 10).astype(int)
    data['IsMid'] = ((data['Day']>10) & (data['Day']<=20)).astype(int)
    data['IsEnd'] = (data['Day']>20).astype(int)

    # Drop Date column
    data.drop('Date', axis =1, inplace=True)

    for col in data.select_dtypes(include=['object']).columns:
        data[col] = data[col].astype('category')

    # Encode Categorical column
    label_encoder = LabelEncoder()
    for col in data.select_dtypes(include=['category']).columns:
        data[col] = label_encoder.fit_transform(data[col])

    scaler = StandardScaler()
    scaled_col = ['CompetitionDistance']
    data[scaled_col] = scaler.fit_transform(data[scaled_col])


data_preoricessing(test_processed_data)

In [41]:
# make prediction on test set
test_data_pred = pipline.predict(test_processed_data)

In [44]:
test_data_pred

array([5234.81, 5207.98, 5963.32, ..., 9426.86, 9245.44, 9285.29],
      shape=(41088,))

# Save result

In [45]:
result = pd.DataFrame()
result['Id'] = test_data['Id']
result['Sales'] = test_data_pred

In [47]:
result.to_csv('../data/Predicted_sales.csv', index=False)