In [1]:
!pip install opendatasets --upgrade --quiet
!pip install --upgrade scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m87.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.7.0 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.7.0


In [2]:
import opendatasets as od
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [3]:
import os

In [4]:
dataset_url = 'https://www.kaggle.com/datasets/pratyushakar/rossmann-store-sales'

In [None]:
od.download(dataset_url)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:

In [None]:
data_dir_name = './rossmann-store-sales'

In [None]:
# Show which files were downloaded. A bare expression is only displayed when it is
# the last line of a cell, so the listing is printed explicitly.
print(os.listdir(data_dir_name))

# Build the path portably instead of concatenating strings.
data_csv = os.path.join(data_dir_name, 'train.csv')

# NOTE(review): StateHoliday appears to mix integers and letter codes in this file
# (the notebook later casts it to str before encoding) - reading it as str up front
# avoids a mixed-type object column; confirm against the raw CSV.
train_data = pd.read_csv(data_csv, dtype={'StateHoliday': str})

In [None]:
train_data

In [None]:
data_csv = data_dir_name + '/store.csv'
store_data = pd.read_csv(data_csv)

In [None]:
data_csv = data_dir_name + '/test.csv'
test_data = pd.read_csv(data_csv)

In [None]:
train_data.columns

In [None]:
store_data.columns

In [None]:
train_data.info()

In [None]:
train_data.describe()

In [None]:
# Attach store-level attributes to every daily sales row.
# validate='many_to_one' makes pandas raise if 'Store' is not unique in store_data,
# which would otherwise silently duplicate sales rows in the merged frame.
merge_df = train_data.merge(store_data, how='left', on='Store', validate='many_to_one')

In [None]:
merge_df

In [None]:
merge_df.info()

In [None]:
merge_df.describe()

In [None]:
merge_df.columns

In [None]:
# Attach store-level attributes to every test row.
# validate='many_to_one' makes pandas raise if 'Store' is not unique in store_data,
# which would otherwise silently duplicate test rows in the merged frame.
merge_test_df = test_data.merge(store_data, how='left', on='Store', validate='many_to_one')

In [None]:
merge_test_df.info()

In [None]:
round(merge_df.describe().T,2)

In [None]:
merge_df.duplicated().sum()

In [None]:
merge_df.isnull().sum()

In [None]:
merge_df['Date'] = pd.to_datetime(merge_df.Date)

In [None]:
merge_test_df['Date'] = pd.to_datetime(merge_test_df.Date)

In [None]:
merge_df.Date.min(), merge_df.Date.max()

In [None]:
merge_test_df.Date.min(), merge_test_df.Date.max()

# Exploratory Data Analysis and Visualization
Objectives of exploratory data analysis:


*   Study the distributions of individual columns (uniform, normal, exponential)
*   Detect anomalies or errors in the data (e.g. missing/incorrect values)
*   Study the relationship of the target column with other columns (linear, non-linear, etc.)
*   Gather insights about the problem and the dataset

*   Come up with ideas for preprocessing and feature engineering








In [None]:
sns.histplot(data=merge_df, x='Sales')

In [None]:
merge_df.Open.value_counts()

In [None]:
merge_df.Open.value_counts()

To keep our modeling simple, let's exclude the dates when the store was closed.

In [None]:
merge_df = merge_df[merge_df.Open==1].copy()

In [None]:
sns.histplot(data=merge_df, x='Sales')

Let's explore some other columns.

In [None]:
# Scatter a random subset of rows to keep the plot light.
# random_state fixes the subset so the figure is reproducible on re-run, and the
# sample size is capped at the frame length so sample() cannot raise on small data.
plt.figure(figsize=(18,8))
temp_df = merge_df.sample(min(40000, len(merge_df)), random_state=42)
sns.scatterplot(x=temp_df.Sales, y=temp_df.Customers, hue=temp_df.Date.dt.year, alpha=0.8)
plt.title('Sales vs Customers')
plt.show()

In [None]:
# Scatter a random subset of rows to keep the plot light.
# random_state fixes the subset so the figure is reproducible on re-run, and the
# sample size is capped at the frame length so sample() cannot raise on small data.
plt.figure(figsize=(18,8))
temp_df = merge_df.sample(min(10000, len(merge_df)), random_state=42)
sns.scatterplot(x=temp_df.Store, y=temp_df.Sales, hue=temp_df.Date.dt.year, alpha=0.8)
plt.title('Store vs Sales')
plt.show()

In [None]:
sns.barplot(data=merge_df,x='DayOfWeek', y='Sales')

In [None]:
sns.barplot(data=merge_df,x='Promo', y='Sales');

In [None]:
merge_df

In [None]:
# Rank columns by their correlation with Sales.
# numeric_only=True is required because the frame also holds string and datetime
# columns, on which DataFrame.corr() raises in recent pandas versions.
merge_df.corr(numeric_only=True)['Sales'].sort_values(ascending=False)

In [None]:
# Keep only the numeric columns, then order them by correlation with Sales
# (largest value first) and print the resulting Series.
numeric_df = merge_df.select_dtypes(include=[np.number])
sales_column = numeric_df.corr().loc[:, 'Sales']
correlation_matrix = sales_column.sort_values(ascending=False)
print(correlation_matrix)

# Feature engineering
Feature engineering is the process of creating new features (columns) by transforming or combining existing features, or by incorporating data from external sources.

For example, here features that can be extracted from the "Date" column:


1.   Day of Week
2.   Day of Month
3.   Month
4.   Year
5.   Weekend
6.   Month/Quarter End


In [None]:
# Split the Date timestamp into day-of-month, month and year feature columns.
merge_df = merge_df.assign(
    Day=merge_df['Date'].dt.day,
    Month=merge_df['Date'].dt.month,
    Year=merge_df['Date'].dt.year,
)

In [None]:
# Split the Date timestamp into day-of-month, month and year feature columns,
# producing the same three columns that are added to the training frame.
merge_test_df = merge_test_df.assign(
    Day=merge_test_df['Date'].dt.day,
    Month=merge_test_df['Date'].dt.month,
    Year=merge_test_df['Date'].dt.year,
)

In [None]:
sns.barplot(data=merge_df,x='Year', y='Sales');

In [None]:
sns.barplot(data=merge_df,x='Month', y='Sales')

In [None]:
sns.barplot(data=merge_df, x='Day', y='Sales')

In [None]:
merge_df

# Train/Validation/Test Split

In [None]:
merge_test_df

In [None]:
len(merge_df)

In [None]:
train_size = int(.75* len(merge_df))
train_size

In [None]:
# Order rows chronologically, then cut at the precomputed row count: the earliest
# train_size rows become the training set and the remaining rows the validation set.
sorted_df = merge_df.sort_values(by='Date')
train_df = sorted_df.iloc[:train_size]
val_df = sorted_df.iloc[train_size:]

In [None]:
len(train_df), len(val_df)

In [None]:
train_df

In [None]:
train_df.Date.min(),train_df.Date.max()

In [None]:
val_df.Date.min(), val_df.Date.max()

In [None]:
val_df

In [None]:
train_df.columns

# Input and Target columns

In [None]:
input_cols = ['Store','DayOfWeek','Promo','Day','Month','Year',
              'StateHoliday','StoreType','Assortment']
output_col = ['Sales']

In [None]:
train_df[input_cols]

In [None]:
merge_df[input_cols].nunique()

In [None]:
train_input = train_df[input_cols].copy()
train_target = train_df[output_col].copy()

In [None]:
val_input = val_df[input_cols].copy()
val_target = val_df[output_col].copy()

In [None]:
test_input = merge_test_df[input_cols].copy()

In [None]:
train_input.info()

Note that some columns can be treated as either numeric or categorical, and it's up to you to decide how you want to deal with them.

In [None]:
numeric_cols = ['Store','Day','Month','Year']
categorical_cols = ['DayOfWeek','Promo','StateHoliday','StoreType','Assortment']

# Imputation, Scaling and Encoding

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [None]:
imputer = SimpleImputer(strategy='mean').fit(train_input[numeric_cols])

In [None]:
# Fill missing numeric values in every split with the imputer that was fitted
# on the training inputs.
for split_inputs in (train_input, val_input, test_input):
    split_inputs[numeric_cols] = imputer.transform(split_inputs[numeric_cols])

In [None]:
scaler = MinMaxScaler().fit(train_input[numeric_cols])

In [None]:
# Rescale the numeric columns of every split with the scaler that was fitted
# on the training inputs.
for split_inputs in (train_input, val_input, test_input):
    split_inputs[numeric_cols] = scaler.transform(split_inputs[numeric_cols])

In [None]:
# Fit the one-hot encoder on a string-typed copy of the categorical columns.
# OneHotEncoder requires each column to be uniformly strings or uniformly numbers;
# NOTE(review): StateHoliday presumably holds both after CSV parsing - confirm.
# sparse_output=False returns a dense array; handle_unknown='ignore' encodes
# categories unseen during fit as all zeros instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(
    train_input[categorical_cols].astype(str)
)
encoded_list = list(encoder.get_feature_names_out(categorical_cols))

In [None]:
# Cast the categorical columns of every split to str so each column holds a
# single type before it reaches the encoder.
for split_inputs in (train_input, val_input, test_input):
    split_inputs[categorical_cols] = split_inputs[categorical_cols].astype(str)

# Fit the encoder on the training split and record the generated column names.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(train_input[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

In [None]:
# Add the one-hot columns to every split using the encoder fitted on the
# training inputs.
for split_inputs in (train_input, val_input, test_input):
    split_inputs[encoded_cols] = encoder.transform(split_inputs[categorical_cols])

In [None]:
train_input[numeric_cols + encoded_cols]

In [None]:
x_train = train_input[numeric_cols + encoded_cols].copy()
x_val = val_input[numeric_cols + encoded_cols].copy()
x_test = test_input[numeric_cols + encoded_cols].copy()

In [None]:
x_train

# Create quick and easy baseline models to benchmark future models

A quick baseline model helps establish the minimum score any ML model you train should achieve

# Fixed/Random Guess

Let's define a model that always returns the mean value of Sales as the prediction.

In [None]:
train_target.mean()

In [None]:
def return_mean(inputs, fill_value=None):
  """Baseline "model" that predicts one constant for every input row.

  Args:
    inputs: Any sized collection of input rows; only its length is used.
    fill_value: Constant to predict. When None (the default) the mean of the
      training target is used, so the baseline never sees validation data.

  Returns:
    A 1-D NumPy array of length ``len(inputs)`` filled with the constant.
  """
  if fill_value is None:
    # Use the training split only; averaging the full dataset would leak
    # validation rows into the baseline.
    fill_value = float(np.mean(np.asarray(train_target, dtype=float)))
  return np.full(len(inputs), fill_value)

In [None]:
train_preds = return_mean(x_train)


In [None]:
train_preds

Let's evaluate this baseline using the RMSE score.

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Training RMSE of the current train_preds.
# scikit-learn's signature is mean_squared_error(y_true, y_pred): truth goes first.
mse = mean_squared_error(train_target, train_preds)
rmse = np.sqrt(mse)
rmse

In [None]:
# Validation RMSE of the constant baseline.
# scikit-learn's signature is mean_squared_error(y_true, y_pred): truth goes first.
mse = mean_squared_error(val_target, return_mean(x_val))
rmse = np.sqrt(mse)
rmse

The model is off by about $3000 on average.

Let's try another model, which makes a random guess between the lowest and highest sale.

In [None]:
def guess_random(inputs, lo=None, hi=None, seed=42):
  """Baseline "model" that predicts a uniform random value for every input row.

  Args:
    inputs: Any sized collection of input rows; only its length is used.
    lo: Lower bound of the guesses. Defaults to the smallest training target.
    hi: Upper bound of the guesses. Defaults to the largest training target.
    seed: Seed for the random generator, so repeated runs give the same guesses.
      Pass None to draw fresh, unseeded values.

  Returns:
    A 1-D NumPy array of length ``len(inputs)`` with values in ``[lo, hi)``.
  """
  if lo is None or hi is None:
    # Take the range from the training split only, so the baseline does not
    # use information from the validation rows.
    observed = np.asarray(train_target, dtype=float)
    lo = observed.min() if lo is None else lo
    hi = observed.max() if hi is None else hi
  rng = np.random.default_rng(seed)
  return rng.random(len(inputs)) * (hi - lo) + lo

In [None]:
train_preds = guess_random(x_train)
train_preds

In [None]:
# Training RMSE of the current train_preds.
# scikit-learn's signature is mean_squared_error(y_true, y_pred): truth goes first.
mse = mean_squared_error(train_target, train_preds)
rmse = np.sqrt(mse)
rmse

In [None]:
# Validation RMSE of the random-guess baseline.
# scikit-learn's signature is mean_squared_error(y_true, y_pred): truth goes first.
mse = mean_squared_error(val_target, guess_random(x_val))
rmse = np.sqrt(mse)
rmse

# Baseline ML model
Let's train a simple LinearRegression model, with no customization.

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
linreg = LinearRegression()
linreg.fit(x_train,train_target)

In [None]:
train_preds = linreg.predict(x_train)
train_preds

In [None]:
# Training RMSE of the current train_preds.
# scikit-learn's signature is mean_squared_error(y_true, y_pred): truth goes first.
mse = mean_squared_error(train_target, train_preds)
rmse = np.sqrt(mse)
rmse

In [None]:
val_preds = linreg.predict(x_val)
val_preds

In [None]:
# Validation RMSE of the current val_preds.
# scikit-learn's signature is mean_squared_error(y_true, y_pred): truth goes first.
mse = mean_squared_error(val_target, val_preds)
rmse = np.sqrt(mse)
rmse

Note that a simple linear regression model is not much better than our fixed baseline model, which always predicts the mean.
Based on the above baselines, we now know that any model we train should ideally have an RMSE score lower than $2800. This baseline can also be conveyed to other stakeholders to give a sense of whether the range of loss makes sense.

Let's define a function try_model, which takes a model, then performs training and evaluation.

In [None]:
def try_model(model):
  """Fit ``model`` on the training split and score it on train and validation.

  Reads the notebook-level ``x_train``, ``train_target``, ``x_val`` and
  ``val_target`` variables.

  Args:
    model: An estimator exposing ``fit(X, y)`` and ``predict(X)``.

  Returns:
    A ``(train_rmse, val_rmse)`` tuple of root-mean-squared errors.
  """
  # The targets are single-column frames; flatten them to 1-D because several
  # scikit-learn estimators emit a DataConversionWarning for a column-vector y.
  y_train = np.ravel(train_target)
  y_val = np.ravel(val_target)

  # Fit the model
  model.fit(x_train, y_train)

  # Generate predictions
  train_preds = model.predict(x_train)
  val_preds = model.predict(x_val)

  # Compute RMSE (scikit-learn expects y_true first, then y_pred)
  train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
  val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
  return train_rmse, val_rmse



# Linear Model

In [None]:
from sklearn.linear_model import LinearRegression,Ridge, Lasso, SGDRegressor,ElasticNet

In [None]:
try_model(LinearRegression())


In [None]:
try_model(Ridge())


In [None]:
try_model(Lasso())

In [None]:
# SGD shuffles the training data between epochs; fix the seed so the reported
# scores are reproducible across runs.
try_model(SGDRegressor(random_state=42))

In [None]:
try_model(ElasticNet())

# Tree Based Models

In [None]:
from sklearn.tree import DecisionTreeRegressor, plot_tree

In [None]:
tree = DecisionTreeRegressor(max_leaf_nodes=32, random_state=42)
try_model(tree)

It seems the decision tree performs much better than the linear models.

In [None]:
plt.figure(figsize=(40,20))
plot_tree(tree, max_depth=3,filled=True,feature_names=numeric_cols+encoded_cols);

Let's try a Random Forest.

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
forest = RandomForestRegressor(n_jobs=-1,random_state=42,n_estimators=100)
try_model(forest)