Dataset Description
You are provided with historical sales data for 1,115 Rossmann stores. The task is to forecast the "Sales" column for the test set. Note that some stores in the dataset were temporarily closed for refurbishment.

Files
train.csv - historical data including Sales
test.csv - historical data excluding Sales
sample_submission.csv - a sample submission file in the correct format
store.csv - supplemental information about the stores
Data fields
Most of the fields are self-explanatory. The following are descriptions for those that aren't.

Id - an Id that represents a (Store, Date) duple within the test set
Store - a unique Id for each store
Sales - the turnover for any given day (this is what you are predicting)
Customers - the number of customers on a given day
Open - an indicator for whether the store was open: 0 = closed, 1 = open
StateHoliday - indicates a state holiday. Normally all stores, with few exceptions, are closed on state holidays. Note that all schools are closed on public holidays and weekends. a = public holiday, b = Easter holiday, c = Christmas, 0 = None
SchoolHoliday - indicates if the (Store, Date) was affected by the closure of public schools
StoreType - differentiates between 4 different store models: a, b, c, d
Assortment - describes an assortment level: a = basic, b = extra, c = extended
CompetitionDistance - distance in meters to the nearest competitor store
CompetitionOpenSince[Month/Year] - gives the approximate year and month of the time the nearest competitor was opened
Promo - indicates whether a store is running a promo on that day
Promo2 - Promo2 is a continuing and consecutive promotion for some stores: 0 = store is not participating, 1 = store is participating
Promo2Since[Year/Week] - describes the year and calendar week when the store started participating in Promo2
PromoInterval - describes the consecutive intervals at which Promo2 is started, naming the months in which the promotion starts anew. E.g. "Feb,May,Aug,Nov" means each round starts in February, May, August, November of any given year for that store


In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
from prophet import Prophet
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings('ignore')

# Set visualization style
sns.set(style='whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
!pip install prophet==1.1.1

In [None]:
# Read the training history and the per-store metadata from /content
train = pd.read_csv('/content/train.csv')
store = pd.read_csv('/content/store.csv')

# Preview the store table (last expression, so the notebook renders it)
print('First 5 rows of store dataset:\n')
store.head()

In [None]:
print('First 5 rows of train dataset')
train.head()

In [None]:
# Display basic info
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line
# after each report.
print("Store Dataset Info:")
store.info()
print("\nTrain Dataset Info:")
train.info()

In [None]:
# Merge on 'Store' column: attach the store attributes to every training row.
# validate='many_to_one' makes pandas raise MergeError if the store table ever
# contains a duplicated Store id, which would otherwise silently duplicate
# training rows in the left join.
data = pd.merge(train, store, on='Store', how='left', validate='many_to_one')
print(f"\nMerged Dataset Shape: {data.shape}")
data.head()

## Data Cleaning
><font color=cyan> Handle Missing Values

In [None]:
# Check missing values
print(data.isnull().sum())

In [None]:
# - CompetitionDistance: impute with the column median.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: that form is chained assignment, which recent pandas
# versions warn about and which leaves `data` unchanged under Copy-on-Write.
data['CompetitionDistance'] = data['CompetitionDistance'].fillna(
    data['CompetitionDistance'].median()
)

In [None]:
# - CompetitionOpenSinceMonth/Year: fill with 0 (assuming no competition if missing).
# A per-column dict passed to DataFrame.fillna replaces the chained
# `data[col].fillna(..., inplace=True)` calls, which recent pandas versions
# warn about and which do not modify `data` under Copy-on-Write.
data = data.fillna({
    'CompetitionOpenSinceMonth': 0,
    'CompetitionOpenSinceYear': 0,
})

In [None]:
# - Promo2SinceWeek/Year/Interval: fill with 0 or 'None' for non-participants.
# A per-column dict passed to DataFrame.fillna replaces the chained
# `data[col].fillna(..., inplace=True)` calls, which recent pandas versions
# warn about and which do not modify `data` under Copy-on-Write.
data = data.fillna({
    'Promo2SinceWeek': 0,
    'Promo2SinceYear': 0,
    'PromoInterval': 'None',
})

In [None]:
# Keep only the rows recorded while the store was open (Open == 1); the Open
# column is constant afterwards, so it is dropped.
is_open = data['Open'].eq(1)
data = data.loc[is_open].drop(columns=['Open'])

In [None]:
# Parse Date into datetime64, then derive the calendar features from it
data = data.assign(Date=pd.to_datetime(data['Date']))
data = data.assign(
    Weekday=data['Date'].dt.weekday,         # 0=Monday
    Month=data['Date'].dt.month,
    Year=data['Date'].dt.year,
    PromoPeriod=data['Promo'].astype(int),   # integer copy of the Promo flag
)

In [None]:
# Discard any row that still lacks a Sales or Customers value, then report the
# resulting shape and the per-column null counts as a sanity check.
required_cols = ['Sales', 'Customers']
data = data.dropna(subset=required_cols)
print(f"After Cleaning: {data.shape}")
remaining_nulls = data.isnull().sum()
print(remaining_nulls)

## Exploratory Data Analysis (EDA)
><font color=cyan>Sales Trends

In [None]:
# - Overall sales over time: sum Sales across all stores for each date
daily_total_sales = data.groupby('Date')['Sales'].sum()
fig, ax = plt.subplots(figsize=(14, 7))
daily_total_sales.plot(ax=ax)
ax.set_title('Total Daily Sales Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Sales')
plt.show()

In [None]:
# - Sales by weekday: mean Sales for each weekday index (0=Monday)
weekday_sales = data.groupby('Weekday')['Sales'].mean()
weekday_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=weekday_sales.index, y=weekday_sales.values, palette='viridis', ax=ax)
ax.set_title('Average Sales by Weekday')
ax.set_xlabel('Weekday (0=Monday)')
ax.set_ylabel('Average Sales')
ax.set_xticks(range(7))
ax.set_xticklabels(weekday_labels)
plt.show()

In [None]:
# - Sales by month (seasonal trends): mean Sales for each calendar month
monthly_sales = data.groupby('Month')['Sales'].mean()

fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=monthly_sales.index, y=monthly_sales.values, marker='o', ax=ax)
ax.set_title('Average Sales by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Average Sales')
ax.set_xticks(range(1, 13))
plt.show()

><font color=cyan>Impact of Promotions on Sales

In [None]:
# Box plot comparing the Sales distribution on promo vs non-promo days
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Promo', y='Sales', data=data, ax=ax)
ax.set_title('Sales Distribution by Promotion Status')
ax.set_xlabel('Promotion (0=No, 1=Yes)')
ax.set_ylabel('Sales')
plt.show()

><font color=cyan>Correlation Heatmap

In [None]:
# Pairwise correlations between the key numeric columns, shown as a heatmap
numeric_cols = [
    'Sales', 'Customers', 'Promo', 'SchoolHoliday',
    'CompetitionDistance', 'Weekday', 'Month',
]
corr_matrix = data[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap of Key Features')
plt.show()

In [None]:
# Mean Sales for each StoreType, drawn as a bar chart
store_type_sales = data.groupby('StoreType')['Sales'].mean()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=store_type_sales.index, y=store_type_sales.values, palette='Set2', ax=ax)
ax.set_title('Average Sales by Store Type')
ax.set_xlabel('Store Type')
ax.set_ylabel('Average Sales')
plt.show()

## Feature Engineering

In [None]:
# Lag variables: each store's Sales shifted back by 1, 7 and 30 rows.
# shift(k) moves by k ROWS within a store, so a lag equals "k days ago" only
# where the store has one row for every calendar day.
data = data.sort_values(['Store', 'Date'])
sales_by_store = data.groupby('Store')['Sales']
for lag in (1, 7, 30):
    data[f'Sales_Lag{lag}'] = sales_by_store.shift(lag)

In [None]:
# Rolling averages over the previous 7 and 30 rows of each store.
# Every store's series is shifted by one row BEFORE averaging so that a row's
# own Sales value (the prediction target) never enters its feature; an
# unshifted window ends on the current row and leaks the target into the model
# inputs. transform() returns values aligned to data's index, so no index
# reshaping is needed. Windows count rows, not calendar days.
sales_by_store = data.groupby('Store')['Sales']
data['Rolling_Sales_7'] = sales_by_store.transform(
    lambda s: s.shift(1).rolling(7).mean()
)
data['Rolling_Sales_30'] = sales_by_store.transform(
    lambda s: s.shift(1).rolling(30).mean()
)

In [None]:
# Promo interaction terms: Promo * SchoolHoliday
data['Promo_School_Interact'] = data['Promo'] * data['SchoolHoliday']

In [None]:
# Categorical encoding: map StoreType and Assortment to integer codes.
# The single encoder is refit for each column in turn, so after this cell
# `le` holds the classes of the last column encoded (Assortment).
le = LabelEncoder()
for source_col in ('StoreType', 'Assortment'):
    data[f'{source_col}_Encoded'] = le.fit_transform(data[source_col])

In [None]:
# Remove every row that still contains a NaN in any column (the lag and
# rolling features are undefined for the first rows of each store).
data = data.dropna()
print(f"After Feature Engineering: {data.shape}")
preview_cols = ['Sales', 'Sales_Lag1', 'Rolling_Sales_7', 'StoreType_Encoded']
data[preview_cols].head()

## Modelling

In [None]:
# Prepare features for ML (exclude Date and target)
# X holds the model inputs, y the Sales target; both keep data's row order.
# NOTE(review): 'Customers' is the same-day customer count; it is presumably
# not known when forecasting a future day -- confirm it is available at
# prediction time, otherwise the hold-out scores below are optimistic.
features = ['Customers', 'Promo', 'SchoolHoliday', 'CompetitionDistance', 'Weekday', 'Month',
            'Sales_Lag1', 'Sales_Lag7', 'Sales_Lag30', 'Rolling_Sales_7', 'Rolling_Sales_30',
            'Promo_School_Interact', 'StoreType_Encoded', 'Assortment_Encoded']
X = data[features]
y = data['Sales']

In [None]:
# Split data chronologically: roughly the most recent 20% of rows form the test set.
# `data` is ordered by (Store, Date), so slicing X/y by row position would hold
# out the last stores rather than the latest dates. Instead, take the date that
# sits 80% of the way through the date-sorted rows and split on it: rows
# strictly before that date train the models, rows on or after it test them.
split_idx = int(0.8 * len(data))
split_date = data['Date'].sort_values().iloc[split_idx]
train_mask = (data['Date'] < split_date).to_numpy()
X_train, X_test = X.loc[train_mask], X.loc[~train_mask]
y_train, y_test = y.loc[train_mask], y.loc[~train_mask]

><font color=cyan>Time Series Approach:

In [None]:
# ARIMA and Prophet (on a single store for demo, e.g., Store 1)
store_1 = data[data['Store'] == 1][['Date', 'Sales']].set_index('Date')

In [None]:
# ARIMA: fit on the Store 1 sales series and forecast 30 steps ahead
arima_model = ARIMA(store_1['Sales'], order=(5, 1, 0))  # Example order
arima_fit = arima_model.fit()
arima_pred = arima_fit.forecast(steps=30)

# Plot the last 100 observations, followed by the forecast drawn on the 30
# calendar days after the final observed date.
recent_dates = store_1.index[-100:]
recent_sales = store_1['Sales'][-100:]
forecast_dates = pd.date_range(store_1.index[-1], periods=31, freq='D')[1:]

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(recent_dates, recent_sales, label='Actual')
ax.plot(forecast_dates, arima_pred, label='ARIMA Forecast', color='red')
ax.set_title('ARIMA Forecast for Store 1')
ax.legend()
plt.show()

In [None]:
!sudo apt-get update -y
!sudo apt-get install python3.10 -y
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
!sudo update-alternatives --config python3

In [None]:
!pip uninstall -y prophet fbprophet

In [None]:
!pip install prophet --upgrade --no-cache-dir

In [None]:
from prophet import Prophet
print(Prophet)


In [None]:
# Prophet: expects a frame with a 'ds' date column and a 'y' value column
prophet_columns = {'Date': 'ds', 'Sales': 'y'}
prophet_data = store_1.reset_index().rename(columns=prophet_columns)

prophet_model = Prophet()
prophet_model.fit(prophet_data)

# Extend the history by 30 periods and predict over the whole range
future = prophet_model.make_future_dataframe(periods=30)
forecast = prophet_model.predict(future)

prophet_model.plot(forecast)
plt.title('Prophet Forecast for Store 1')
plt.show()

><font color=cyan>Machine Learning Approach: XGBoost and LightGBM (Global Forecasting)

In [None]:
# XGBoost: squared-error regressor, 100 trees of depth <= 6
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'max_depth': 6,
}
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

In [None]:
# LightGBM: gradient-boosted regressor, 100 trees of depth <= 6
lgb_params = {
    'n_estimators': 100,
    'max_depth': 6,
}
lgb_model = lgb.LGBMRegressor(**lgb_params)
lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_test)

In [None]:
# Evaluation Metrics
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error.

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Args:
        y_true: Array-like of actual values; every value must be >= 0.
        y_pred: Array-like of predicted values with the same shape as
            ``y_true``. Negative predictions are clipped to 0 before the log
            transform, since a regressor can emit small negative values and
            the metric is undefined for them.

    Returns:
        The RMSLE as a float.

    Raises:
        ValueError: If the shapes differ or ``y_true`` has a negative value.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {actual.shape} and {predicted.shape}"
        )
    if (actual < 0).any():
        raise ValueError("RMSLE is undefined when y_true contains negative values")
    # Clip only the predictions: negative targets are a data error (raised
    # above), while negative predictions are an expected model artefact.
    predicted = np.clip(predicted, 0, None)
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(log_diff ** 2)))
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``2 * |y_pred - y_true| / (|y_true| + |y_pred|)``. When both
    the actual and the predicted value are 0 the term is 0/0; it is counted as
    0 (a perfect prediction) so that one such pair does not turn the whole
    mean into NaN.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, broadcastable to ``y_true``.

    Returns:
        The SMAPE as a float in the range [0, 200].
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    abs_error = np.abs(predicted - actual)
    scale = np.abs(actual) + np.abs(predicted)
    # Divide only where the denominator is non-zero; the remaining positions
    # keep the 0.0 from the pre-filled output buffer.
    ratio = np.divide(
        2 * abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0
    )
    return float(100 * np.mean(ratio))
# Report MAE, RMSLE and SMAPE on the held-out set for each model, in the order
# XGBoost then LightGBM, with a blank line between the two reports.
model_predictions = {'XGBoost': xgb_pred, 'LightGBM': lgb_pred}
for position, (model_name, predictions) in enumerate(model_predictions.items()):
    leading_break = "\n" if position else ""
    print(f"{leading_break}{model_name} Evaluation:")
    print(f"MAE: {mean_absolute_error(y_test, predictions):.2f}")
    print(f"RMSLE: {rmsle(y_test, predictions):.4f}")
    print(f"SMAPE: {smape(y_test, predictions):.2f}%")

In [None]:
# Overlay actual and XGBoost-predicted Sales for the first 100 test rows
n_points = 100
actual_head = y_test.values[:n_points]
predicted_head = xgb_pred[:n_points]

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(actual_head, label='Actual', alpha=0.7)
ax.plot(predicted_head, label='XGBoost Predicted', alpha=0.7)
ax.set_title('Actual vs Predicted Sales (XGBoost, First 100 Test Points)')
ax.set_xlabel('Test Sample Index')
ax.set_ylabel('Sales')
ax.legend()
plt.show()