In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import joblib
from datetime import datetime

In [None]:
# Step 1: Preprocessing
# Directory that holds the CSV files read by the loading cell. The path is an
# absolute location on one Windows machine, so it has to be edited before the
# notebook is run anywhere else.
# A raw string cannot end in a single backslash, hence the trailing `\\`, which
# leaves two literal backslashes at the end of the value.
data_path = r'C:\Users\hp\Desktop\KAIM\Week 4\rossmann-store-sales\\'

In [None]:
# Load datasets
# train.csv and test.csv share the same reader options: 'StateHoliday' is forced
# to str so the column gets a single dtype, and low_memory=False makes pandas
# infer dtypes from the whole file at once.
sales_csv_options = {'dtype': {'StateHoliday': str}, 'low_memory': False}

train = pd.read_csv(data_path + 'train.csv', **sales_csv_options)
store = pd.read_csv(data_path + 'store.csv')
test = pd.read_csv(data_path + 'test.csv', **sales_csv_options)

In [None]:
# Merge store data with train and test datasets
# A left join on 'Store' keeps every row of the sales frame and attaches the
# matching store columns to it.
store_join = {'on': 'Store', 'how': 'left'}
train = train.merge(store, **store_join)
test = test.merge(store, **store_join)

In [None]:
# Fill missing values
# The CompetitionDistance fill value is learned from the training data only and
# reused for the test set, so both frames are imputed with the same number and
# no statistic computed on the test set feeds into preprocessing.
competition_distance_median = train['CompetitionDistance'].median()

# Column -> replacement for missing entries; the same mapping is applied to
# both frames so their preprocessing cannot diverge.
fill_values = {
    'CompetitionDistance': competition_distance_median,
    'Promo2SinceYear': 0,
    'Promo2SinceWeek': 0,
    'PromoInterval': 'None',
}

for frame in (train, test):
    for column, fill_value in fill_values.items():
        frame[column] = frame[column].fillna(fill_value)

In [None]:
# Step 2: Feature Engineering - Extracting date-related features from 'Date' column
# The same derivation runs on both frames so train and test cannot drift apart.
for frame in (train, test):
    # Convert 'Date' to datetime format and extract features
    frame['Date'] = pd.to_datetime(frame['Date'])
    frame['Year'] = frame['Date'].dt.year
    frame['Month'] = frame['Date'].dt.month
    frame['Day'] = frame['Date'].dt.day
    # Stores pandas' 0 (Monday) .. 6 (Sunday) numbering in 'DayOfWeek',
    # replacing any column of that name already present in the frame.
    frame['DayOfWeek'] = frame['Date'].dt.dayofweek
    # 1 for Monday-Friday, 0 for Saturday/Sunday. A vectorized comparison is
    # used instead of a per-row Python lambda; int64 matches what the lambda
    # version produced.
    frame['Weekday'] = (frame['DayOfWeek'] < 5).astype('int64')

In [None]:
# Step 3: Label encoding for categorical variables
# Create one LabelEncoder instance. The following cells fit and re-fit this same
# object for several columns, so at any point it only holds the mapping of the
# column it was fitted on most recently.
label_encoder = LabelEncoder()

In [None]:
# Combine train and test 'StateHoliday' values to fit the encoder
# Fitting on the values of both frames means every label that occurs in either
# one is known to the encoder before it is applied.
state_holiday_columns = [frame['StateHoliday'] for frame in (train, test)]
all_state_holidays = np.concatenate(state_holiday_columns)
label_encoder.fit(all_state_holidays)

In [None]:
# Apply the fitted encoder to both datasets
# Replaces the string labels in 'StateHoliday' with the integer codes of the
# encoder fitted on the combined train/test values.
for frame in (train, test):
    frame['StateHoliday'] = label_encoder.transform(frame['StateHoliday'])

In [None]:
# Encoding other categorical variables in both train and test sets
# For each column the shared encoder is re-fitted on the training values and the
# resulting mapping is then applied to the test values.
for column in ('StoreType', 'Assortment'):
    train[column] = label_encoder.fit_transform(train[column])
    test[column] = label_encoder.transform(test[column])

In [None]:
# Step 4: Feature Selection
# Columns used as model inputs; their order here is also the column order of X.
features = [
    'Store',
    'Promo',
    'StateHoliday',
    'SchoolHoliday',
    'DayOfWeek',
    'Month',
    'CompetitionDistance',
    'StoreType',
    'Assortment',
]

# X holds the selected input columns, y the 'Sales' target.
X, y = train[features], train['Sales']

In [None]:
# Train-test split
# 20% of the rows are held out for validation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Convert data to NumPy arrays to avoid feature name mismatch warnings
# Both feature frames are converted the same way; column order is preserved.
X_train_np, X_val_np = (frame.to_numpy() for frame in (X_train, X_val))

In [None]:
# Step 5: Create Pipeline and Model Training
# The pipeline standardizes the inputs and then fits a 100-tree random forest.
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Scaling step
    # n_jobs=-1 builds and queries the trees on all CPU cores; the fixed
    # random_state keeps the fitted forest the same as a single-core run.
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))  # Random Forest Model
])

In [None]:
# Fit the pipeline to the training data (NumPy arrays)
# Fits every pipeline step in order on X_train_np with y_train as the target.
# fit() returns the pipeline itself, which the notebook shows as this cell's output.
pipeline.fit(X_train_np, y_train)

In [None]:
# Make predictions (using NumPy array for X_val)
# Going through the pipeline applies its preprocessing step(s) to X_val_np before
# the final estimator predicts; y_pred has one value per validation row.
y_pred = pipeline.predict(X_val_np)

In [None]:
# Evaluate the model using Mean Squared Error (MSE)
# Compares the held-out targets with the pipeline's predictions for those rows.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

In [None]:
# Step 6: Post-Prediction Analysis
# Feature importance
# Pull the fitted forest out of the pipeline by its step name and read its
# importance scores (one per input column).
rf_model = pipeline.named_steps['rf']
importances = rf_model.feature_importances_

In [None]:
# Display feature importance results in a readable format
print("\n--- Feature Importance ---")
# Pair each feature name with the score at the same position.
for feature, importance in zip(features, importances):
    print(f"Feature: {feature}, Importance: {importance:.4f}")

In [None]:
# Confidence intervals estimation
# The forest was fitted on the output of the pipeline's scaler, so the individual
# trees must be given scaled inputs as well; querying them with the raw X_val_np
# would make their predictions (and the spread across trees) meaningless.
fitted_forest = pipeline.named_steps['rf']
X_val_scaled = pipeline.named_steps['scaler'].transform(X_val_np)

# Standard deviation of the per-tree predictions for each validation row.
predictions_std = np.std(
    [tree.predict(X_val_scaled) for tree in fitted_forest.estimators_],
    axis=0,
)
# Half-width of a +/- 1.96 standard deviation band around the forest prediction.
# This is a heuristic measure of tree disagreement, not a calibrated interval.
confidence_intervals = 1.96 * predictions_std

In [None]:
# Display predictions with confidence intervals
print("\n--- Predictions with Confidence Intervals ---")
# Slicing shows up to five rows and does not fail when fewer predictions exist.
for prediction, half_width in zip(y_pred[:5], confidence_intervals[:5]):
    print(f"Prediction: {prediction:.2f}, Confidence Interval: +/- {half_width:.2f}")