In [None]:
# Import all required libraries at the beginning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# NOTE: this installs a blanket 'ignore' filter, so every warning category is
# silenced from here on - including deprecation and chained-assignment
# warnings that would otherwise flag problems in later cells.
warnings.filterwarnings('ignore')

# For modeling
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Set style for plots: apply matplotlib's 'fivethirtyeight' style sheet
# to the figures created after this point.
plt.style.use('fivethirtyeight')

In [None]:
# Read the three raw CSV inputs from the working directory
train_data = pd.read_csv('train.csv')
stores_data = pd.read_csv('stores.csv')
features_data = pd.read_csv('features.csv')

# Report the dimensions of every frame first ...
for _heading, _frame in (
    ("Train Data Shape:", train_data),
    ("Stores Data Shape:", stores_data),
    ("Features Data Shape:", features_data),
):
    print(_heading, _frame.shape)

# ... then preview the leading rows of each one, in the same order
for _heading, _frame in (
    ("\nFirst few rows of Train Data:", train_data),
    ("\nFirst few rows of Stores Data:", stores_data),
    ("\nFirst few rows of Features Data:", features_data),
):
    print(_heading)
    print(_frame.head())

In [None]:
# Merge all datasets into one
print("Merging datasets...")

# Join keys between the sales rows and the per-store/per-date features
feature_keys = ['Store', 'Date']

# Any non-key column present in BOTH frames would be suffixed by pandas
# ('<col>_x' / '<col>_y'), which removes the plain column name that later
# cells rely on (e.g. merged_data['IsHoliday']). Keep the copy from
# train_data and drop the duplicate from the features side before joining.
# NOTE(review): assumes the duplicated columns carry the same values in both
# files - confirm against the data.
duplicate_cols = [
    col for col in features_data.columns
    if col in train_data.columns and col not in feature_keys
]

# First merge train data with features.
# validate='many_to_one' raises if features has more than one row per
# (Store, Date), which would otherwise silently multiply the sales rows.
merged_data = pd.merge(
    train_data,
    features_data.drop(columns=duplicate_cols),
    on=feature_keys,
    how='left',
    validate='many_to_one',
)

# Then merge with stores data (one row of store attributes per Store expected)
merged_data = pd.merge(
    merged_data,
    stores_data,
    on=['Store'],
    how='left',
    validate='many_to_one',
)

# Convert Date to datetime
merged_data['Date'] = pd.to_datetime(merged_data['Date'])

# Check for missing values
print("\nMissing values in merged dataset:")
print(merged_data.isnull().sum())

# Display the merged dataset
print("\nMerged dataset shape:", merged_data.shape)
print("\nFirst few rows of merged dataset:")
print(merged_data.head())

In [None]:
# Handle missing values - fill with appropriate values
# For MarkDown columns, fill with 0 assuming no markdown when missing.
# The filled result is assigned back to the frame: calling
# fillna(..., inplace=True) on a column selection operates on a temporary
# object and is not guaranteed to update merged_data (it has no effect under
# pandas Copy-on-Write).
markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
merged_data[markdown_cols] = merged_data[markdown_cols].fillna(0)

# For CPI and Unemployment, forward fill then backward fill.
# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
# NOTE(review): the fill follows the frame's current row order, so a value can
# be carried across a store boundary - confirm this is acceptable.
for col in ['CPI', 'Unemployment']:
    merged_data[col] = merged_data[col].ffill().bfill()

# Check if all missing values are handled
print("\nMissing values after handling:")
print(merged_data.isnull().sum())

# Create additional date-related features
merged_data['Year'] = merged_data['Date'].dt.year
merged_data['Month'] = merged_data['Date'].dt.month
merged_data['Week'] = merged_data['Date'].dt.isocalendar().week
merged_data['Day'] = merged_data['Date'].dt.day

# Create holiday flag (cast the existing IsHoliday column to integers)
merged_data['IsHoliday'] = merged_data['IsHoliday'].astype(int)

print("\nDataset after preprocessing:")
print(merged_data.head())