In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt


In [2]:

# Load data
df = pd.read_csv('Walmart.csv')
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
print('Shape:', df.shape)
df.head()


Shape: (6435, 8)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,2010-05-02,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,2010-12-02,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,NaT,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,NaT,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,2010-05-03,1554806.68,0,46.5,2.625,211.350143,8.106


In [6]:

# Basic cleaning and features
df.sort_values(['Date'], inplace=True)

# Time-based features
df['Year'] = df['Date'].dt.year
df['DayOfWeek'] = df['Date'].dt.dayofweek

# If there's no "IsHoliday" column, create it as 0
if 'IsHoliday' in df.columns:
    df['IsHoliday'] = df['IsHoliday'].astype(int)
elif 'Holiday_Flag' in df.columns:   # your file has Holiday_Flag
    df['IsHoliday'] = df['Holiday_Flag'].astype(int)
else:
    df['IsHoliday'] = 0

# Lag and rolling mean based only on Date ordering
df['lag_1'] = df['Weekly_Sales'].shift(1)
df['rolling_mean_4'] = df['Weekly_Sales'].shift(1).rolling(window=4, min_periods=1).mean()

# Drop rows with missing lag/rolling values
df = df.dropna(subset=['lag_1','rolling_mean_4'])

print('After features shape:', df.shape)
df.head()


After features shape: (6432, 14)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,DayOfWeek,IsHoliday,lag_1,rolling_mean_4
3466,25,2010-01-10,658640.14,0,57.56,2.707,204.885097,7.484,2010.0,1.0,6.0,0,1827440.43,1827440.0
5039,36,2010-01-10,422169.47,0,74.66,2.567,210.440443,8.476,2010.0,1.0,6.0,0,658640.14,1243040.0
1178,9,2010-01-10,495692.19,0,69.08,2.603,215.214134,6.56,2010.0,1.0,6.0,0,422169.47,969416.7
34,1,2010-01-10,1453329.5,0,71.89,2.603,211.671989,7.838,2010.0,1.0,6.0,0,495692.19,850985.6
3609,26,2010-01-10,923221.52,0,57.8,2.717,132.7568,8.149,2010.0,1.0,6.0,0,1453329.5,757457.8


In [7]:

# Train/test split (time-based)
unique_dates = np.sort(df['Date'].unique())
split_date = unique_dates[int(len(unique_dates)*0.9)]

train_df = df[df['Date'] <= split_date].copy()
test_df = df[df['Date'] > split_date].copy()

# Features that actually exist in your dataset
features = ['IsHoliday', 'Year', 'Month', 'DayOfWeek', 'lag_1', 'rolling_mean_4']
target = 'Weekly_Sales'

X_train = train_df[features]; y_train = train_df[target]
X_test = test_df[features]; y_test = test_df[target]

# Baseline linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)

# Random forest
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
def evaluate(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2}

print('LinearRegression ->', evaluate(y_test, lr_preds))
print('RandomForest ->', evaluate(y_test, rf_preds))



LinearRegression -> {'mse': 307700111248.0364, 'rmse': 554707.2302107089, 'mae': 473800.8881978843, 'r2': -0.007200301241813811}
RandomForest -> {'mse': 345919687808.39087, 'rmse': 588149.3754212367, 'mae': 489159.2915455556, 'r2': -0.132305127719738}
