# Building a Linear Regression Model for Walmart Sales Forecast

## 1. Import Libraries & Data

In [19]:
# Import necessary libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pickle
import plotly.express as px

# Load the dataset from a CSV file.
# Only the read itself is guarded; the preview runs in the `else` branch so
# the handler covers nothing but the file lookup.
try:
    df = pd.read_csv('wallmart_final_dataset.csv')
except FileNotFoundError:
    print("Error: 'wallmart_final_dataset.csv' not found. Please check the file path.")
    # Re-raise rather than calling exit(): inside a notebook exit() asks the
    # kernel to shut down, whereas raising stops execution at this cell and
    # leaves a traceback pointing at the missing file.
    raise
else:
    print("Dataset loaded successfully.")
    print("Dataset shape:", df.shape)
    print("\nFirst 5 rows of the dataset:")
    print(df.head())

Dataset loaded successfully.
Dataset shape: (6435, 10)

First 5 rows of the dataset:
   IsHoliday        Date  Store  Weekly_Sales  Temperature  Fuel_Price   
0      False  2010-02-05      1    1643690.90        42.31       2.572  \
1      False  2010-02-05      2    2136989.46        40.19       2.572   
2      False  2010-02-05      3     461622.22        45.71       2.572   
3      False  2010-02-05      4    2135143.87        43.76       2.598   
4      False  2010-02-05      5     317173.10        39.70       2.572   

          CPI  Unemployment Type    Size  
0  211.096358         8.106    A  151315  
1  210.752605         8.324    A  202307  
2  214.424881         7.368    B   37392  
3  126.442065         8.623    A  205863  
4  211.653972         6.566    B   34875  


## 2. Preprocessing & Feature Engineering 

In [20]:

# Parse the date strings once; the calendar features below are derived from
# this Series and the raw 'Date' column is discarded at the end of the chain.
parsed_dates = pd.to_datetime(df['Date'])

df = (
    df
    .assign(
        # Calendar components that may carry seasonal signal for sales.
        Year=parsed_dates.dt.year,
        Month=parsed_dates.dt.month,
        WeekOfYear=parsed_dates.dt.isocalendar().week,
        # Boolean holiday flag as 1/0.
        IsHoliday=df['IsHoliday'].astype(int),
    )
    # One-hot encode the store type, keeping every level (no dropped baseline).
    .pipe(pd.get_dummies, columns=['Type'], drop_first=False)
    # 'Date' is superseded by the derived features; 'Store' is an identifier
    # and is not used as a predictor in this general model.
    .drop(columns=['Date', 'Store'])
)

print("\nDataset after preprocessing and feature engineering:")
print(df.head())


Dataset after preprocessing and feature engineering:
   IsHoliday  Weekly_Sales  Temperature  Fuel_Price         CPI  Unemployment   
0          0    1643690.90        42.31       2.572  211.096358         8.106  \
1          0    2136989.46        40.19       2.572  210.752605         8.324   
2          0     461622.22        45.71       2.572  214.424881         7.368   
3          0    2135143.87        43.76       2.598  126.442065         8.623   
4          0     317173.10        39.70       2.572  211.653972         6.566   

     Size  Year  Month  WeekOfYear  Type_A  Type_B  Type_C  
0  151315  2010      2           5    True   False   False  
1  202307  2010      2           5    True   False   False  
2   37392  2010      2           5   False    True   False  
3  205863  2010      2           5    True   False   False  
4   34875  2010      2           5   False    True   False  


## 3. Feature Selection using Correlation 

In [21]:
# Pairwise correlation between every column of the preprocessed frame
# (the target 'Weekly_Sales' included).
correlation_matrix = df.corr()

# Display options for the interactive heatmap: a diverging palette centred on
# zero and pinned to the [-1, 1] range of a correlation coefficient.
heatmap_options = dict(
    text_auto='.2f',
    aspect="auto",
    labels=dict(x="Features", y="Features", color="Correlation"),
    title="<b>Correlation Matrix of Walmart Sales Features</b>",
    color_continuous_scale=px.colors.diverging.RdBu,
    color_continuous_midpoint=0,
    zmin=-1,
    zmax=1,
)
fig = px.imshow(correlation_matrix, **heatmap_options)
fig.update_layout(title_x=0.5, height=600)
fig.show()

# Rank features by the *absolute* correlation with the target, strongest
# first; the sign of each relationship is intentionally dropped here.
sales_correlation = correlation_matrix['Weekly_Sales']
target_correlation = sales_correlation.abs().sort_values(ascending=False)

print("\nCorrelation of features with Weekly_Sales:")
print(target_correlation)


Correlation of features with Weekly_Sales:
Weekly_Sales    1.000000
Size            0.810468
Type_A          0.571413
Type_C          0.399202
Type_B          0.309248
Unemployment    0.106176
Month           0.076143
WeekOfYear      0.074211
CPI             0.072634
Temperature     0.063810
IsHoliday       0.036891
Year            0.018378
Fuel_Price      0.009464
Name: Weekly_Sales, dtype: float64


### 3.1 Feature Selection

In [22]:
# Predictors used by the regression. Of the three store-type dummies only
# Type_B and Type_C are listed; Type_A is left out.
selected_features = ['Size', 'CPI', 'Unemployment', 'IsHoliday', 'Month', 'Type_B', 'Type_C']
print("selected features for the model: {}".format(selected_features))

selected features for the model: ['Size', 'CPI', 'Unemployment', 'IsHoliday', 'Month', 'Type_B', 'Type_C']


## 4. Data Splitting 

In [23]:
# Feature matrix and target vector.
X = df[selected_features]
y = df['Weekly_Sales']

# 80/20 random split with a fixed seed for reproducibility. Every piece is
# cast to float64 so the boolean/integer columns become plain floats.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = (part.astype('float64') for part in split_parts)


## 5. Data Scaling (Z-score Standardization)

In [24]:
# Columns that get standardized versus 0/1 indicator columns that are left
# untouched (binary_cols is not read within this cell).
numerical_cols = ['Size', 'CPI', 'Unemployment', 'Month']
binary_cols = ['IsHoliday', 'Type_B', 'Type_C']

# Fit the scaler on the training rows only, then reuse those statistics for
# the test rows so nothing about the test set influences the scaling.
scaler = StandardScaler()
scaler.fit(X_train[numerical_cols])

# Work on copies so the unscaled splits stay available.
X_train_scaled = X_train.copy()
X_train_scaled[numerical_cols] = scaler.transform(X_train[numerical_cols])

X_test_scaled = X_test.copy()
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Second name for the same frames.
X_train_processed = X_train_scaled
X_test_processed = X_test_scaled
print("\nScaling complete for numerical features.")



Scaling complete for numerical features.


## 6. Model Training 

In [25]:
# statsmodels' OLS does not add an intercept by itself, so prepend a constant
# column to the scaled training features.
X_train_with_const = sm.add_constant(X_train_scaled)

# Specify the Ordinary Least Squares model, then fit it.
ols_specification = sm.OLS(y_train, X_train_with_const)
model = ols_specification.fit()

print("\nModel training complete.")

# Coefficient table and fit statistics for the training data.
print(model.summary())


Model training complete.
                            OLS Regression Results                            
Dep. Variable:           Weekly_Sales   R-squared:                       0.673
Model:                            OLS   Adj. R-squared:                  0.673
Method:                 Least Squares   F-statistic:                     1512.
Date:                Sat, 20 Sep 2025   Prob (F-statistic):               0.00
Time:                        20:16:31   Log-Likelihood:                -72613.
No. Observations:                5148   AIC:                         1.452e+05
Df Residuals:                    5140   BIC:                         1.453e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         9.998e+0

## 7. Model Evaluation 

In [26]:
# Add the intercept column to the test features so they line up with the
# design matrix the model was fitted on.
# has_constant='add' forces the column to be added: with the default ('skip')
# add_constant returns the data unchanged whenever some existing column
# happens to be constant, which would leave the intercept out and misalign
# the columns with the fitted parameters.
X_test_with_const = sm.add_constant(X_test_scaled, has_constant='add')

# Make predictions on the held-out test set.
y_pred = model.predict(X_test_with_const)


# The summary reports statistics of the fit on the training data.
print("\nModel Summary:")
print(model.summary())

# Out-of-sample performance metrics.
mae = metrics.mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
r2 = metrics.r2_score(y_test, y_pred)

print("\nModel Performance on Test Set:")
print(f"R-squared (R2): {r2:.4f}")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"Root Mean Squared Error (RMSE): ${rmse:,.2f}")


Model Summary:
                            OLS Regression Results                            
Dep. Variable:           Weekly_Sales   R-squared:                       0.673
Model:                            OLS   Adj. R-squared:                  0.673
Method:                 Least Squares   F-statistic:                     1512.
Date:                Sat, 20 Sep 2025   Prob (F-statistic):               0.00
Time:                        20:17:50   Log-Likelihood:                -72613.
No. Observations:                5148   AIC:                         1.452e+05
Df Residuals:                    5140   BIC:                         1.453e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         9.998e+05   8652.1

## 8. Save the Model and Scaler for Inference 

In [None]:
# Everything the inference side needs, each pickled to its OWN file:
# the fitted model, the fitted scaler, and the ordered list of feature names.
artifacts = {
    'linear_regression_model.pkl': model,
    'scaler.pkl': scaler,
    'selected_features.pkl': selected_features,
}

# Dicts preserve insertion order, so the files are written in the order above.
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print("\nModel, scaler, and feature list have been saved successfully.")


Model, scaler, and feature list have been saved successfully.
Next step: Build the Streamlit dashboard for inference.
