<a href="https://colab.research.google.com/github/GochiStuff/walmart-sales-predictor/blob/main/walmart_sales_predict_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import kagglehub
from tqdm import tqdm

# --- Download Dataset ---
print("Downloading dataset...")
path = kagglehub.dataset_download("mikhail1681/walmart-sales")
print("Dataset downloaded to:", path)

# --- Locate CSV Files ---
files = list(Path(path).rglob("*.csv"))
print("\nCSV files found:")
for file in tqdm(files, desc="Listing CSV files"):
    print(file)

# --- Load Data ---
data_file = files[0]
print("\nLoading data from:", data_file)
data = pd.read_csv(data_file)
print("Data loaded. Columns in dataset:", data.columns.tolist())

# --- Data Preprocessing ---
print("\nPreprocessing data...")

# Process Date column: extract date features
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)
    data['Year'] = data['Date'].dt.year
    data['Month'] = data['Date'].dt.month
    data['Day'] = data['Date'].dt.day
    data['WeekOfYear'] = data['Date'].dt.isocalendar().week
    data.drop('Date', axis=1, inplace=True)

# Process IsHoliday column: convert boolean/string to integer
if 'IsHoliday' in data.columns:
    data['IsHoliday'] = data['IsHoliday'].astype(int)

# Assume the target column is Weekly_Sales; check that it exists.
target = 'Weekly_Sales'
if target not in data.columns:
    raise ValueError("Target column 'Weekly_Sales' not found in dataset.")

# Identify categorical features (if they exist)
categorical_cols = []
if 'Store' in data.columns:
    categorical_cols.append('Store')
if 'Dept' in data.columns:
    categorical_cols.append('Dept')

# Identify numeric features (we expect our date features and IsHoliday)
numeric_cols = []
for col in ['Year', 'Month', 'Day', 'WeekOfYear', 'IsHoliday']:
    if col in data.columns:
        numeric_cols.append(col)

# Define features and target
X = data.drop(target, axis=1)
y = data[target]

# --- Split Data ---
print("\nSplitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Data split complete: {} training samples and {} testing samples.".format(len(X_train), len(X_test)))

# --- Build Preprocessing Pipeline ---
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

# --- Build the Overall Pipeline ---
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SGDRegressor(random_state=42, max_iter=1500, tol=None))
])

# --- Hyperparameter Tuning ---
# We'll tune some hyperparameters of SGDRegressor.
param_grid = {
    'regressor__eta0': [0.0001, 0.001, 0.01],
    'regressor__learning_rate': ['constant', 'optimal', 'invscaling'],
    'regressor__penalty': ['l2', 'l1', 'elasticnet']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='r2', verbose=1, n_jobs=-1)
print("\nStarting grid search over hyperparameters...")
grid_search.fit(X_train, y_train)
print("Grid search complete.")

best_model = grid_search.best_estimator_
print("\nBest Parameters Found:", grid_search.best_params_)

# --- Evaluate the Best Model ---
print("\nEvaluating the best model on test data...")
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("\nBest Model Evaluation:")
print("Test Accuracy (R²):", r2)
print("Mean Squared Error:", mse)


Downloading dataset...
Dataset downloaded to: /root/.cache/kagglehub/datasets/mikhail1681/walmart-sales/versions/2

CSV files found:


Listing CSV files: 100%|██████████| 1/1 [00:00<00:00, 7738.57it/s]

/root/.cache/kagglehub/datasets/mikhail1681/walmart-sales/versions/2/Walmart_Sales.csv

Loading data from: /root/.cache/kagglehub/datasets/mikhail1681/walmart-sales/versions/2/Walmart_Sales.csv
Data loaded. Columns in dataset: ['Store', 'Date', 'Weekly_Sales', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']

Preprocessing data...

Splitting data into training and testing sets...
Data split complete: 5148 training samples and 1287 testing samples.

Starting grid search over hyperparameters...
Fitting 3 folds for each of 27 candidates, totalling 81 fits





Grid search complete.

Best Parameters Found: {'regressor__eta0': 0.0001, 'regressor__learning_rate': 'invscaling', 'regressor__penalty': 'l1'}

Evaluating the best model on test data...

Best Model Evaluation:
Test Accuracy (R²): 0.525284952182116
Mean Squared Error: 152931619554.85226
