<a href="https://colab.research.google.com/github/IshuDhana/Mini_project_1/blob/main/mini_machine_learning_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import (
    train_test_split, TimeSeriesSplit, KFold, cross_val_score, cross_validate
)
from sklearn.metrics import (
    mean_squared_error, r2_score, mean_absolute_error
)


# Load Training Data

In [None]:
# Read the raw training table from the working directory.
df_file = pd.read_csv("training.csv")

print("=== Training Data Preview ===")
print(df_file.head(), "\n")

print("=== Training Data Info ===")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the report.
df_file.info()
print()

print("=== Summary Statistics ===")
print(df_file.describe(), "\n")

# Data Cleaning & Preprocessing

In [None]:
# Remove columns whose name starts with "Unnamed".
is_junk_col = df_file.columns.str.contains(r'^Unnamed')
df_file = df_file.loc[:, ~is_junk_col]

# Convert the date column to datetimes; values that cannot be parsed become
# NaT (errors='coerce') and the corresponding rows are then discarded.
df_file['date'] = pd.to_datetime(df_file['date'], errors='coerce')
df_file = df_file.dropna(subset=["date"])

# Canonicalise state_holiday: compare as trimmed, lower-cased text, fold the
# spellings of "no holiday" ('false', 'none', 'nan') into '0', and store the
# result as a categorical limited to '0', 'a', 'b', 'c'.
# Any other value ends up as a missing category.
holiday_text = df_file['state_holiday'].astype(str).str.strip().str.lower()
sh = holiday_text.replace({'false': '0', 'none': '0', 'nan': '0'})
df_file['state_holiday'] = pd.Categorical(sh, categories=['0', 'a', 'b', 'c'])

# Per-column count of missing values left after cleaning.
print(df_file.isna().sum())

# Feature / Target Split

In [None]:
# Target is the "sales" column; every other column stays in the feature frame.
y = df_file["sales"]
X = df_file.drop(columns=["sales"])

# Scaling Numeric Features

In [None]:
# 'date' and 'state_holiday' are kept out of the scaler; everything else in X
# is standardised (assumes the remaining columns are numeric — TODO confirm).
X_numeric = X.drop(columns=['date', 'state_holiday'])

# NOTE(review): the scaler is fitted on every row of X, i.e. before any
# train/test split is made.
scaler = StandardScaler()
X_scaled = scaler.fit(X_numeric).transform(X_numeric)

# Rebuild a labelled frame: the two untouched columns first, then the scaled
# columns. The index is reset so both parts align row-by-row on 0..n-1.
untouched_part = X[['date', 'state_holiday']].reset_index(drop=True)
scaled_part = pd.DataFrame(X_scaled, columns=X_numeric.columns)
X_scaled_df = pd.concat([untouched_part, scaled_part], axis=1)

# Train/Test Split (Random)

In [None]:
# Hold out 20% of the scaled rows for evaluation; the fixed seed makes the
# partition reproducible.
holdout_split = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

# Time Series Cross-Validation

In [None]:
# TimeSeriesSplit trains each fold on the leading rows and validates on the
# rows that follow them, so the rows must be in chronological order first.
# A stable sort keeps the existing order of rows that share the same date.
if 'date' in X.columns:
    chrono_order = np.argsort(X['date'].to_numpy(), kind='stable')
else:
    chrono_order = np.arange(len(X))

# Same feature set as before: 'date' and 'state_holiday' are excluded
# (errors='ignore' tolerates either column being absent).
X_ts = (
    X.iloc[chrono_order]
    .drop(columns=['date', 'state_holiday'], errors='ignore')
    .reset_index(drop=True)
)
y_ts = y.iloc[chrono_order].reset_index(drop=True)

tscv = TimeSeriesSplit(n_splits=5)

# Scaling lives inside the pipeline, so it is re-fitted on the training part
# of every fold.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    ))
])

mse_scores = []
r2_scores = []

# Fold-local names are used on purpose: rebinding X_train / X_test / y_train /
# y_test / y_pred here would overwrite the random hold-out split that the
# Linear Regression cell further down fits and evaluates on.
for fold, (train_idx, test_idx) in enumerate(tscv.split(X_ts), start=1):
    X_fold_train, X_fold_test = X_ts.iloc[train_idx], X_ts.iloc[test_idx]
    y_fold_train, y_fold_test = y_ts.iloc[train_idx], y_ts.iloc[test_idx]

    pipe.fit(X_fold_train, y_fold_train)
    fold_pred = pipe.predict(X_fold_test)

    mse_scores.append(mean_squared_error(y_fold_test, fold_pred))
    r2_scores.append(r2_score(y_fold_test, fold_pred))

    print(f"Fold {fold}: MSE={mse_scores[-1]:.4f}, R2={r2_scores[-1]:.4f}")

print("Average MSE:", np.mean(mse_scores))
print("Average R2:", np.mean(r2_scores))

# Fit final model on all rows; this is the estimator used for prediction later.
pipe.fit(X_ts, y_ts)
final_model = pipe

# Linear Regression Model

In [None]:
# Fit a linear regression on X_train / y_train and predict for X_test.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

# Report each metric for the X_test predictions, in a fixed order.
print("\nTrain/Test Evaluation:")
for label, metric in (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R2:", r2_score),
):
    print(label, metric(y_test, y_pred))

# K-Fold Cross Validation

In [None]:
# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# cross_validate evaluates both scorers from a single round of fits, instead
# of fitting the model on the same five folds once per metric.
cv_results = cross_validate(
    model, X_scaled, y, cv=kf,
    scoring=['neg_mean_squared_error', 'r2']
)

# scikit-learn reports MSE negated (higher is better); flip the sign back.
mse_scores = -cv_results['test_neg_mean_squared_error']
r2_scores = cv_results['test_r2']

print("\nK-Fold Cross Validation:")
print("Average MSE:", mse_scores.mean())
print("Average R2:", r2_scores.mean())

# Load Real Data & Predict

In [None]:
# Read the data to score from the working directory.
df_real_file = pd.read_csv("REAL_DATA.csv")

# Apply the same cleaning steps as for the training data: drop "Unnamed"
# columns, parse dates (unparseable -> NaT -> row removed) and normalise
# state_holiday into the categories '0', 'a', 'b', 'c'.
df_real_file = df_real_file.loc[:, ~df_real_file.columns.str.contains(r'^Unnamed')]
df_real_file['date'] = pd.to_datetime(df_real_file['date'], errors='coerce')
df_real_file = df_real_file.dropna(subset=["date"])

sh = (
    df_real_file['state_holiday']
    .astype(str).str.strip().str.lower()
    .replace({'false': '0', 'none': '0', 'nan': '0'})
)
df_real_file['state_holiday'] = pd.Categorical(sh, categories=['0', 'a', 'b', 'c'])

# The model must see exactly the columns it was trained on, in the same order.
train_cols = X_ts.columns.tolist()

# reindex() below creates any absent training column and fillna(0) turns it
# into an all-zero feature; make that visible instead of silently predicting
# from invented values.
missing_cols = [c for c in train_cols if c not in df_real_file.columns]
if missing_cols:
    print("WARNING: REAL_DATA.csv is missing training columns (filled with 0):", missing_cols)

# reindex() already selects only the training columns, so no separate drop of
# 'date' / 'state_holiday' is needed. Missing values are replaced by 0.
X_real = df_real_file.reindex(columns=train_cols).fillna(0)

# pipe is the pipeline fitted on the full training set.
df_real_file['sales_pred'] = pipe.predict(X_real)

print(df_real_file.head())