# **Stories Modeling Preprocessing**

A separate notebook for model-ready preprocessing: aligning the cleaned data, encoding, normalization, train/test split, and baseline models.

## **Load Cleaned Data**

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd

# Resolve the cleaned-data folder: prefer the original absolute location, then
# a local ./cleaned folder. If neither exists, fall back to the repo-relative
# Archive path (that last fallback is not checked for existence here).
_preferred_dirs = (
    Path('/Users/mohamad22/Desktop/EECE_490_hackathon/Archive/Stories_data/cleaned'),
    Path('./cleaned'),
)
BASE_PATH = next(
    (folder for folder in _preferred_dirs if folder.exists()),
    Path('./Archive/Stories_data/cleaned'),
)

print(f'Using data folder: {BASE_PATH.resolve()}')

# Long-format comparative monthly sales table (cleaned, per the file name).
monthly_sales_csv = BASE_PATH / 'rep_00134_comparative_monthly_sales_clean_long.csv'
df_month_long = pd.read_csv(monthly_sales_csv)
print('monthly_sales_long shape:', df_month_long.shape)
df_month_long.head()


## **Modeling-Ready Preprocessing (Encoding + Normalization)**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# 1) Build modeling table
# Target: sales_amount (branch-month level)
# Keep only branch-level rows for monthly periods. The .copy() makes df_model
# an independent frame so the column assignments below never write into a
# slice of df_month_long.
df_model = df_month_long[
    (df_month_long['row_type'] == 'branch') &
    (df_month_long['period_type'] == 'month')
].copy()

# Ensure month_number is populated
month_map = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
    'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12
}
# Normalize month names (lowercase, trimmed) so they match month_map keys.
# astype(str) alone would turn a missing period into the literal string 'nan',
# which the categorical imputer can no longer recognize as missing and which
# would be one-hot encoded as its own category. Mask those rows back to NaN.
period_missing = df_model['period'].isna()
df_model['period'] = (
    df_model['period'].astype(str).str.lower().str.strip().mask(period_missing)
)
# Coerce unparseable month numbers to NaN, then back-fill from the month name.
df_model['month_number'] = pd.to_numeric(df_model['month_number'], errors='coerce')
df_model['month_number'] = df_model['month_number'].fillna(df_model['period'].map(month_map))

# Optional time feature
# Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4; nullable Int64 keeps rows
# whose month_number is still missing as <NA> instead of failing the cast.
df_model['quarter'] = ((df_model['month_number'] - 1) // 3 + 1).astype('Int64')

# 2) Features / target
feature_cols = ['branch', 'period', 'year', 'month_number', 'quarter']
target_col = 'sales_amount'

# Parse the target first; anything non-numeric becomes NaN.
y = pd.to_numeric(df_model[target_col], errors='coerce')

# Drop rows with missing target (same boolean mask applied to X and y)
valid_idx = y.notna()
X = df_model.loc[valid_idx, feature_cols].copy()
y = y[valid_idx]

# 3) Train/test split
# 80/20 random split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column groups routed to the categorical and numeric branches.
cat_cols = ['branch', 'period']
num_cols = ['year', 'month_number', 'quarter']


def _make_preprocessor(scale_numeric):
    """Build a ColumnTransformer for num_cols/cat_cols.

    Numeric columns are median-imputed and, when ``scale_numeric`` is true,
    standardized. Categorical columns are imputed with the most frequent value
    and one-hot encoded, ignoring categories unseen during fit. Each call
    creates fresh estimator instances.
    """
    numeric_steps = [('imputer', SimpleImputer(strategy='median'))]
    if scale_numeric:
        numeric_steps.append(('scaler', StandardScaler()))

    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(numeric_steps), num_cols),
            ('cat', Pipeline(categorical_steps), cat_cols),
        ]
    )


# 4) Preprocessor for scale-sensitive models (ENCODING + NORMALIZATION)
preprocessor_scaled = _make_preprocessor(scale_numeric=True)

# 5) Preprocessor for tree models (ENCODING only, no normalization)
preprocessor_tree = _make_preprocessor(scale_numeric=False)

# 6) Example models
# Each model is wrapped in a Pipeline with its preprocessor, so imputation,
# scaling and encoding are fit on the training split only.
ridge_model = Pipeline([
    ('prep', preprocessor_scaled),
    ('model', Ridge(alpha=1.0))
])

rf_model = Pipeline([
    ('prep', preprocessor_tree),
    ('model', RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1))
])

ridge_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# Held-out predictions used for the metrics below.
pred_ridge = ridge_model.predict(X_test)
pred_rf = rf_model.predict(X_test)

# Report len(X), not len(df_model): rows with a missing target were dropped
# from X/y before the split, so df_model can overstate the rows actually used.
print('Rows used for modeling:', len(X))
print('Train rows:', len(X_train), '| Test rows:', len(X_test))
print('Ridge  | MAE:', round(mean_absolute_error(y_test, pred_ridge), 2), '| R2:', round(r2_score(y_test, pred_ridge), 4))
print('RF     | MAE:', round(mean_absolute_error(y_test, pred_rf), 2), '| R2:', round(r2_score(y_test, pred_rf), 4))


## **Notes**

- Use `preprocessor_scaled` for models sensitive to scale (linear models, SVM, KNN, PCA).
- Use `preprocessor_tree` for tree models (Random Forest / XGBoost style), where scaling is typically unnecessary.
- Keep all fitted preprocessing (imputation, scaling, encoding) inside the pipelines so it is learned from the training split only; this avoids data leakage.