
# Fashion Forward Forecasting — End‑to‑End ML Pipeline

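**Goal.** Train a pipeline that predicts whether a customer recommends a product, using review text together with numeric and categorical features.  
This notebook is modular, follows best practices (all preprocessing lives inside the pipeline, so it is fit on training folds only and nothing leaks from validation or test data), tunes hyperparameters with cross‑validation, and evaluates on a held‑out test set.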


## 1. Imports & Config

In [None]:

from __future__ import annotations
from typing import List, Tuple
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from src.data_utils import Config, load_data
from src.text_preproc import SpacyLemmatizer, text_joiner
from src.metrics import evaluate

# Single project configuration object. Later cells read the column lists
# (text_cols, num_cols, cat_cols), the target column, the spaCy model name,
# the model output path and the random seed from it.
CFG = Config()  # <-- change column names here if needed
# Shared seed so the train/test split and the CV folds are reproducible.
RANDOM_STATE = CFG.random_state


## 2. Load Data

In [None]:

# Load the raw dataset as described by CFG.
# NOTE(review): presumably returns a pandas DataFrame (it is indexed by column
# lists and `.head()` is called on it) — confirm against src.data_utils.
df = load_data(CFG)
# Preview the first three rows as the cell output.
df.head(3)


## 3. Train/Test Split (Stratified)

In [None]:

# Feature matrix: text columns first, then numeric, then categorical,
# exactly as configured in CFG.
_feature_cols = [*CFG.text_cols, *CFG.num_cols, *CFG.cat_cols]
X = df[_feature_cols]

# Target as integers (cast from whatever dtype the raw column has).
y = df[CFG.target].astype(int)

# Hold out 20% of the rows for the final evaluation; stratifying on y keeps
# the class balance the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Cell output: partition shapes and the class proportions in the training set.
X_train.shape, X_test.shape, y_train.value_counts(normalize=True).round(3)


## 4. Preprocessing & Model Pipeline

In [None]:

from sklearn.pipeline import Pipeline


def identity(tokens):
    """Return *tokens* unchanged.

    Passed to ``TfidfVectorizer`` as both ``preprocessor`` and ``tokenizer`` so
    the vectorizer does no string preprocessing or tokenization of its own and
    consumes the lemmatizer's output as-is.
    """
    return tokens


# NOTE(review): `identity` is a named module-level function rather than a
# lambda so the fitted pipeline can be pickled. The pickle refers to it by
# name, so loading the saved model in another process needs the same function
# to be importable — consider moving it into an importable module.

# Numeric pipeline: fill gaps with the median, then standardize.
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical pipeline: fill gaps with the mode, then one-hot encode.
# Categories unseen during fit are encoded as all zeros instead of raising.
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Text pipeline: join text cols -> spaCy lemmatization -> TF-IDF.
# NOTE(review): presumably SpacyLemmatizer emits one token sequence per
# document, which is why TF-IDF is given pass-through callables — confirm
# against src.text_preproc.
text_pipe = Pipeline(steps=[
    ("join", text_joiner),
    ("lemma", SpacyLemmatizer(model=CFG.spacy_model)),
    ("tfidf", TfidfVectorizer(
        tokenizer=identity,
        preprocessor=identity,
        token_pattern=None,  # unused with a custom tokenizer; None silences the warning
    )),
])

# ColumnTransformer to combine all branches; each branch only sees its own
# configured columns.
preprocess = ColumnTransformer(transformers=[
    ("num", numeric_pipe, list(CFG.num_cols)),
    ("cat", categorical_pipe, list(CFG.cat_cols)),
    ("txt", text_pipe, list(CFG.text_cols)),
])

# Final pipeline with classifier. Preprocessing is part of the estimator, so
# it is re-fit on the training portion of every CV fold.
pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=2000)),
])
pipeline


## 5. Hyperparameter Tuning (GridSearchCV)

In [None]:

# Search space. Keys follow the nested step names:
# pipeline step -> ColumnTransformer branch -> branch step -> parameter.
param_grid = {
    "preprocess__txt__tfidf__min_df": [2, 5],
    "preprocess__txt__tfidf__ngram_range": [(1,1), (1,2)],
    "clf__C": [0.5, 1.0, 2.0, 5.0],
    "clf__penalty": ["l2"],
    "clf__solver": ["liblinear", "lbfgs"],
}

# Five shuffled, class-stratified folds with a fixed seed for reproducibility.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Exhaustive search over the grid, scored by F1, using all available cores.
grid = GridSearchCV(pipeline, param_grid, scoring="f1", cv=cv, n_jobs=-1, verbose=1)

# fit() returns the search object itself, so the refit winner can be taken
# from it directly.
best_model = grid.fit(X_train, y_train).best_estimator_

print("Best params:", grid.best_params_)
print("Best CV F1:", round(grid.best_score_, 4))


## 6. Evaluation on Test Set

In [None]:

# Hard class predictions from the tuned pipeline on the held-out test set.
y_pred = best_model.predict(X_test)

# Positive-class probabilities for ROC-AUC, only if the model supports them.
# An explicit capability check replaces the former blanket `except Exception`,
# so genuine failures inside predict_proba surface instead of silently
# dropping the probability-based metrics.
if hasattr(best_model, "predict_proba"):
    y_proba = best_model.predict_proba(X_test)[:, 1]
else:
    y_proba = None

evaluate(y_test, y_pred, y_proba)


## 7. Save Trained Pipeline

In [None]:

# Persist the tuned pipeline (preprocessing + classifier) to the configured
# output path.
# NOTE(review): joblib files are pickle-based — loading needs the same custom
# classes/functions importable, and files from untrusted sources must not be
# loaded.
joblib.dump(best_model, CFG.model_out)
# Echo the output path as the cell result.
CFG.model_out



## 8. Next Steps (Ideas to Stand Out)
- Add POS/NER derived features (spaCy) as extra numeric signals.
- Plot simple visualizations (label distribution, top TF‑IDF terms).
- Try alternative classifiers (LinearSVC, RandomForest, XGBoost).
- Use `permutation_importance` on numeric/categorical subsets.
