
# Fashion Forward Forecasting — End‑to‑End ML Pipeline

**Goal.** Train a pipeline that predicts whether a customer recommends a product from review text, numeric features, and categorical features.  
This notebook is modular, follows best practices (no data leakage), performs hyperparameter tuning, and evaluates on a held‑out test set.


## 1. Imports & Config

In [1]:

from __future__ import annotations

import os
from typing import List, Tuple

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from src.data_utils import Config, load_data
from src.metrics import evaluate
from src.text_preproc import SpacyLemmatizer, text_joiner, identity

# Single configuration object from src.data_utils. The cells below read its
# text_cols / num_cols / cat_cols / target / spacy_model / model_out / random_state
# attributes, so dataset-specific adjustments belong here.
CFG = Config()  # <-- change column names here if needed
# One shared seed, reused by the train/test split and the CV splitter further down.
RANDOM_STATE = CFG.random_state


## 2. Load Data

In [2]:

# Load the dataset through the project helper, driven entirely by CFG.
df = load_data(CFG)
# Preview the first rows as a quick sanity check of the columns and values.
# NOTE(review): the rendered preview includes "Unnamed: 0.1" / "Unnamed: 0" columns,
# which look like index columns carried over from earlier CSV writes — confirm
# whether load_data should drop them.
df.head(3)


Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses


## 3. Train/Test Split (Stratified)

In [3]:

# Feature matrix: text, numeric, and categorical columns, in that order.
# NOTE(review): the loaded frame also has a "Rating" column, presumably strongly
# correlated with the target — confirm it is not listed in CFG.num_cols / CFG.cat_cols.
feature_cols = [*CFG.text_cols, *CFG.num_cols, *CFG.cat_cols]
X = df[feature_cols]
y = df[CFG.target].astype(int)

# Hold out 20% for the final evaluation; stratifying on y keeps the class ratio
# the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Sanity check: partition shapes and class balance of the training labels.
X_train.shape, X_test.shape, y_train.value_counts(normalize=True).round(3)


((18788, 8),
 (4698, 8),
 Recommended IND
 1    0.822
 0    0.178
 Name: proportion, dtype: float64)

## 4. Preprocessing & Model Pipeline

In [4]:

# Numeric branch: fill missing values with the column median, then standardise.
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown="ignore" lets categories that were not seen during
# fit be encoded as all zeros instead of raising at predict time.
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Text branch: join text cols -> spaCy lemmatization -> TF-IDF.
# `identity` is passed as both tokenizer and preprocessor, presumably because
# SpacyLemmatizer already emits token lists — confirm in src.text_preproc.
# token_pattern=None avoids the warning about an unused pattern when a custom
# tokenizer is supplied; lowercase=False leaves casing to the lemmatizer step.
text_pipe = Pipeline(steps=[
    ("join", text_joiner),
    ("lemma", SpacyLemmatizer(model=CFG.spacy_model)),
    ("tfidf", TfidfVectorizer(
        tokenizer=identity,
        preprocessor=identity,
        token_pattern=None,
        lowercase=False,
        max_features=20000,  # caps dimensionality to save memory/resources
    )),
])

# ColumnTransformer routes each column group to its own branch and concatenates
# the resulting feature blocks.
preprocess = ColumnTransformer(transformers=[
    ("num", numeric_pipe, list(CFG.num_cols)),
    ("cat", categorical_pipe, list(CFG.cat_cols)),
    ("txt", text_pipe, list(CFG.text_cols)),
])

# Final pipeline: preprocessing followed by the classifier.
# random_state is set because the grid search below selects the liblinear solver,
# for which scikit-learn documents random_state as being used; seeding it keeps
# repeated runs reproducible.
pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=2000, random_state=RANDOM_STATE)),
])
pipeline


0,1,2
,steps,"[('preprocess', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function joi...001A935AFAD40>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,model,'en_core_web_sm'
,lowercase,True

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,<function ide...001A935AFA660>
,tokenizer,<function ide...001A935AFA660>
,analyzer,'word'
,stop_words,
,token_pattern,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,2000


## 5. Hyperparameter Tuning (GridSearchCV)

In [5]:

# Stratified 5-fold cross-validation, shuffled with the shared seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Parameter grid (compact and safe on Windows): only the n-gram range and C vary,
# giving 4 candidates in total.
param_grid = {
    "preprocess__txt__tfidf__min_df": [3],  # avoids errors with scarce documents
    "preprocess__txt__tfidf__max_df": [0.95],
    "preprocess__txt__tfidf__ngram_range": [(1, 1), (1, 2)],
    "clf__C": [1.0, 2.0],
    "clf__penalty": ["l2"],
    "clf__solver": ["liblinear"],  # stable and lightweight
}

# Sequential grid search (key to avoiding WinError 1450).
grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring="f1",
    cv=cv,
    n_jobs=1,             # NO parallelism (avoids WinError 1450)
    verbose=1,
    error_score="raise",  # if a fit fails, surface the real error
)

# Fit on the training partition only, then keep the refitted best pipeline.
grid.fit(X_train, y_train)
best_model = grid.best_estimator_

print("Best params:", grid.best_params_)
print("Best CV F1:", round(grid.best_score_, 4))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best params: {'clf__C': 1.0, 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'preprocess__txt__tfidf__max_df': 0.95, 'preprocess__txt__tfidf__min_df': 3, 'preprocess__txt__tfidf__ngram_range': (1, 1)}
Best CV F1: 0.9647


## 6. Evaluation on Test Set

In [6]:

# Hard class predictions on the held-out test set.
y_pred = best_model.predict(X_test)

# Probabilities for ROC-AUC, taken from the second probability column (the positive
# class for 0/1 labels). Not every classifier exposes predict_proba, so the
# capability is checked explicitly; unlike a blanket `except Exception`, this keeps
# genuine prediction errors visible instead of silently dropping ROC-AUC.
if hasattr(best_model, "predict_proba"):
    y_proba = best_model.predict_proba(X_test)[:, 1]
else:
    y_proba = None

evaluate(y_test, y_pred, y_proba)


Accuracy : 0.9395
Precision: 0.9616
Recall   : 0.9651
F1-score : 0.9633

Classification Report:
               precision    recall  f1-score   support

           0     0.8356    0.8216    0.8285       835
           1     0.9616    0.9651    0.9633      3863

    accuracy                         0.9395      4698
   macro avg     0.8986    0.8933    0.8959      4698
weighted avg     0.9392    0.9395    0.9393      4698

Confusion Matrix:
 [[ 686  149]
 [ 135 3728]]
ROC-AUC  : 0.9778


(0.9395487441464453,
 0.9615682228527211,
 0.9650530675640694,
 0.9633074935400516,
 0.9777590560530506)

## 7. Save Trained Pipeline

In [8]:

# Create the directory that will actually receive the file. Deriving it from
# CFG.model_out (instead of a hard-coded "models") keeps this cell working if the
# output path is changed in the config. An empty dirname means the current
# directory, which needs no creation.
model_dir = os.path.dirname(CFG.model_out)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# Persist the whole fitted pipeline (preprocessing + classifier) in one artifact.
joblib.dump(best_model, CFG.model_out)

print("Saved at:", CFG.model_out)

Saved at: models/model_pipeline.pkl



## 8. Next Steps (Ideas to Stand Out)
- Add POS/NER derived features (spaCy) as extra numeric signals.
- Plot simple visualizations (label distribution, top TF‑IDF terms).
- Try alternative classifiers (LinearSVC, RandomForest, XGBoost).
- Use `permutation_importance` on numeric/categorical subsets.
