# ML Training Notebook

## Setup

### Prepare venv!
Please restart VSCode and your kernel before selecting new venv as interpreter

In [1]:
%env PYTHONWARNINGS=ignore



In [2]:
# First off, probably best to run this to create a new virtual environment for you.
# This will also compile all of the .py modules I've written for access through Jupyter Notebooks.
# If you choose a different name other than the default please include it in your .gitignore file so it's not uploaded to Github
from pathlib import Path

def mk_venv(venv_name: str = '.venv'):
    root = Path.cwd().resolve()
    while not (root / '.git').exists():
        root = root.parent

    venv_path = root / venv_name
    if not Path(venv_path).exists():
        !python3 {root}/scripts/bootstrap_env.py {venv_path}
    return None

# Call it from here as one last namespace safety
mk_venv()

### Bring in dependencies

In [3]:
# External dependencies...
#   specifically ML modules here
from sklearn.base import clone
from sklearn.metrics import make_scorer, cohen_kappa_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import RidgeClassifier
from mord import LogisticIT
from lightgbm import LGBMRegressor

#   other important externals
from datetime import datetime
import pandas as pd
import numpy as np
import joblib
import json

#   my helper libraries
from core import get_settings
from ml_lib import (
    LGBMOrdinal,
    suppress_warnings, 
    binning_cats, cycle_dates,
    grid_to_pd, read_write_grid, expand_csv,
    full_est_scores, fast_est_scores, learning_curve_plot,
)

# Another major setting for sklearn
from sklearn import set_config
set_config(transform_output = 'pandas')

### Extracting data 
If non existent, it'll run the initial pipeline

In [4]:
cfg = get_settings()
csv_file = cfg.storage / 'clean_inspections.csv'
if not csv_file.is_file():
    !python3 {cfg.root}/scripts/run_etl.py get_data_csv
df = pd.read_csv(csv_file)

## Pandas Preprocessing

### Cleaning Bulk

#### Bin large categories

In [5]:
df.nunique()

camis                 26725
boro                      5
zipcode                 218
cuisine                  89
inspection_date        1479
inspection_type           9
inspection_subtype        6
violation_code          123
action                    5
critical_flag             3
score                   138
census_tract           1175
nta                     193
latitude              14037
longitude             13476
dtype: int64

In [6]:
binning_cats(df, 'zipcode', 1000)
binning_cats(df, 'census_tract', 650)
binning_cats(df, 'violation_code', 100)
binning_cats(df, 'cuisine', 300)
binning_cats(df, 'nta', 750)

df[['zipcode', 'census_tract']] = df[['zipcode', 'census_tract']].astype(str)

KeyboardInterrupt: 

#### Sort and create aggregate fields

In [None]:
df.sort_values(['camis', 'inspection_date'], inplace = True)

df['last_score'] = df.groupby('camis')['score'].shift(1)
df['rolling_mean_3'] = (
    df
        .groupby('camis')['score']
        .shift(1)
        .rolling(3)
        .mean()
        .reset_index(0, drop = True)
)

group_mean = df.groupby('camis')['score'].transform('mean')
df['last_score'] = df['last_score'].fillna(group_mean)
df['rolling_mean_3'] = df['rolling_mean_3'].fillna(group_mean)

#### Parse Date to create cyclical metrics

In [None]:
df['inspection_date'] = pd.to_datetime(df['inspection_date'])

df['year']          = df['inspection_date'].dt.year
df['month']         = df['inspection_date'].dt.month
df['dow']           = df['inspection_date'].dt.weekday
df['quart']         = df['inspection_date'].dt.quarter

df['is_weekend']    = df['dow'].isin([5, 6]).astype(int)

tau = 2 * np.pi
df = cycle_dates(df, 'dow',     (lambda cell: cell / 7))
df = cycle_dates(df, 'month',   (lambda cell: (cell - 1) / 12))
df = cycle_dates(df, 'quart', (lambda cell: (cell - 1) / 4))

#### Drop useless columns after parsing - Bin Targets for ordinal classification

In [None]:
df.drop(columns = ['month', 'dow', 'quart', 'camis', 'latitude', 'longitude'], inplace = True)

bins = [-1, 13, 27, float('inf')]
labels = [0, 1, 2]  # A=0, B=1, C=2
df['grade'] = pd.cut(df['score'], bins = bins, labels = labels).astype(int)
df.drop(columns = ['score'], inplace = True)

## Auto-Tuning

### Split via a hard date for testing

In [None]:
# Create hold out set
cutoff_date = pd.to_datetime('2025-02-17')
training_df = df[df['inspection_date'] <  cutoff_date]
testing_df  = df[df['inspection_date'] >= cutoff_date]

X_tr = training_df.drop(columns = ['inspection_date', 'grade'])
y_tr = training_df['grade']

X_te = testing_df.drop(columns = ['inspection_date', 'grade'])
y_te = testing_df['grade']

all_Xy = {
    'X_tr': X_tr,
    'y_tr': y_tr,
    'X_te': X_te,
    'y_te': y_te
}

## ML Scikit Based Preprocessing

### Setup the preprocess transformer and the pipeline object

In [None]:
cache = joblib.Memory('cache_dir', verbose = 0)

numeric_feats = ['last_score', 'rolling_mean_3']
cyclical_feats = ['dow_sin', 'dow_cos', 'month_sin', 'month_cos', 'quart_sin', 'quart_cos']
categorical_feats = [
    'boro', 'zipcode', 'cuisine', 'inspection_type', 
    'inspection_subtype', 'violation_code', 'action', 
    'critical_flag', 'census_tract', 'nta', 'year', 'is_weekend'
    ]

ml_prep = ColumnTransformer(
    [
        ('num', StandardScaler(), numeric_feats),
        ('cyc', 'passthrough', cyclical_feats),
        ('cat', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False), categorical_feats),
    ],
)
ml_prep.set_output(transform = 'pandas')

ml_pipe = Pipeline(
    [
        ('prep', ml_prep),
        ('clf', LogisticIT())
    ],
    memory = cache
)

### Grid Searching with mord Ordinal Classifications

In [None]:
# --- ordinal logistic models ---
mord_grid = {
        'clf':                  [LogisticIT()],
        'clf__alpha':           [1.0],
        'clf__max_iter':        [250, 500],
    }

kappa_scorer = make_scorer(cohen_kappa_score, weights = 'quadratic')

mord_search = GridSearchCV(
    ml_pipe,
    mord_grid,
    cv = TimeSeriesSplit(n_splits = 3),
    scoring = kappa_scorer,
    n_jobs = 7
)
mord_search.fit(X_tr, y_tr)

In [None]:
read_write_grid(mord_search, overwrite = True)
fast_est_scores(mord_search, all_Xy)

              precision    recall  f1-score   support

           0       0.75      0.88      0.81      6705
           1       0.76      0.60      0.67      5708
           2       0.88      0.88      0.88      9199

    accuracy                           0.81     21612
   macro avg       0.79      0.79      0.79     21612
weighted avg       0.81      0.81      0.80     21612

Best Params: {'clf': LogisticIT(), 'clf__alpha': 1.0, 'clf__max_iter': 500}
Best Score: 0.787219912137263
Test Accuracy: 0.7936631933476272


In [None]:
expand_csv().sort_values('rank_test_score').head()

Unnamed: 0,clf,mean_fit_time,std_fit_time,mean_score_time,std_score_time,mean_test_score,std_test_score,rank_test_score,clf__alpha,clf__max_iter
1,LogisticIT(),254.085735,92.49522,6.525711,4.867087,0.78722,0.012186,1,1.0,500.0
0,LogisticIT(),249.095713,94.74922,8.118491,5.193504,0.786243,0.012951,2,0.1,500.0


In [None]:
# learning_curve_plot('mord_lc', mord_search, all_Xy, cv = 3)

### Finding the best Random Forest Classifier

In [None]:
# --- random forest baseline ---
randf_grid = {
        'clf':                  [RandomForestClassifier(random_state = 42)],
        'clf__n_estimators':    [100, 250],
        'clf__max_depth':       [15],
        'clf__min_samples_leaf':[1, 3],
        'clf__class_weight':    ['balanced'],
    }

randf_search = GridSearchCV(
    ml_pipe,
    randf_grid,
    cv = TimeSeriesSplit(n_splits = 3),
    scoring = kappa_scorer,
    n_jobs = 7
)
randf_search.fit(X_tr, y_tr)

In [None]:
read_write_grid(randf_search)
fast_est_scores(randf_search, all_Xy)

              precision    recall  f1-score   support

           0       0.86      0.78      0.82      6705
           1       0.79      0.84      0.82      5708
           2       0.90      0.93      0.92      9199

    accuracy                           0.86     21612
   macro avg       0.85      0.85      0.85     21612
weighted avg       0.86      0.86      0.86     21612

Best Params: {'clf': RandomForestClassifier(random_state=42), 'clf__class_weight': 'balanced', 'clf__max_depth': 15, 'clf__min_samples_leaf': 1, 'clf__n_estimators': 250}
Best Score: 0.8023407493804496
Test Accuracy: 0.8175556421660533


In [None]:
expand_csv().sort_values('rank_test_score').head()

Unnamed: 0,clf,mean_fit_time,std_fit_time,mean_score_time,std_score_time,mean_test_score,std_test_score,rank_test_score,clf__alpha,clf__max_iter,clf__class_weight,clf__max_depth,clf__min_samples_leaf,clf__n_estimators
1,LogisticIT(),254.085735,92.49522,6.525711,4.867087,0.78722,0.012186,1,1.0,500.0,,,,
3,RandomForestClassifier(random_state=42),98.66883,46.320088,5.543895,0.435307,0.802341,0.011184,1,,,balanced,15.0,1.0,250.0
0,LogisticIT(),249.095713,94.74922,8.118491,5.193504,0.786243,0.012951,2,0.1,500.0,,,,
5,RandomForestClassifier(random_state=42),98.619535,35.43061,4.900587,1.321513,0.799888,0.010035,2,,,balanced,15.0,3.0,250.0
4,RandomForestClassifier(random_state=42),42.849815,22.040966,3.590553,0.62895,0.793957,0.014786,3,,,balanced,15.0,3.0,100.0


In [None]:
# learning_curve_plot('randf_lc', randf_search, all_Xy, cv = 3)

### Focus on LGBM

#### Custom Wrapper for LGBMRegressor for Ordinal Classification

In [None]:
# to_numpy = FunctionTransformer(lambda X: X.values, validate=False)
# def df_to_numpy(X):
#     return X.values

prep: ColumnTransformer = clone(ml_prep)
prep.set_output(transform = 'pandas')
lgbm_pipe = Pipeline(
    [
        ('prep', prep),
        ('to_numpy', FunctionTransformer(func = pd.DataFrame.to_numpy, validate = False)),
        ('clf', LGBMOrdinal())
    ],
    memory = cache
)
lgbm_grid = [
    # --- gradient-boosting regressor + round-to-ordinal trick ---
    {
        'clf':                          [LGBMOrdinal(random_state = 42, verbosity = -1)],
        'clf__n_estimators':            [100, 200],
        'clf__max_depth':               [5, 7, 9],
        'clf__learning_rate':           [0.1, 1.0],
        'clf__reg_lambda':              [0.1, 1],
    },
]

with suppress_warnings():
    lgbm_search = GridSearchCV(
        lgbm_pipe,
        lgbm_grid,
        cv = TimeSeriesSplit(n_splits = 3),
        scoring = kappa_scorer,
        n_jobs = 7
    )
    lgbm_search.fit(X_tr, y_tr)

In [None]:
read_write_grid(lgbm_search)
fast_est_scores(lgbm_search, all_Xy)

              precision    recall  f1-score   support

           0       0.89      0.84      0.86      6705
           1       0.72      0.90      0.80      5708
           2       0.95      0.85      0.90      9199

    accuracy                           0.86     21612
   macro avg       0.85      0.86      0.85     21612
weighted avg       0.87      0.86      0.86     21612

Best Params: {'clf': LGBMOrdinal(random_state=42, verbosity=-1), 'clf__learning_rate': 0.1, 'clf__max_depth': 9, 'clf__n_estimators': 100, 'clf__reg_lambda': 1}
Best Score: 0.8351751366378526
Test Accuracy: 0.8471790424336478


In [None]:
full_est_scores(lgbm_search, all_Xy)

Cohen Kappa Score: 0.7837930897199585
Gen Gap (acc): -0.0022786317281476842
MAE train: 0.17618526757366829
MAE test:  0.16597260781047565
QWK (Cohen’s kappa) train: 0.8432655577072156
QWK test:  0.8471790424336478


In [None]:
# learning_curve_plot('lgbm_lc', lgbm_search, all_Xy, cv = 3)

### Analyze based on CSV Results

In [None]:
expand_csv().sort_values('rank_test_score').head()

Unnamed: 0,clf,mean_fit_time,std_fit_time,mean_score_time,std_score_time,mean_test_score,std_test_score,rank_test_score,clf__alpha,clf__max_iter,clf__class_weight,clf__max_depth,clf__min_samples_leaf,clf__n_estimators,clf__learning_rate,clf__reg_lambda
1,LogisticIT(),254.085735,92.49522,6.525711,4.867087,0.78722,0.012186,1,1.0,500.0,,,,,,
3,RandomForestClassifier(random_state=42),98.66883,46.320088,5.543895,0.435307,0.802341,0.011184,1,,,balanced,15.0,1.0,250.0,,
15,"LGBMOrdinal(random_state=42, verbosity=-1)",24.419051,8.894413,3.30648,0.176057,0.835175,0.015302,1,,,,9.0,,100.0,0.1,1.0
0,LogisticIT(),249.095713,94.74922,8.118491,5.193504,0.786243,0.012951,2,0.1,500.0,,,,,,
14,"LGBMOrdinal(random_state=42, verbosity=-1)",22.432293,9.367585,3.430388,0.041439,0.835102,0.015455,2,,,,9.0,,100.0,0.1,0.1


### Attempt Training Directly

In [None]:
with suppress_warnings():

    estimators = [
        ('logit', mord_search.best_estimator_),
        ('randf', randf_search.best_estimator_),
        ('lgbm', lgbm_search.best_estimator_)
    ]

    s = StackingClassifier(
        estimators = estimators,
        final_estimator = RidgeClassifier(alpha = 1.0),
        cv = 3,
        passthrough = False
    )
    stack_prep: ColumnTransformer = clone(ml_prep)
    stack_prep.set_output(transform = 'pandas')
    
    stack = Pipeline(
        [
            ('prep', stack_prep),
            ('stack', s),
        ],
        memory = cache
    )
    stack.fit(X_tr, y_tr)

ValueError: A given column is not a column of the dataframe

In [None]:
with suppress_warnings():
    print(stack.score(X_te, y_te))

0.8855728299093096


In [None]:
# learning_curve_plot('stack_lc', stack, all_Xy, cv = 3)

In [None]:
with suppress_warnings():
    print(classification_report(y_te, stack.predict(X_te)))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86      6705
           1       0.89      0.84      0.86      5708
           2       0.91      0.92      0.92      9199

    accuracy                           0.89     21612
   macro avg       0.88      0.88      0.88     21612
weighted avg       0.89      0.89      0.89     21612



In [None]:
full_est_scores(stack, all_Xy)

Cohen Kappa Score: 0.824376265849629
Gen Gap (acc): -0.019066752506922913
MAE train: 0.17123239970340146
MAE test:  0.15912456042939108
QWK (Cohen’s kappa) train: 0.8271419891605662
QWK test:  0.8297378311697295


In [None]:
name = 'curry_inspector_20250514_v1'
model = f'{name}.joblib'
json_ = f'{name}_meta.json'

json_path = cfg.storage / json_
pipe_path = cfg.storage / model

_s = stack.named_steps['stack']
meta = {
    'model_file': model,
    'train_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'estimators': [name for name, _ in _s.estimators],
    'final_estimator': type(_s.final_estimator).__name__,
    'cv_folds': _s.cv
}
with open(json_path, 'w') as f:
    json.dump(meta, f, indent=2)


joblib.dump(stack, pipe_path, compress = ('gzip', 3))

['/Users/neelagarwal/Projects/DataClassRepos/CurryInspection/resources/ci_pipe_20250514.joblib']