# ML Training Notebook

## Import dependencies

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import mord

from pathlib import Path
import datetime as dt
import pandas as pd
import csv
import json
import uuid

In [4]:
# Load the cleaned inspections table; the CSV sits two directories above
# the notebook's working directory.
path = Path('..') / '..' / 'cleaned_inspections.csv'
df = pd.read_csv(path)

### Modeling Notes:

#### A few options on binning:

1. Regression on the raw score
    - What it does: Predicts the exact numeric score (0–100+).
    - Why it’s powerful: Uses the full continuum of the target, so you’re not discarding any nuance.
    - Trade‑off: You have to choose your pass/fail or A/B/C thresholds after you fit the model (but you can even tune those thresholds on a hold‑out set).

2. Ordinal‑aware multi‑class
    - What it does: Predicts ordered buckets (e.g. A/B/C → 0/1/2) while explicitly modeling their order.
    - Why it helps: You still collapse the score into 3 groups, but your loss function “knows” that mis‑predicting A→B is a smaller error than A→C. That extra structure often boosts classification performance versus treating classes as unrelated.

3. Plain multi‑class
    - What it does: Predicts A, B, or C as independent labels.
    - Why it’s weaker: Loses both granularity (all within‑class differences) and ordering information.

4. Binary (fail/pass)
    - What it does: Predicts whether score ≥ 28.
    - Why it’s simplest: Straightforward, but you throw out nearly all of the score’s information (e.g. you treat a 27 the same as a 0).

#### A starting pathway for choosing the target approach:

1. **Start simple with the binary flag**

   * Create

     ```python
     df['failing'] = (df.score >= 28).astype(int)
     ```
   * Train a classifier (e.g. logistic regression or random forest) and evaluate ROC‑AUC / F1 on “fail.”
   * You’ll get a baseline that directly answers “who fails?” with minimal fuss.

2. **If you need more insight, step up to ordinal**

   * Map scores into A/B/C:

     ```python
     bins = [ -1, 13, 27, float('inf') ]
     labels = ['A','B','C']
     df['grade'] = pd.cut(df.score, bins=bins, labels=labels)
     ```
   * Use an **ordinal** model (e.g. `statsmodels`’ `OrderedModel` or `mord.LogisticIT`) or transform into multiple binary tasks (cumulative link).
   * This lets you exploit the fact that mis‐classifying A→B is “less wrong” than A→C.

3. **Regression if you really care about exact scores**

   * Predict `score` directly, then choose your cutoff(s) in post‑processing.
   * You can even treat the cutoff as a hyperparameter and tune it on your validation set for best classification metrics.


##### Why not jump straight to multi‑class/ordinal?

* **Complexity**: True ordinal methods require special loss functions or libraries.
* **Data needs**: More classes mean fewer examples per class, which can hurt performance.
* **Interpretability**: Stakeholders often just want “pass/fail.”


##### How ordinal vs. numeric thresholding differ

* A **plain multi‑class tree** treats A, B, C as unrelated labels. To make it use the order, you’d have to encode the grades as integers (e.g. A→0, B→1, C→2) and fit a regressor instead, accepting that the model is then really doing regression on those integers.
* A **true ordinal** approach (cumulative link models, ordinal forest, etc.) explicitly penalizes “distance” between predicted and true classes in its loss.


**Bottom line:**

* **If your goal is simply “which restaurants fail?”**, go with the binary flag at 28.
* **If you want richer predictions on letter grade**, do the ordinal multi‑class next (but be aware you’ll need an ordinal‐aware method to fully exploit ordering).
* **If you care about exact score predictions (and may want different thresholds later)**, build a regression model and threshold afterward.

## Preprocessing

#### Classes To Help

In [5]:
class ML_Helper:
    '''
    Prepare a DataFrame for modeling: optionally bin the numeric `score`
    target, then split features from target.

    The helper works on its own copy of the DataFrame it is given, so the
    caller's frame is not modified. At most one binning method can be
    applied; with no binning, `score` itself stays the target.
    '''

    def __init__(self, df: pd.DataFrame):
        self.df = df.copy()
        # Name of the current target column; the binning methods update it.
        self.target = 'score'

    def _ready_to_bin(self) -> bool:
        '''Return True while the raw score is still the target; otherwise warn.'''
        if self.target == 'score':
            return True
        print(
            f"Could not finish. The target has already been converted to "
            f"'{self.target}'; binning can only be applied once, to the raw score."
        )
        return False

    def pass_fail_bins(self) -> 'ML_Helper | None':
        '''
        Replace `score` with a binary `failing` column (1 when score >= 28).

        Returns:
            self, so calls can be chained, or None (after printing a warning)
            when the score has already been binned.
        '''
        if not self._ready_to_bin():
            return None
        # NOTE(review): a missing score compares False here and is therefore
        # labelled 0 (passing) -- confirm the cleaned data has no NaN scores.
        self.df['failing'] = (self.df[self.target] >= 28).astype(int)
        self.df.drop(columns = [self.target], inplace = True)
        self.target = 'failing'
        return self

    def ordinal_bins(self) -> 'ML_Helper | None':
        '''
        Replace `score` with an ordinal `grade` column.

        Scores in (-1, 13] map to 0 (A), (13, 27] to 1 (B) and anything above
        27 to 2 (C).

        Returns:
            self, so calls can be chained, or None (after printing a warning)
            when the score has already been binned.
        '''
        if not self._ready_to_bin():
            return None
        bins = [-1, 13, 27, float('inf')]
        labels = [0, 1, 2]  # A=0, B=1, C=2
        # Scores outside the bins (<= -1 or missing) become NaN in pd.cut and
        # make the int cast raise.
        self.df['grade'] = pd.cut(self.df[self.target], bins = bins, labels = labels).astype(int)
        self.df.drop(columns = [self.target], inplace = True)
        self.target = 'grade'
        return self

    def target_split(self) -> tuple[pd.DataFrame, pd.Series]:
        '''Return (X, y): every column except the current target, and the target.'''
        return self.df.drop(columns = [self.target]), self.df[self.target]

In [None]:
class Log_Training:
    '''
        Logs experiments to a per-user CSV. Each row contains:
        - timestamp (ISO 8601, UTC)
        - run_id (UUID4)
        - model_name (class name)
        - params (JSON dict)
        - metrics (JSON dict)
        - extra (JSON dict for anything else you want to track)
    '''

    _HEADER = ['timestamp','run_id','model_name','params','metrics','extra']

    def __init__(self, classmate_name: str, log_dir = Path('logs')):
        '''
        Create the log directory if needed and start `<classmate_name>_models.csv`
        with a header row when the file does not exist yet.

        classmate_name: used as the prefix of the log file name.
        log_dir: directory for the log file (a Path or a path string).
        '''
        self.classmate = classmate_name
        # Accept plain strings too, and create missing parent directories.
        log_dir = Path(log_dir)
        log_dir.mkdir(parents = True, exist_ok = True)
        self.log_path = log_dir / f'{classmate_name}_models.csv'

        # If the file doesn't exist, write a header
        if not self.log_path.is_file():
            with open(self.log_path, 'w', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow(self._HEADER)

    def log(self,
            model=None,
            *,
            model_name: str = None,
            params: dict = None,
            metrics: dict = None,
            extra: dict = None
        ):
        '''
        Write one experiment record.

        If you pass a scikit-learn-style `model`, its .get_params() will be recorded automatically.
        Otherwise, supply model_name and params explicitly.

        model: optional fitted model; its class name is the default model_name.
        model_name: overrides the name taken from `model`.
        params: overrides the parameters taken from `model`.
        metrics / extra: JSON-serialisable dicts; default to {}.

        Returns None.
        '''
        # Determine the name
        name = model_name or (model.__class__.__name__ if model is not None else '<unknown>')

        # Get params
        if params is None:
            if model is not None and hasattr(model, 'get_params'):
                # Call the method: storing it uncalled would log the bound
                # method's repr instead of the parameter dict.
                params = model.get_params()
            else:
                # Fall back to instance attributes; {} when there are none
                # (including model=None).
                params = dict(getattr(model, '__dict__', {}))

        # Default metrics/extra
        metrics = metrics or {}
        extra   = extra   or {}

        # Compact JSON; default=str stringifies values json cannot encode.
        compact = (',', ':')
        row = [
            dt.datetime.now(dt.timezone.utc).isoformat(),
            str(uuid.uuid4()),
            name,
            json.dumps(params,  separators=compact, default=str),
            json.dumps(metrics, separators=compact, default=str),
            json.dumps(extra,   separators=compact, default=str),
        ]

        with open(self.log_path, 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(row)

        return None

    def train_and_log_mord(self, X, y, test_size, **mord_kwargs):
        '''
        Fit a mord.LogisticIT model on a stratified train/test split, print
        its test accuracy and append the run to the log.

        X, y: features and target, as returned by ML_Helper.target_split().
        test_size: forwarded to train_test_split.
        mord_kwargs: forwarded to mord.LogisticIT.

        Returns None; the metrics are printed and written to the CSV.
        '''
        Xtr, Xte, ytr, yte = train_test_split(X, y, test_size = test_size, stratify = y)
        m = mord.LogisticIT(**mord_kwargs).fit(Xtr, ytr)
        preds = m.predict(Xte)
        metrics = {'accuracy': accuracy_score(yte, preds)}
        self.log(model=m, metrics=metrics)
        print(metrics)
        return None

#### Using ML_Helper

- Pass in a df to the constructor and save it to a variable of your choice: `ml = ML_Helper(df)`  

- Call either `ml.pass_fail_bins()` or `ml.ordinal_bins()`, or neither for full score regression. You cannot call both: once one has run, the other prints a warning and leaves the data unchanged.  
    + This step takes our target variable (the inspection score) and converts it either to a pass/fail flag or to a letter grade.  

- Lastly, call `X, y = ml.target_split()` to get an automatic split of the X and y variables.

### Run Trainings Here!

For every run, please create a `Log_Training` object and save the results to your CSV log: either call `train_and_log_mord()`, which trains a `mord.LogisticIT` model and logs it in one step, or call `log()` with your own fitted model and its metrics.

In [16]:
# Wrap the loaded DataFrame; ML_Helper works on its own copy of df.
ml = ML_Helper(df)
# Uncomment at most ONE of the two lines below to bin the target.
# Leave both commented out to keep the raw score as a regression target.
# ml.pass_fail_bins()
# ml.ordinal_bins()
# Split into the feature frame X and the target series y.
X, y = ml.target_split()

In [15]:
# Display the target series.
# NOTE(review): the saved output below is named 'failing', so it appears to
# come from a run where ml.pass_fail_bins() had been applied -- re-run to refresh.
y

0         1
1         0
2         0
3         0
4         0
         ..
264770    1
264771    0
264772    0
264773    0
264774    0
Name: failing, Length: 264775, dtype: int64