# FurEverAI retrain: Simplified ANN preprocessing and expanded NB vectorizer

This notebook retrains two models while preserving the server contract and improving robustness:

- ANN Deep Match (predict_proba): a single scikit-learn Pipeline with ColumnTransformer for preprocessing (no double-encoding) and MLPClassifier; exports:
  - ann_pipeline.pkl
  - ann_features.json (locked feature schema matching get_dummies-style names)
  - ann_metrics.json (evaluation)

- Naive Bayes Auto-Tag: TF-IDF with expanded n-grams and vocabulary + OneVsRest MultinomialNB; exports:
  - nb_vectorizer.pkl
  - nb_model.pkl
  - tags_schema.json (class order, vocab size)

Artifacts are written to ../models/ so the Flask server can load them directly.

Tip: Keep scikit-learn==1.6.1 to match the server.

In [None]:
# Setup: capture environment and seeds
import os, sys, json, hashlib, random, platform
from pathlib import Path
import numpy as np

# Reproducibility: one fixed seed for both the stdlib and NumPy global RNGs
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Print versions
# Each optional import has its own try/except so that a missing package is
# reported on stdout instead of aborting the cell.
print('Python:', sys.version)
try:
    import sklearn
    print('scikit-learn:', sklearn.__version__)
except Exception as e:
    print('scikit-learn: not installed', e)
try:
    import pandas as pd
    print('pandas:', pd.__version__)
except Exception as e:
    print('pandas: not installed', e)
try:
    import torch
    print('torch:', torch.__version__)
except Exception:
    print('torch: not installed (optional)')
print('OS:', platform.platform())

# Freeze current environment for reproducibility
# The lock file is written next to the notebook (current working directory).
req_path = Path.cwd() / 'requirements_lock.txt'
try:
    import subprocess
    with open(req_path, 'w', encoding='utf-8') as f:
        # pip's stdout is redirected straight into the file; check=True makes
        # a non-zero exit status raise so it is reported below.
        subprocess.run([sys.executable, '-m', 'pip', 'freeze'], stdout=f, check=True)
    print('Wrote', req_path)
except Exception as e:
    # Best effort: a failed freeze is reported but does not stop the notebook.
    print('pip freeze failed:', e)

In [None]:
# Load data and create splits
from typing import List, Dict
import pandas as pd
from sklearn.model_selection import train_test_split

# Paths (customize these or drop CSVs in server/notebooks/data/)
# Everything is resolved relative to the current working directory.
ROOT = Path.cwd()
DATA_DIR = ROOT / 'data'
DATA_DIR.mkdir(exist_ok=True)
# Artifacts go to the `models` directory that sits beside the working directory.
MODELS_DIR = (ROOT / '..' / 'models').resolve()
MODELS_DIR.mkdir(exist_ok=True)

# Structured ANN dataset expectations (columns); a small synthetic fallback is created if CSV not present.
ann_csv = DATA_DIR / 'ann_training.csv'
# Expected feature columns
NUM_COLS = ['Activity_Level','Has_Kids','Pet_Energy_Level','Pet_Good_With_Kids']
CAT_COLS = ['Experience_Level','Pet_Size','Pet_Grooming_Needs']
TARGET_COL = 'Match_Label'
# Column order used to select from the CSV: numeric, categorical, then target.
ALL_COLS = NUM_COLS + CAT_COLS + [TARGET_COL]

if ann_csv.exists():
    ann_df = pd.read_csv(ann_csv)
    # Fail fast with the list of absent columns rather than a KeyError later.
    missing = [c for c in ALL_COLS if c not in ann_df.columns]
    if missing:
        raise ValueError(f'Missing ANN columns in {ann_csv}: {missing}')
    # Keep only the expected columns; any extra CSV columns are discarded.
    ann_df = ann_df[ALL_COLS].copy()
else:
    # Synthetic fallback
    # The draws below consume one seeded generator in order, so reordering
    # them would change the generated data.
    rng = np.random.default_rng(SEED)
    n = 1000
    ann_df = pd.DataFrame({
        # integers(low, high) excludes `high`: values are 1..3, 0..1, 1..3, 0..2
        'Activity_Level': rng.integers(1,4,size=n),
        'Has_Kids': rng.integers(0,2,size=n),
        'Pet_Energy_Level': rng.integers(1,4,size=n),
        'Pet_Good_With_Kids': rng.integers(0,3,size=n),
        'Experience_Level': rng.choice(['First_Time','Past_Owner','Expert'], size=n),
        'Pet_Size': rng.choice(['Small','Medium','Large'], size=n),
        'Pet_Grooming_Needs': rng.choice(['Low','Medium','High'], size=n),
    })
    # Simple target rule + noise
    # Label is 1 when both activity and pet energy are >= 2 ...
    y = ((ann_df['Activity_Level'] >= 2) & (ann_df['Pet_Energy_Level'] >= 2)).astype(int)
    # ... then roughly 10% of the labels are flipped via XOR with a random mask.
    y = (y ^ (rng.random(n) < 0.1)).astype(int)
    ann_df[TARGET_COL] = y
    # Written to disk, so later runs take the `exists()` branch above and
    # load this file instead of regenerating it.
    ann_df.to_csv(ann_csv, index=False)
    print('Created synthetic ANN dataset at', ann_csv)

X_ann = ann_df[NUM_COLS + CAT_COLS].copy()
y_ann = ann_df[TARGET_COL].astype(int).values

# 70% train, then the remaining 30% is halved into validation and test
# (15% each); both splits are stratified on the label.
X_train_ann, X_temp_ann, y_train_ann, y_temp_ann = train_test_split(
    X_ann, y_ann, test_size=0.3, random_state=SEED, stratify=y_ann)
X_valid_ann, X_test_ann, y_valid_ann, y_test_ann = train_test_split(
    X_temp_ann, y_temp_ann, test_size=0.5, random_state=SEED, stratify=y_temp_ann)

# Text/Tags dataset for NB (description -> tags); expects nb_training.csv with 'description' and 'tags' (comma-separated).
nb_csv = DATA_DIR / 'nb_training.csv'
if nb_csv.exists():
    nb_df = pd.read_csv(nb_csv)
    if 'description' not in nb_df.columns or 'tags' not in nb_df.columns:
        raise ValueError('nb_training.csv must have columns: description, tags (comma-separated)')
else:
    # Synthetic fallback: four hand-written rows.
    # NOTE(review): this is demo-sized only; after the 70/15/15 split used
    # for the NB model very few documents remain for training.
    nb_df = pd.DataFrame({
        'description': [
            'Energetic dog loves running and playing fetch, great with kids in apartments',
            'Calm senior cat enjoys quiet naps, perfect for apartment living',
            'Playful puppy high energy needs yard and active family',
            'Gentle cat independent and calm ideal for small apartment',
        ],
        'tags': [
            'high_energy,good_with_kids,apartment_friendly',
            'senior,apartment_friendly,calm',
            'high_energy,young,needs_yard',
            'calm,independent,apartment_friendly',
        ]
    })
    # Persisted like the ANN fallback, so later runs load it from disk.
    nb_df.to_csv(nb_csv, index=False)
    print('Created synthetic NB dataset at', nb_csv)

In [None]:
# ANN preprocessing pipeline: single encoding (no double-encoding)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

# Preprocessing: numeric columns are standardised and categorical columns are
# one-hot encoded inside one ColumnTransformer; unlisted columns are dropped.
scaler = StandardScaler()
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer(
    [
        ('num', scaler, NUM_COLS),
        ('cat', ohe, CAT_COLS),
    ],
    remainder='drop',
)

# Classifier: two hidden layers, seeded, with early stopping enabled.
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    max_iter=300,
    early_stopping=True,
    n_iter_no_change=10,
    random_state=SEED,
)

# Preprocessing and model travel together as a single estimator.
ann_pipeline = Pipeline([('pre', preprocessor), ('clf', mlp)])

# Only the training split is used for fitting; validation is scored afterwards.
ann_pipeline.fit(X_train_ann, y_train_ann)
valid_acc = ann_pipeline.score(X_valid_ann, y_valid_ann)
print('ANN valid accuracy:', round(valid_acc, 4))

In [None]:
# Validate no double-encoding; derive stable ann_features matching get_dummies style
import itertools

# The fitted ColumnTransformer reports prefixed output names ('num__', 'cat__');
# they are only used here to confirm no column was emitted twice.
ct: ColumnTransformer = ann_pipeline.named_steps['pre']
feat_prefixed = ct.get_feature_names_out()
assert len(feat_prefixed) == len(set(feat_prefixed)), 'Duplicate features after transform'

# Schema written to ann_features.json uses get_dummies-style names:
# numeric columns keep their name, categoricals become "<column>_<category>".
ohe: OneHotEncoder = ct.named_transformers_['cat']
cat_feature_names = [
    f"{col}_{str(c)}"
    for col, cats in zip(CAT_COLS, ohe.categories_)
    for c in cats
]

ann_features = [*NUM_COLS, *cat_feature_names]
assert len(ann_features) == len(set(ann_features)), 'Duplicate names in ann_features'
print('ann_features length:', len(ann_features))

In [None]:
# Persist ann_features.json atomically with hash
from filelock import FileLock
from datetime import datetime, timezone

features_path = MODELS_DIR / 'ann_features.json'
content = {
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and drop tzinfo so the value keeps its "<iso>Z" form
    # (an aware isoformat() would append "+00:00" before the "Z").
    'generated_at': datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + 'Z',
    'schema': ann_features,
}
# The digest covers the compact serialisation of `generated_at` + `schema`;
# it is computed before the 'sha256' key itself is added to the payload.
content_bytes = json.dumps(content, ensure_ascii=False, separators=(',',':')).encode('utf-8')
sha = hashlib.sha256(content_bytes).hexdigest()
content['sha256'] = sha

# Write to a sibling temp file and swap it in with os.replace so readers never
# observe a half-written schema; the lock file serialises concurrent writers.
lock = FileLock(str(features_path) + '.lock')
with lock:
    tmp = features_path.with_suffix('.tmp')
    try:
        with open(tmp, 'w', encoding='utf-8') as f:
            json.dump(content, f, ensure_ascii=False, indent=2)
        os.replace(tmp, features_path)
    finally:
        # After a successful replace the temp file is already gone; if the
        # write failed, remove the partial file instead of leaving it behind.
        tmp.unlink(missing_ok=True)
print('Wrote', features_path, 'sha256=', sha[:12], '…')

In [None]:
# ANN evaluation and artifact export
from sklearn import metrics
import joblib

# Hard predictions and positive-class probabilities for both held-out splits
y_pred_valid = ann_pipeline.predict(X_valid_ann)
y_proba_valid = ann_pipeline.predict_proba(X_valid_ann)[:, 1]
y_pred_test = ann_pipeline.predict(X_test_ann)
y_proba_test = ann_pipeline.predict_proba(X_test_ann)[:, 1]

# Same three metrics per split: accuracy and F1 on the hard predictions,
# ROC AUC on the positive-class probability.
metrics_payload = {
    split_name: {
        'accuracy': float(metrics.accuracy_score(y_true, y_pred)),
        'f1': float(metrics.f1_score(y_true, y_pred)),
        'roc_auc': float(metrics.roc_auc_score(y_true, y_proba)),
    }
    for split_name, y_true, y_pred, y_proba in (
        ('valid', y_valid_ann, y_pred_valid, y_proba_valid),
        ('test', y_test_ann, y_pred_test, y_proba_test),
    )
}
ann_metrics_path = MODELS_DIR / 'ann_metrics.json'
with open(ann_metrics_path, 'w', encoding='utf-8') as metrics_file:
    json.dump(metrics_payload, metrics_file, indent=2)
print('ANN metrics:', metrics_payload)

# Serialise the whole fitted pipeline (preprocessing + classifier) as one file
ann_pipeline_path = MODELS_DIR / 'ann_pipeline.pkl'
joblib.dump(ann_pipeline, ann_pipeline_path)
print('Saved', ann_pipeline_path)

In [None]:
# NB vectorizer expansion (n-grams, features) and training
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

# Prepare multi-label targets
def parse_tags(s: str) -> list:
    """Split a comma-separated tag string into a list of clean tag names.

    Whitespace around each tag is stripped and empty entries are dropped.

    Args:
        s: Raw tag cell, e.g. ``'calm, senior'``. Anything that is not a
            string (``None``, or the float NaN pandas produces for an empty
            CSV cell) is treated as "no tags".

    Returns:
        The tags in their original order; an empty list when there are none.
    """
    # A NaN cell is truthy, so `s or ''` would not guard it and `.split`
    # would raise AttributeError; check the type explicitly instead.
    if not isinstance(s, str):
        return []
    stripped = (part.strip() for part in s.split(','))
    return [tag for tag in stripped if tag]

# Turn each comma-separated tag string into a list, then into a binary
# indicator matrix whose column order is `mlb.classes_`.
nb_df['tag_list'] = nb_df['tags'].map(parse_tags)
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(nb_df['tag_list'])
texts = nb_df['description'].astype(str).tolist()

# Split deterministically: 70% train, remaining 30% halved into valid/test
X_train_txt, X_temp_txt, Y_train, Y_temp = train_test_split(texts, Y, test_size=0.3, random_state=SEED)
X_valid_txt, X_test_txt, Y_valid, Y_test = train_test_split(X_temp_txt, Y_temp, test_size=0.5, random_state=SEED)

# Vectorizer with richer features
NB_MAX_DF = 0.9
# TfidfVectorizer raises ValueError when max_df * n_documents is smaller than
# min_df. With max_df=0.9 that happens for fewer than 3 training documents
# (the synthetic fallback leaves only 2), so fall back to min_df=1 there.
# Any corpus with 3+ training documents keeps the intended min_df=2.
nb_min_df = 2 if NB_MAX_DF * len(X_train_txt) >= 2 else 1
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words='english', max_features=50000,
                             min_df=nb_min_df, max_df=NB_MAX_DF, sublinear_tf=True)

# Vocabulary and IDF weights are learned from the training split only
X_train_vec = vectorizer.fit_transform(X_train_txt)
X_valid_vec = vectorizer.transform(X_valid_txt)
X_test_vec = vectorizer.transform(X_test_txt)

# NB classifier: one MultinomialNB per tag
nb = OneVsRestClassifier(MultinomialNB(alpha=0.5))
nb.fit(X_train_vec, Y_train)

# Evaluate
from sklearn.metrics import f1_score, precision_score, recall_score
def eval_nb(X, Y_true, split, model=None):
    """Score a multi-label tagger on one split and print the result.

    A tag counts as predicted when its probability is at least 0.5.

    Args:
        X: Vectorized features for the split. They must come from the same
            vectorizer the evaluated model was fitted with.
        Y_true: True tag indicator matrix for the split (as produced by the
            MultiLabelBinarizer in this notebook).
        split: Label printed in front of the metrics, e.g. ``'valid'``.
        model: Fitted classifier exposing ``predict_proba``. Defaults to the
            notebook-level ``nb`` so existing three-argument calls behave as
            before; pass another model (e.g. a refit) to score that instead.

    Returns:
        dict with ``micro_f1``, ``macro_f1``, ``micro_precision`` and
        ``micro_recall`` as plain floats.
    """
    # Resolve the default at call time so the current notebook-level `nb` is used.
    clf = nb if model is None else model
    Y_pred = (clf.predict_proba(X) >= 0.5).astype(int)
    metrics_dict = {
        'micro_f1': float(f1_score(Y_true, Y_pred, average='micro', zero_division=0)),
        'macro_f1': float(f1_score(Y_true, Y_pred, average='macro', zero_division=0)),
        'micro_precision': float(precision_score(Y_true, Y_pred, average='micro', zero_division=0)),
        'micro_recall': float(recall_score(Y_true, Y_pred, average='micro', zero_division=0)),
    }
    print(split, metrics_dict)
    return metrics_dict

# Score the baseline NB model on validation first, then on test
nb_metrics = {
    split_name: eval_nb(X_split, Y_split, split_name)
    for split_name, X_split, Y_split in (
        ('valid', X_valid_vec, Y_valid),
        ('test', X_test_vec, Y_test),
    )
}

In [None]:
# Optional: quick hyperparameter search (keep grid tiny to avoid long runs)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline as SkPipeline

# Build a pipeline for search (vectorizer + OneVsRest NB)
search_pipe = SkPipeline(steps=[
    ('vec', TfidfVectorizer(stop_words='english', sublinear_tf=True)),
    ('clf', OneVsRestClassifier(MultinomialNB())),
])

param_grid = {
    'vec__ngram_range': [(1,1), (1,2)],
    'vec__max_features': [20000, 50000],
    'vec__min_df': [1, 2],
    'clf__estimator__alpha': [1.0, 0.5, 0.25],
}

# Use a very small CV for speed; scoring macro F1.
# The fold count cannot exceed the number of training documents, so it is
# capped for tiny corpora such as the synthetic fallback; 3+ documents keep cv=3.
n_cv_splits = min(3, len(X_train_txt))
search = GridSearchCV(search_pipe, param_grid, scoring='f1_macro', cv=n_cv_splits, n_jobs=-1, verbose=1)
search.fit(X_train_txt, Y_train)
best_params = search.best_params_
print('Best params:', best_params)
print('Best score:', search.best_score_)

# Refit a final model using best params (and then export vectorizer + model separately)
best_vec: TfidfVectorizer = TfidfVectorizer(stop_words='english', sublinear_tf=True,
                                            ngram_range=best_params['vec__ngram_range'],
                                            max_features=best_params['vec__max_features'],
                                            min_df=best_params['vec__min_df'])
X_train_vec2 = best_vec.fit_transform(X_train_txt)
X_valid_vec2 = best_vec.transform(X_valid_txt)
X_test_vec2 = best_vec.transform(X_test_txt)

nb2 = OneVsRestClassifier(MultinomialNB(alpha=best_params['clf__estimator__alpha']))
nb2.fit(X_train_vec2, Y_train)

print('Re-evaluating with best params:')
# Score the refit model `nb2` itself. The *_vec2 matrices come from
# `best_vec`, so scoring the earlier `nb` (fitted on a different vocabulary)
# on them would be meaningless or raise a feature-count error.
for split_name, X_split, Y_split in (
    ('valid(best)', X_valid_vec2, Y_valid),
    ('test(best)', X_test_vec2, Y_test),
):
    Y_pred_best = (nb2.predict_proba(X_split) >= 0.5).astype(int)
    print(split_name, {
        'micro_f1': float(f1_score(Y_split, Y_pred_best, average='micro', zero_division=0)),
        'macro_f1': float(f1_score(Y_split, Y_pred_best, average='macro', zero_division=0)),
        'micro_precision': float(precision_score(Y_split, Y_pred_best, average='micro', zero_division=0)),
        'micro_recall': float(recall_score(Y_split, Y_pred_best, average='micro', zero_division=0)),
    })

In [None]:
# Export NB model and vectorizer
import joblib as _joblib

# Choose the best refit if available, else the first one.
# The vectorizer and the model are selected as a PAIR: if the search cell was
# skipped or stopped half-way (e.g. `best_vec` exists but `nb2` does not),
# picking them independently would export a model that does not match the
# vectorizer's vocabulary.
if 'best_vec' in globals() and 'nb2' in globals():
    vec_to_save, nb_to_save = best_vec, nb2
else:
    vec_to_save, nb_to_save = vectorizer, nb

# Save artifacts
vec_path = MODELS_DIR / 'nb_vectorizer.pkl'
nb_path = MODELS_DIR / 'nb_model.pkl'
_joblib.dump(vec_to_save, vec_path)
_joblib.dump(nb_to_save, nb_path)

# Save schema: tag order matches the columns the model predicts, plus the
# vocabulary size (0 if the vectorizer has no fitted vocabulary).
tags_schema = {
    'classes': list(mlb.classes_),
    'vocab_size': len(getattr(vec_to_save, 'vocabulary_', {})),
}
with open(MODELS_DIR / 'tags_schema.json', 'w', encoding='utf-8') as f:
    json.dump(tags_schema, f, indent=2)
print('Saved NB artifacts to', MODELS_DIR)

In [None]:
# Version parity check: write out colab_versions.json (compare manually to server)
def get_versions():
    """Collect interpreter, platform and key library versions.

    Returns:
        dict with ``python`` and ``platform`` followed by ``scikit_learn``,
        ``numpy``, ``pandas``, ``scipy`` and ``torch``. A library that cannot
        be imported (or exposes no ``__version__``) is reported as ``None``.
    """
    from importlib import import_module

    out = {'python': sys.version.split()[0], 'platform': platform.platform()}
    # (output key, importable module name), in the order the keys are emitted
    libraries = (
        ('scikit_learn', 'sklearn'),
        ('numpy', 'numpy'),
        ('pandas', 'pandas'),
        ('scipy', 'scipy'),
        ('torch', 'torch'),
    )
    for key, module_name in libraries:
        try:
            out[key] = import_module(module_name).__version__
        except Exception:
            out[key] = None
    return out

# Snapshot the training environment next to the model artifacts
colab_versions = get_versions()
versions_path = MODELS_DIR / 'colab_versions.json'
with open(versions_path, 'w', encoding='utf-8') as versions_file:
    versions_file.write(json.dumps(colab_versions, indent=2))
print('colab_versions:', colab_versions)

In [None]:
# Export steps and artifact hashing
def sha256sum(p: Path) -> str:
    """Return the hex SHA-256 digest of the file at *p*.

    The file is read in 8 KiB chunks so it is never held in memory whole.
    """
    digest = hashlib.sha256()
    with open(p, 'rb') as stream:
        while chunk := stream.read(8192):
            digest.update(chunk)
    return digest.hexdigest()

# Every artifact this notebook may have produced, in manifest order
artifacts = [
    MODELS_DIR / artifact_name
    for artifact_name in (
        'ann_pipeline.pkl',
        'ann_features.json',
        'ann_metrics.json',
        'nb_vectorizer.pkl',
        'nb_model.pkl',
        'tags_schema.json',
        'colab_versions.json',
    )
]

# One "<sha256>  <file name>" line per artifact that exists on disk;
# artifacts that were not produced are left out of the manifest.
checksum_path = MODELS_DIR / 'checksums.sha256'
with open(checksum_path, 'w', encoding='utf-8') as manifest:
    manifest.writelines(
        f"{sha256sum(artifact)}  {artifact.name}\n"
        for artifact in artifacts
        if artifact.exists()
    )
print('Wrote checksums to', checksum_path)

In [None]:
# VS Code-style sanity tests (assertions)
# 1) No double-encoding
assert len(set(ann_features)) == len(ann_features), 'Duplicate in ann_features'

# 2) ANN pipeline predict_proba shape
proba_shape = ann_pipeline.predict_proba(X_valid_ann).shape
assert proba_shape[1] == 2, f'Expected binary proba, got shape {proba_shape}'

# 3) Vectorizer reproducibility
vec2 = TfidfVectorizer(ngram_range=vec_to_save.ngram_range, stop_words='english', max_features=vec_to_save.max_features, min_df=vec_to_save.min_df, max_df=vec_to_save.max_df, sublinear_tf=True)
vec2.fit(X_train_txt)
assert vec2.get_feature_names_out().shape == vec_to_save.get_feature_names_out().shape, 'Vectorizer feature size mismatch'
print('All sanity tests passed.')

## How to run

1. Place your CSVs (optional) under `server/notebooks/data/`:
   - `ann_training.csv` with columns: Activity_Level, Has_Kids, Pet_Energy_Level, Pet_Good_With_Kids, Experience_Level, Pet_Size, Pet_Grooming_Needs, Match_Label
   - `nb_training.csv` with columns: description, tags (comma-separated)
   If not provided, the notebook will create small synthetic datasets for demonstration.

2. Ensure versions match the server (recommended): scikit-learn==1.6.1
   You can install inside the notebook if needed:
   - In a cell: `%pip install scikit-learn==1.6.1 pandas==2.2.3 numpy==2.1.3 scipy==1.14.1 joblib==1.4.2`

3. Run cells 1→end. Artifacts will be written to `server/models/`:
   - ann_pipeline.pkl, ann_features.json, ann_metrics.json
   - nb_vectorizer.pkl, nb_model.pkl, tags_schema.json
   - colab_versions.json, checksums.sha256

4. Restart the Flask server (if it’s running) so it reloads the new models.

Notes:
- ann_features.json is deliberately get_dummies-style (no ColumnTransformer prefixes) to stay compatible with the server’s `get_dummies` alignment.
- The ANN uses scikit-learn MLPClassifier to preserve the `predict_proba` contract and avoid extra wrappers.
- The NB pipeline expands to 1–2 grams and a larger vocab for richer tags; tune the grid if you have more data.
