# Baseline multi-output classifier

This notebook trains a simple baseline MultiOutput RandomForest classifier on the spectra dataset created by `src.preprocessing.loader`.
It reports common multi-output/multi-label metrics and saves the trained model, a metrics JSON, and an Excel metrics workbook to `src/models/`.

Notes:
- Expects the repository root to include `src/` and the raw data under `data/raw/.../ATR set 1_washed`.
- Uses spectra-based dataset (`prepare_ml_dataset_spectra`) from the loader module.
- Adjust `TEST_SIZE` and `RANDOM_STATE` (defined in the train/test split cell) to change the split; keep `RANDOM_STATE` fixed for reproducible results.

In [1]:
# Imports and repo-root discovery
import sys
from pathlib import Path
import importlib
import json

# Locate the repository root: the first directory, starting at the current
# working directory and walking up through its parents, that contains either
# a `src` entry or a `.git` entry. Falls back to the working directory itself
# when no such directory is found.
cwd = Path.cwd()
repo_root = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / 'src').exists() or (candidate / '.git').exists()
    ),
    cwd,
)

# Put the root at the front of sys.path (once) so `src.*` imports resolve.
repo_root_entry = str(repo_root)
if repo_root_entry not in sys.path:
    sys.path.insert(0, repo_root_entry)
print('Repo root:', repo_root)

# ML imports
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, hamming_loss, f1_score, classification_report, precision_recall_fscore_support
import joblib

# Import the project loader, then reload it so that edits made to the loader
# module during development are picked up on re-running this cell.
# Because reload() is the last expression in the cell, its return value
# (the module object) is echoed as the cell output.
import src.preprocessing.loader as loader_mod
importlib.reload(loader_mod)

Repo root: c:\Users\Mikey\Documents\GitHub\cmse492_project


<module 'src.preprocessing.loader' from 'c:\\Users\\Mikey\\Documents\\GitHub\\cmse492_project\\src\\preprocessing\\loader.py'>

In [2]:
# Load spectra-based dataset (X: n_samples x n_wavelengths, y: n_samples x n_targets)
data_root = repo_root / 'data' / 'raw' / 'Plastic Washing CMSE project CSV files' / 'ATR set 1_washed'
print('Data folder:', data_root)
X, y, feature_names, target_names = loader_mod.prepare_ml_dataset_spectra(data_root)
print('X shape:', X.shape if hasattr(X, 'shape') else None)
print('y shape:', y.shape if hasattr(y, 'shape') else None)

# Basic sanity checks: fail here, with a clear message, rather than later in
# the split or evaluation cells.
if X.size == 0 or y.size == 0:
    raise RuntimeError('No data loaded. Check data_root path and loader implementation.')
# Every spectrum needs exactly one row of targets.
if X.shape[0] != y.shape[0]:
    raise RuntimeError(
        f'Loader returned mismatched sample counts: X has {X.shape[0]} rows, y has {y.shape[0]} rows.'
    )
# The evaluation cell indexes the columns of y by position in target_names,
# so the two must agree in length.
if y.ndim == 2 and len(target_names) != y.shape[1]:
    raise RuntimeError(
        f'Loader returned {len(target_names)} target names for {y.shape[1]} target columns.'
    )

Data folder: c:\Users\Mikey\Documents\GitHub\cmse492_project\data\raw\Plastic Washing CMSE project CSV files\ATR set 1_washed
X shape: (122, 1868)
y shape: (122, 9)
X shape: (122, 1868)
y shape: (122, 9)


In [3]:
# Prepare train/test split
TEST_SIZE = 0.2
RANDOM_STATE = 42
# Number of leading columns of y treated as the one-hot polymer class.
# NOTE(review): assumes the loader puts the polymer indicators first - confirm.
N_POLYMER_COLS = 4

# Attempt to stratify by polymer class (first N_POLYMER_COLS one-hot targets)
# when possible; otherwise fall back to a plain random split.
stratify_labels = None
if y.shape[1] >= N_POLYMER_COLS:
    try:
        # NOTE: a row with no polymer flag set would get label 0 from argmax.
        candidate_labels = np.argmax(y[:, :N_POLYMER_COLS], axis=1)
        class_counts = np.unique(candidate_labels, return_counts=True)[1]
        # Stratification needs at least two classes, and train_test_split
        # raises ValueError if any class has fewer than 2 members.
        if len(class_counts) >= 2 and class_counts.min() >= 2:
            stratify_labels = candidate_labels
        else:
            print('Stratification disabled: polymer class counts are', class_counts.tolist())
    except Exception as exc:
        # Best-effort: keep going unstratified, but say why.
        print('Stratification disabled: could not derive polymer labels:', exc)
        stratify_labels = None

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=stratify_labels
)
print('Train/test sizes:', X_train.shape[0], X_test.shape[0])

Train/test sizes: 97 25


In [4]:
# Baseline model: a RandomForest (100 trees, all CPU cores, fixed seed)
# wrapped in MultiOutputClassifier, which fits one copy per target column.
base_forest = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
clf = MultiOutputClassifier(base_forest)

print('Fitting classifier...')
clf.fit(X_train, y_train)
print('Done training')

Fitting classifier...
Done training
Done training


In [7]:
# Evaluate on test set
import pandas as pd

y_pred = clf.predict(X_test)

metrics = {}
# Exact-match accuracy: a sample counts as correct only if all targets are correct
metrics['exact_match_accuracy'] = float(accuracy_score(y_test, y_pred))
# Hamming loss: fraction of individual target labels predicted incorrectly
metrics['hamming_loss'] = float(hamming_loss(y_test, y_pred))
# Macro F1 across all binary targets. Kept best-effort (None on failure),
# but the reason is reported instead of being silently dropped.
try:
    metrics['f1_macro'] = float(f1_score(y_test, y_pred, average='macro', zero_division=0))
except Exception as exc:
    print('Could not compute macro F1:', exc)
    metrics['f1_macro'] = None

print('Exact-match accuracy:', metrics['exact_match_accuracy'])
print('Hamming loss:', metrics['hamming_loss'])
print('Macro F1:', metrics['f1_macro'])

# Per-target precision / recall / F1 / support
per_target = {}
for i, tname in enumerate(target_names):
    y_true_i = y_test[:, i]
    y_pred_i = y_pred[:, i]
    # With average='binary' sklearn returns support=None, so only the first
    # three values are kept; support is counted from the true labels instead.
    p, r, f = precision_recall_fscore_support(
        y_true_i, y_pred_i, average='binary', zero_division=0
    )[:3]
    per_target[tname] = {
        'precision': float(p),
        'recall': float(r),
        'f1': float(f),
        'support': int(np.sum(y_true_i)),  # number of positive test samples
    }

metrics['per_target'] = per_target

# Print a short per-target table. orient='index' puts one row per target and
# keeps each column's dtype, so 'support' stays an integer.
tbl = pd.DataFrame.from_dict(per_target, orient='index')
print('\nPer-target metrics:')
print(tbl)

# Detailed classification reports per target (binary)
for i, tname in enumerate(target_names):
    print(f'-- Classification report for target: {tname} --')
    try:
        print(classification_report(y_test[:, i], y_pred[:, i], zero_division=0))
    except Exception as e:
        print('Could not produce classification_report for', tname, e)


Exact-match accuracy: 0.2
Hamming loss: 0.21333333333333335
Macro F1: 0.7238407760098764

Per-target metrics:
            precision    recall        f1  support
is_HDPE      0.800000  0.800000  0.800000      5.0
is_LDPE      1.000000  0.285714  0.444444      7.0
is_LLDPE     0.875000  0.875000  0.875000      8.0
is_PP        0.833333  1.000000  0.909091      5.0
has_BSA      0.625000  0.666667  0.645161     15.0
has_OIL      0.916667  0.916667  0.916667     12.0
has_GUAR     1.000000  0.416667  0.588235     12.0
has_CMC      0.705882  0.750000  0.727273     16.0
has_STARCH   0.636364  0.583333  0.608696     12.0
-- Classification report for target: is_HDPE --
              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95        20
         1.0       0.80      0.80      0.80         5

    accuracy                           0.92        25
   macro avg       0.88      0.88      0.88        25
weighted avg       0.92      0.92      0.92        25

-- Cl

In [8]:
# Persist the fitted classifier (joblib) and the metrics dict (JSON, 2-space
# indent) under src/models/, creating the directory if it does not exist.
out_dir = repo_root.joinpath('src', 'models')
out_dir.mkdir(parents=True, exist_ok=True)

model_path = out_dir.joinpath('baseline_multioutput_rf.joblib')
metrics_path = out_dir.joinpath('baseline_metrics.json')

joblib.dump(clf, model_path)
metrics_path.write_text(json.dumps(metrics, indent=2), encoding='utf8')

for label, saved_path in (('Saved model to', model_path), ('Saved metrics to', metrics_path)):
    print(label, saved_path)

Saved model to c:\Users\Mikey\Documents\GitHub\cmse492_project\src\models\baseline_multioutput_rf.joblib
Saved metrics to c:\Users\Mikey\Documents\GitHub\cmse492_project\src\models\baseline_metrics.json


## Next steps
- Consider cross-validation and hyperparameter search (GridSearchCV) for the RandomForest.
- Try simpler linear models or tree-based feature importance to inspect which wavelengths matter.
- Add a small unit test that asserts `baseline_multioutput_rf.joblib` and `baseline_metrics.json` exist after running.

In [9]:
# Save evaluation results to an Excel workbook (summary + per-target)
import pandas as pd

out_dir = repo_root / 'src' / 'models'
out_dir.mkdir(parents=True, exist_ok=True)
xlsx_path = out_dir / 'baseline_metrics.xlsx'

# Prepare summary dataframe: a single row holding the aggregate metrics
summary = {
    key: metrics.get(key)
    for key in ('exact_match_accuracy', 'hamming_loss', 'f1_macro')
}
summary_df = pd.DataFrame([summary])

# Per-target dataframe: built from metrics['per_target'] when it is populated;
# otherwise reuse the `tbl` DataFrame if it exists, else write an empty sheet.
if metrics.get('per_target'):
    # orient='index' gives one row per target and keeps 'support' as an integer
    per_target_df = pd.DataFrame.from_dict(metrics['per_target'], orient='index')
else:
    try:
        per_target_df = tbl.copy()
    except NameError:
        per_target_df = pd.DataFrame()


def _write_metrics_workbook(path, summary_frame, per_target_frame, engine=None):
    """Write the 'summary' and 'per_target' sheets to the workbook at `path`.

    `engine` is passed to pandas.ExcelWriter; None lets pandas choose.
    """
    with pd.ExcelWriter(path, engine=engine) as writer:
        summary_frame.to_excel(writer, sheet_name='summary', index=False)
        per_target_frame.to_excel(writer, sheet_name='per_target')


# Write to Excel with two sheets. Only a missing openpyxl triggers the
# fallback; other failures (e.g. the file being locked) are raised directly
# instead of being retried.
try:
    _write_metrics_workbook(xlsx_path, summary_df, per_target_df, engine='openpyxl')
    print('Saved evaluation workbook to', xlsx_path)
except ImportError as exc:
    print('openpyxl engine unavailable, retrying with the pandas default engine:', exc)
    _write_metrics_workbook(xlsx_path, summary_df, per_target_df)
    print('Saved evaluation workbook to', xlsx_path, '(fallback writer used)')


Saved evaluation workbook to c:\Users\Mikey\Documents\GitHub\cmse492_project\src\models\baseline_metrics.xlsx
