# Baselining: Claim Frequency -> Claim Occurrence (classification)

This notebook converts a numeric `ClaimFrequency` target into a binary `HasClaim` target
(0 = no claim, 1 = at least one claim) and runs a small set of baseline classifiers to
benchmark performance using accuracy, F1 and AUC.

In [1]:
# Cell 1: core imports and config
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# reproducibility & logging
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
logging.getLogger().setLevel(logging.INFO)

print('imports OK')


imports OK


In [2]:
# Standard Library Imports

# TabPFN and Extensions

try:
    from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import (
        AutoTabPFNClassifier,
    )

    from tabpfn import TabPFNClassifier, TabPFNRegressor
except ImportError:
    raise ImportError(
        "Warning: Could not import TabPFN / TabPFN extensions. Please run installation above and restart the session afterwards (Runtime > Restart Session)."
    )

# Data Science & Visualization
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

# Other ML Models
from catboost import CatBoostClassifier, CatBoostRegressor

# Notebook UI/Display
from IPython.display import Markdown, display
from rich.console import Console
from rich.panel import Panel
from rich.prompt import Prompt
from rich.rule import Rule
from sklearn.compose import make_column_selector, make_column_transformer

# Scikit-Learn: Data & Preprocessing
from sklearn.datasets import fetch_openml, load_breast_cancer

# Scikit-Learn: Models
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from xgboost import XGBClassifier, XGBRegressor

# This transformer will be used to handle categorical features for the baseline models
column_transformer = make_column_transformer(
    (
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        make_column_selector(dtype_include=["object", "category"]),
    ),
    remainder="passthrough",
)

INFO:root:Using TabPFN package
INFO:root:Using TabPFN package
  import pkg_resources


In [3]:
console = Console()

console.print(Panel.fit("[bold magenta]TabPFN Demo: Backend Selection[/bold magenta]"))
console.print("\nThis script can run TabPFN using one of two backends:")
console.print("  [bold]1. local:[/bold] Uses a local GPU (NVIDIA). Requires CUDA.")
console.print(
    "  [bold]2. client:[/bold] Uses the TabPFN API. Requires an internet connection and a free account."
)

backend = Prompt.ask(
    "\n[bold]Choose your backend[/bold] - If not field to enter is shown restart the cell.",
    choices=["client", "local"],
    default="client",
)

console.print(
    f"\n✅ You have selected the '[bold green]{backend}[/bold green]' backend."
)

console.print(Rule(f"[bold]Setting up [cyan]{backend}[/cyan] backend[/bold]"))

if backend == "local":
    console.print("Attempting local backend setup...")
    import torch

    if not torch.cuda.is_available():
        console.print(
            "[bold red]Error:[/bold red] GPU device not found. For fast training, please enable GPU.",
            style="red",
        )
        console.print(
            "In Colab: Go to [bold]Runtime -> Change runtime type -> Hardware accelerator -> GPU.[/bold]",
            style="yellow",
        )
        raise SystemError("GPU device not found.")
    console.print(
        "[bold green]✅ GPU is available.[/bold green] Importing local TabPFN library..."
    )
    from tabpfn import TabPFNClassifier, TabPFNRegressor

    console.print("[bold green]✅ TabPFN (local) imported successfully.[/bold green]")
elif backend == "client":
    console.print("Attempting client backend setup...")
    console.print("Importing TabPFN client library...")
    from tabpfn_client import TabPFNClassifier, TabPFNRegressor, init

    init()
    console.print("[bold green]✅ TabPFN (client) initialized.[/bold green]")

  Welcome Back! Found existing access token, reusing it for authentication.


In [4]:
# Cell 2: ensure baselining directory is importable (robust)
import sys, os, importlib
from pathlib import Path

nb_cwd = Path.cwd()
# prefer a local `baselining/` sibling of the current working dir
candidate = nb_cwd / 'baselining'
if candidate.exists():
    p = str(candidate)
    if p not in sys.path:
        sys.path.insert(0, p)
        print('Inserted to sys.path:', p)
    else:
        print('Already in sys.path:', p)
else:
    print('Warning: expected baselining dir at', candidate)

# quick import smoke-test for required modules
for m in ('data_loader','model_training','evaluation_metrics'):
    try:
        importlib.import_module(m)
        print('importable:', m)
    except Exception as e:
        print('not importable:', m, '-', e)


importable: data_loader
importable: model_training
importable: evaluation_metrics


In [5]:
# Cell 3: load and prepare classification data (sampled for faster iterations)
from pathlib import Path
import pandas as pd
from data_loader import preprocess_data   # keep preprocess helper if you want to reuse it

csv_path = Path("/Users/Scott/Documents Data Science/ADSWP/TabPFN/ADSWP Project/baselining/data/freMTPL2freq.csv")
if not csv_path.exists():
    # try original path from notebook (handles spaces vs dots)
    csv_path = Path("/Users/Scott/Documents/Data Science/ADSWP/TabPFN/ADSWP Project/baselining/data/freMTPL2freq.csv")
if not csv_path.exists():
    raise FileNotFoundError(f"CSV not found: {csv_path}")

df = pd.read_csv(csv_path)

# Sampling options
sample_frac = None   # not used when sample_n is set
sample_n = 500       # set to None to use full data

# Robustly select a claim-count column
candidate_names = [
    "ClaimFrequency", "ClaimNb", "NumberOfClaims", "NumberOfClaims",
    "Claims", "Claim_Number", "ClaimNumber", "ClaimCount"
]
target_col = next((c for c in candidate_names if c in df.columns), None)

if target_col is None:
    # fallback: any column containing 'claim' (case-insensitive)
    matches = [c for c in df.columns if "claim" in c.lower()]
    if len(matches) == 1:
        target_col = matches[0]
    elif len(matches) > 1:
        # prefer frequency/number/count variants if present
        for pref in ("freq", "nb", "number", "count"):
            found = next((m for m in matches if pref in m.lower()), None)
            if found:
                target_col = found
                break
        if target_col is None:
            target_col = matches[0]  # pick first if ambiguous

if target_col is None:
    raise ValueError(f"Could not find a claim-count column. Columns checked: {candidate_names} and any column containing 'claim'.")

# create binary target: HasClaim = 1 if claim-count > 0 else 0
df["HasClaim"] = (df[target_col] > 0).astype(int)

# take a sample for quicker iteration
if sample_n is not None:
    if sample_n < len(df):
        df = df.sample(n=sample_n, random_state=RANDOM_SEED).reset_index(drop=True)
else:
    if sample_frac is not None and 0 < sample_frac < 1:
        df = df.sample(frac=sample_frac, random_state=RANDOM_SEED).reset_index(drop=True)

print("Loaded CSV:", csv_path)
print("Using sample rows:", len(df), "target column:", target_col, "HasClaim distribution:", df["HasClaim"].value_counts().to_dict())

# split (reuse your existing preprocess_data)
X_train, X_test, y_train, y_test = preprocess_data(df, target="HasClaim")
print("Split done. X_train:", X_train.shape, "X_test:", X_test.shape)

Loaded CSV: /Users/Scott/Documents/Data Science/ADSWP/TabPFN/ADSWP Project/baselining/data/freMTPL2freq.csv
Using sample rows: 500 target column: ClaimNb HasClaim distribution: {0: 477, 1: 23}
Split done. X_train: (400, 12) X_test: (100, 12)


## Preprocessing pipeline

Define a ColumnTransformer for numeric and categorical features and a reusable `preprocessor` used by every model pipeline.


In [6]:
# Preprocessing pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# auto-detect feature types from X_train (assumes X_train exists)
numeric_cols = X_train.select_dtypes(include=['number']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

num_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

# construct OneHotEncoder in a version-compatible way
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
except TypeError:
    # newer sklearn uses `sparse_output` instead of `sparse`
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='__MISSING__')),
    ('ohe', ohe)
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, numeric_cols),
    ('cat', cat_pipe, cat_cols)
], remainder='drop')

print('Preprocessor ready — numeric:', len(numeric_cols), 'cat:', len(cat_cols))


Preprocessor ready — numeric: 8 cat: 4


## Models and pipelines

Create a small set of pipelines (preprocessor + estimator). Keep `class_weight='balanced'` where available.


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline

models = {
    'logistic': Pipeline([('pre', preprocessor), ('clf', LogisticRegression(max_iter=200, class_weight='balanced', random_state=RANDOM_SEED))]),
    'random_forest': Pipeline([('pre', preprocessor), ('clf', RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=RANDOM_SEED))]),
    'grad_boost': Pipeline([('pre', preprocessor), ('clf', GradientBoostingClassifier(n_estimators=100, random_state=RANDOM_SEED))])
}
print('Defined models:', list(models.keys()))


Defined models: ['logistic', 'random_forest', 'grad_boost']


## TabPFN baseline (optional)

Try to load the TabPFN classifier, fit on the training set, and add its predictions to the `preds` DataFrame. This cell is defensive: it skips if TabPFN isn't installed or if pretraining limits would be exceeded (but it can be forced).

In [126]:
### Installation (run once outside the notebook)

#This notebook expects the usual ML packages to be available in the environment (scikit-learn, pandas, numpy, joblib, matplotlib) and optionally TabPFN if you want to run the TabPFN baseline.

#Run these commands in your terminal (not in the notebook) to prepare the environment:

```bash
# create / activate your venv, then:
python -m pip install -U pip
python -m pip install -r requirements.txt  # if you maintain one
# or at minimum:
python -m pip install scikit-learn pandas numpy joblib matplotlib
# optional: TabPFN (use a scikit-learn compatible version, e.g. sklearn==1.2.2)
python -m pip install "scikit-learn==1.2.2"
python -m pip install tabpfn tabpfn-client
```

If you use Colab or other managed runtimes you may prefer the full install steps in the TabPFN repo; the notebook will check availability and skip TabPFN if it is not importable.

SyntaxError: invalid syntax (221135227.py, line 7)

In [145]:
# Standard Library Imports

# TabPFN and Extensions (defensive import)
try:
    from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import (
        AutoTabPFNClassifier,
    )
    from tabpfn import TabPFNClassifier, TabPFNRegressor
    tabpfn_available = True
    print('TabPFN import OK')
except Exception as e:
    tabpfn_available = False
    TabPFNClassifier = None
    TabPFNRegressor = None
    print('TabPFN not available - continuing without it. To enable TabPFN install it in your environment. Error:', e)

# Data Science & Visualization
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Optional: torch may not be available in all environments
try:
    import torch
except Exception:
    torch = None

# Other ML Models (optional imports)
try:
    from catboost import CatBoostClassifier, CatBoostRegressor
except Exception:
    CatBoostClassifier = None
    CatBoostRegressor = None

# Notebook UI/Display
from IPython.display import Markdown, display

# Useful helpers
from sklearn.compose import make_column_selector, make_column_transformer

# Scikit-Learn: Data & Preprocessing
from sklearn.datasets import fetch_openml, load_breast_cancer

# Scikit-Learn: Models
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Optional boosters
try:
    from xgboost import XGBClassifier, XGBRegressor
except Exception:
    XGBClassifier = None
    XGBRegressor = None

# This transformer will be used to handle categorical features for the baseline models
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_selector

column_transformer = make_column_transformer((
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    make_column_selector(dtype_include=["object", "category"]),
), remainder="passthrough")


TabPFN import OK


In [10]:
# TabPFN integration cell (isolated) - uses fit_and_eval helper and extra sanity checks; does not modify global preds/results_df
if not globals().get('tabpfn_available', False):
    print('TabPFN not available in this environment; skipping TabPFN step.')
else:
    try:
        from tabpfn import TabPFNClassifier
        from tabpfn.utils import infer_categorical_features
    except Exception as e:
        print('TabPFN import failed at runtime:', e)
        TabPFNClassifier = None

    if TabPFNClassifier is None:
        print('TabPFN not available; skipping.')
    else:
        # local results and preds
        import pandas as _pd
        preds_tabpfn = None
        results_tabpfn = _pd.DataFrame()  # will construct row safely below

        try:
            try:
                cat_ix = infer_categorical_features(X_train)
                print('Inferred categorical feature indices using tabpfn helper:', cat_ix)
            except Exception:
                cat_cols = list(X_train.select_dtypes(include=['object', 'category']).columns)
                cat_ix = [list(X_train.columns).index(c) for c in cat_cols] if len(cat_cols) > 0 else None
                print('Fallback categorical indices:', cat_ix)

            try:
                tpfn_pipe = TabPFNClassifier(device='auto', model_path='auto', categorical_features_indices=cat_ix)
            except TypeError:
                try:
                    tpfn_pipe = TabPFNClassifier(device='auto', model_path='auto', categorical_features=cat_ix)
                except TypeError:
                    tpfn_pipe = TabPFNClassifier(device='auto', model_path='auto')

            # Try to fit and eval; allow numpy fallback
            try:
                res = fit_and_eval('tabpfn', tpfn_pipe)
            except Exception as e_fit:
                print('TabPFN fit with DataFrame inputs failed:', e_fit)
                tpfn_pipe = TabPFNClassifier(device='auto', model_path='auto')
                res = fit_and_eval('tabpfn', tpfn_pipe, X_train=X_train.values, X_test=X_test.values, y_train=y_train.values, y_test=y_test.values)
                print('Retried TabPFN with numpy arrays')

            # Sanity checks on outputs and robust assignment
            proba = res.get('proba')
            preds_bin = res.get('preds_bin')
            import numpy as _np

            # helper to align prediction arrays with X_test / y_test
            def _align_preds(arr, X_ref, y_ref=None):
                a = _np.asarray(arr).ravel()
                n_a = len(a)
                n_ref = len(X_ref)
                if n_a == n_ref:
                    idx = getattr(X_ref, 'index', None)
                    return _pd.Series(a, index=idx, name='tabpfn'), a, None
                # mismatch: trim to min length and warn
                min_len = min(n_a, n_ref)
                print(f'Warning: TabPFN produced {n_a} predictions but X_test has {n_ref} rows; trimming to {min_len} to align for metrics.')
                idx = (X_ref.index[:min_len] if hasattr(X_ref, 'index') else None)
                return _pd.Series(a[:min_len], index=idx, name='tabpfn'), a[:min_len], ('trimmed', n_a, n_ref)

            # defensive function to finalize metrics into a one-row DataFrame
            def _finalize_metrics(metrics_dict):
                return _pd.DataFrame([metrics_dict], index=['tabpfn'])

            n_test = len(X_test)
            MIN_PRED_FRACTION = 0.01  # require at least 1% of test size, or MIN_PRED_ABS
            MIN_PRED_ABS = 100
            min_expected = max(int(MIN_PRED_FRACTION * n_test), MIN_PRED_ABS)

            if proba is not None:
                preds_series, p_aligned, info = _align_preds(proba, X_test, y_test)
                preds_tabpfn = preds_series
                y_aligned = _np.asarray(y_test).ravel()
                if info is not None:
                    # trim y to same length
                    y_aligned = y_aligned[:len(p_aligned)]

                # if predictions are much shorter than expected, skip computing metrics to avoid misleading results
                if len(p_aligned) < min_expected:
                    print(f"TabPFN returned only {len(p_aligned)} preds vs {n_test} rows; below threshold ({min_expected}) — skipping metric computation and marking as NaN.")
                    metrics = {'accuracy': float('nan'), 'f1': float('nan'), 'roc_auc': float('nan'), 'pr_auc': float('nan')}
                    results_tabpfn = _finalize_metrics(metrics)
                else:
                    # check degenerate probabilities
                    if len(set(p_aligned)) <= 1:
                        print('Warning: TabPFN returned degenerate probability array (all values equal); metrics will be NaN.')
                        metrics = {'accuracy': float('nan'), 'f1': float('nan'), 'roc_auc': float('nan'), 'pr_auc': float('nan')}
                        results_tabpfn = _finalize_metrics(metrics)
                    else:
                        from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
                        preds_bin_calc = (p_aligned >= 0.5).astype(int)
                        metrics = {
                            'accuracy': float(accuracy_score(y_aligned, preds_bin_calc)),
                            'f1': float(f1_score(y_aligned, preds_bin_calc)),
                            'roc_auc': float(roc_auc_score(y_aligned, p_aligned)) if len(set(p_aligned)) > 1 else float('nan'),
                            'pr_auc': float(average_precision_score(y_aligned, p_aligned)) if len(set(p_aligned)) > 1 else float('nan'),
                        }
                        results_tabpfn = _finalize_metrics(metrics)
            else:
                # hard labels path
                preds_series, p_aligned, info = _align_preds(preds_bin, X_test, y_test)
                preds_tabpfn = preds_series
                y_aligned = _np.asarray(y_test).ravel()
                if info is not None:
                    y_aligned = y_aligned[:len(p_aligned)]
                    print('Adjusted y_test length to match predictions for metric computation')

                # if predictions are much shorter than expected, skip
                if len(p_aligned) < min_expected:
                    print(f"TabPFN returned only {len(p_aligned)} preds vs {n_test} rows; below threshold ({min_expected}) — skipping metric computation and marking as NaN.")
                    metrics = {'accuracy': float('nan'), 'f1': float('nan'), 'roc_auc': float('nan'), 'pr_auc': float('nan')}
                    results_tabpfn = _finalize_metrics(metrics)
                else:
                    # check exact match to detect leakage
                    if _np.array_equal(_np.asarray(p_aligned).ravel(), y_aligned):
                        print('Warning: TabPFN hard-label predictions exactly match y_test — suspect leakage or mis-shape. Results marked as NaN.')
                        metrics = {'accuracy': float('nan'), 'f1': float('nan'), 'roc_auc': float('nan'), 'pr_auc': float('nan')}
                        results_tabpfn = _finalize_metrics(metrics)
                    else:
                        from sklearn.metrics import accuracy_score, f1_score
                        metrics = {
                            'accuracy': float(accuracy_score(y_aligned, p_aligned)),
                            'f1': float(f1_score(y_aligned, p_aligned)),
                            'roc_auc': float('nan'),
                            'pr_auc': float('nan')
                        }
                        results_tabpfn = _finalize_metrics(metrics)

            # Save artifacts only for TabPFN
            artifacts.mkdir(exist_ok=True)
            try:
                if preds_tabpfn is not None:
                    preds_tabpfn.to_csv(artifacts / 'tabpfn_preds.csv')
                results_tabpfn.to_csv(artifacts / 'tabpfn_results.csv')
                print('Saved TabPFN artifacts to', artifacts)
            except Exception as e_save:
                print('Could not save TabPFN artifacts:', e_save)

            print('TabPFN isolated metrics:')
            display(results_tabpfn)
            print('TabPFN preds head:')
            display((preds_tabpfn.head() if preds_tabpfn is not None else 'No preds'))

        except Exception as e:
            # helpful debug output
            print('TabPFN fit/predict failed:', e)
            try:
                print('res (debug):', {k: (type(v), getattr(v, 'shape', None), len(v) if hasattr(v, '__len__') else None) for k, v in (res.items() if isinstance(res, dict) else [])})
            except Exception:
                pass

TabPFN not available in this environment; skipping TabPFN step.
