In [34]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import joblib
import os
import shutil

In [35]:
data_file = "../DATA/Final/encoded_parking_tickets.csv"
df = pd.read_csv(data_file)

# Features to use for modeling
features = df.columns.tolist()
features.remove('Year')
X_features = features.copy()

model_dir = "../OUTPUT/Final/models"
shutil.rmtree(model_dir)
os.makedirs(model_dir)

# split by time frame
def get_time_splits(df, start_year=2000, end_year=2023, chunk=5):
    splits = {}
    
    # All-time
    splits['all_time'] = df[df['Year'] <= end_year]
    splits['recent'] = df[df['Year'] == 2023]
    
    # 5-year chunks (descending)
    for y_end in range(end_year, start_year - 1, -chunk):
        y_start = max(y_end - chunk + 1, start_year)
        label = f"{y_start}-{y_end}"
        df_split = df[(df['Year'] >= y_start) & (df['Year'] <= y_end)]
        if len(df_split) > 2500:
            splits[label] = df_split # don't want small datasets for training
    
    # Test set: 2024
    test_df = df[df['Year'] == 2024]
    
    return splits, test_df

splits, test_df = get_time_splits(df, start_year=df['Year'].min(), end_year=df['Year'].max())

In [36]:
# train on each split
for label, split_df in splits.items():

    if split_df.empty:
        print(f"Skipping {label} — no data.")
        continue

    print(f"Training Isolation Forest for {label} ({len(split_df)} rows)...")
    
    X = split_df[X_features]

    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Train model
    model = IsolationForest(
        n_estimators=100,
        contamination=0.05,
        random_state=42
    )
    model.fit(X_scaled)
    
    # Save both model and scaler
    model_file = f"{model_dir}/isolation_forest_{label}.joblib"
    scaler_file = f"{model_dir}/scaler_{label}.joblib"
    joblib.dump(model, model_file)
    joblib.dump(scaler, scaler_file)
    
    print(f"Saved model to {model_file}")
    print(f"Saved scaler to {scaler_file}")

Training Isolation Forest for all_time (160454 rows)...
Saved model to ../OUTPUT/Final/models/isolation_forest_all_time.joblib
Saved scaler to ../OUTPUT/Final/models/scaler_all_time.joblib
Training Isolation Forest for recent (15555 rows)...
Saved model to ../OUTPUT/Final/models/isolation_forest_recent.joblib
Saved scaler to ../OUTPUT/Final/models/scaler_recent.joblib
Training Isolation Forest for 2020-2024 (52039 rows)...
Saved model to ../OUTPUT/Final/models/isolation_forest_2020-2024.joblib
Saved scaler to ../OUTPUT/Final/models/scaler_2020-2024.joblib
Training Isolation Forest for 2015-2019 (80344 rows)...
Saved model to ../OUTPUT/Final/models/isolation_forest_2015-2019.joblib
Saved scaler to ../OUTPUT/Final/models/scaler_2015-2019.joblib
Training Isolation Forest for 2010-2014 (28020 rows)...
Saved model to ../OUTPUT/Final/models/isolation_forest_2010-2014.joblib
Saved scaler to ../OUTPUT/Final/models/scaler_2010-2014.joblib
