# Inputs

In [31]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from pathlib import Path
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from category_encoders import TargetEncoder
import xgboost as xgb
import mlflow
import mlflow.pyfunc
from sklearn.metrics import recall_score, precision_score, roc_auc_score
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Input and output locations, given relative to the current working directory.
input_path = "../data/gold/"
output_path = "../data/platinum/models/"
output_path_shap = "../data/platinum/shap/"

# Create both output folders (and any missing parents) before anything is written.
for artifact_dir in (output_path, output_path_shap):
    Path(artifact_dir).mkdir(parents=True, exist_ok=True)

In [9]:
# Read the feature table and keep only rows without any missing value.
# `raw_data` is left untouched so the cell can be re-run safely.
raw_data = pd.read_parquet(f"{input_path}anomaly_features.parquet", engine='pyarrow')
all_data = raw_data.dropna()

# Share of rows removed by the NaN filter, reported as a percentage.
dropped_share = 1 - (len(all_data) / len(raw_data))
print(f"% of records droped for having NaN values: {dropped_share * 100:.2f}%")

% of records droped for having NaN values: 0.12%


# Processing

In [10]:
# Hold out 10% of the rows for testing; stratifying on the label keeps the
# anomaly rate the same in both partitions.
train_df, test_df = train_test_split(
    all_data,
    test_size=0.10,
    stratify=all_data['anomaly'],
    random_state=42,
)

# Report the partition sizes.
print(
    f"Total rows: {len(all_data)}",
    f"Training rows: {len(train_df)} (90%)",
    f"Testing rows: {len(test_df)} (10%)",
    sep="\n",
)

Total rows: 1747348
Training rows: 1572613 (90%)
Testing rows: 174735 (10%)


In [11]:
# Separate the training rows by label.
df_anomaly = train_df[train_df['anomaly'] == 1]  # positive class (anomalies)
df_normal = train_df[train_df['anomaly'] == 0]   # negative class (normal readings)

# Every anomaly is kept and must make up 30% of the rebalanced set, so the
# number of normal rows to keep is (n_anomaly / 0.3) * 0.7, truncated to an int.
num_normal_needed = int((len(df_anomaly) / 0.3) * 0.7)

# Randomly downsample the normal class to that size (seeded for reproducibility).
df_normal_downsampled = df_normal.sample(n=num_normal_needed, random_state=42)

# Stack both classes, shuffle the rows and renumber the index.
df_train_7030 = (
    pd.concat([df_anomaly, df_normal_downsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Report the resulting class balance.
n_total = len(df_train_7030)
n_anomaly = len(df_anomaly)
n_normal = len(df_normal_downsampled)
print(f"New Training Set Size: {n_total}")
print(f"Class 1 (Anomaly): {n_anomaly} ({n_anomaly/n_total:.1%})")
print(f"Class B (Normal): {n_normal} ({n_normal/n_total:.1%})")

New Training Set Size: 111813
Class 1 (Anomaly): 33544 (30.0%)
Class B (Normal): 78269 (70.0%)


In [41]:
def prep_data_for_model(df, model_type, t_type='both', encoder = None, scaler = None):
    """Split ``df`` into features and target and preprocess them for one model family.

    The ``anomaly`` column is the target. ``anomaly`` and ``timestamp`` (when
    present) are removed from the features before any further processing.

    Args:
        df: DataFrame containing an ``anomaly`` column.
        model_type: ``"svm"``, ``"xgboost"`` or ``"knn"``.
        t_type: ``"train"`` fits new transformers on ``df``; ``"test"`` applies
            the transformers passed in. Ignored when ``model_type="svm"``.
        encoder: Fitted encoder; required for ``t_type="test"`` with
            ``"xgboost"`` and ``"knn"``.
        scaler: Fitted scaler; required for ``t_type="test"`` with ``"knn"``.

    Returns:
        * svm: ``(X, y)`` where ``X`` holds only int64/float64 feature columns.
        * xgboost: ``(X_encoded, y, encoder)`` for train, ``(X_encoded, y)`` for test.
        * knn: ``(X_scaled, y, encoder, scaler, feature_names)`` for train,
          ``(X_scaled, y)`` for test.

    Raises:
        ValueError: If ``model_type`` is unsupported, or (for xgboost/knn) if
            ``t_type`` is neither ``"train"`` nor ``"test"``, or if a required
            fitted transformer is missing for ``t_type="test"``.
    """
    X = df.drop(['anomaly', 'timestamp'], axis=1, errors='ignore')
    y = df['anomaly']

    cat_cols = ['building_month', 'building_hour', 'building_week_day_hour', 'primary_use']

    if model_type == "svm":
        # Select from X (not df) so the target column can never end up among
        # the features, whatever its dtype is.
        X = X.select_dtypes(include=['int64', 'float64'])
        return X, y

    if model_type not in ("xgboost", "knn"):
        raise ValueError("Unsupported model type. Choose 'svm', 'xgboost', or 'knn'.")

    # Fail loudly instead of silently returning None for an unknown mode.
    if t_type not in ("train", "test"):
        raise ValueError(
            f"t_type must be 'train' or 'test' for model_type '{model_type}', got {t_type!r}."
        )
    if t_type == "test":
        if encoder is None:
            raise ValueError("A fitted encoder is required when t_type='test'.")
        if model_type == "knn" and scaler is None:
            raise ValueError("A fitted scaler is required when t_type='test' for 'knn'.")

    # Both remaining model types target-encode the categorical columns.
    if t_type == "train":
        encoder = TargetEncoder(cols=cat_cols, smoothing=1.0)
        X_encoded = encoder.fit_transform(X, y)
    else:
        X_encoded = encoder.transform(X)

    if model_type == "xgboost":
        if t_type == "train":
            return X_encoded, y, encoder
        return X_encoded, y

    # knn: additionally standardise the encoded features.
    if t_type == "train":
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_encoded)
        return X_scaled, y, encoder, scaler, X.columns.tolist()
    X_scaled = scaler.transform(X_encoded)
    return X_scaled, y



## SVM

In [14]:
# 1. Prepare the data (handling the numeric selection as you did)
# Feature matrices for the SVM: rebalanced training set and the untouched hold-out set.
X_train_svm, y_train_svm = prep_data_for_model(df_train_7030, "svm")
X_test_svm, y_test_svm = prep_data_for_model(test_df, "svm")

# Kernel approximation: a degree-3 polynomial kernel mapped onto 300 components.
# More components give a closer approximation at the cost of training time.
nystroem_stage = Nystroem(
    kernel='poly',
    degree=3,
    coef0=1,
    n_components=300,
    random_state=42,
)

# Linear SVM (hinge loss) trained with SGD on the approximated feature space.
# class_weight='balanced' reweights the classes by their inverse frequency.
clf_stage = SGDClassifier(
    loss='hinge',
    class_weight='balanced',
    alpha=0.01,
    max_iter=1000,
    tol=1e-3,
    random_state=42,
)

# Standardise -> approximate kernel -> linear classifier.
svm_steps = [
    ('scaler', preprocessing.StandardScaler()),
    ('nystroem', nystroem_stage),
    ('clf', clf_stage),
]
poly_svm_balanced = Pipeline(svm_steps)
poly_svm_balanced.fit(X_train_svm, y_train_svm)

# Hard predictions for the report, raw decision values for the ROC AUC.
y_pred = poly_svm_balanced.predict(X_test_svm)
y_scores = poly_svm_balanced.decision_function(X_test_svm)

print(classification_report(y_test_svm, y_pred))

print(f"Accuracy Score: {accuracy_score(y_test_svm, y_pred):.4f}")

print(f"ROC AUC Score: {roc_auc_score(y_test_svm, y_scores):.4f}")

              precision    recall  f1-score   support

           0       0.98      0.79      0.88    171008
           1       0.04      0.44      0.08      3727

    accuracy                           0.78    174735
   macro avg       0.51      0.61      0.48    174735
weighted avg       0.96      0.78      0.86    174735

Accuracy Score: 0.7809
ROC AUC Score: 0.6566


## XGBoost

In [16]:
# Fit the target encoder on the rebalanced training set only, then reuse that
# fitted encoder on the hold-out set.
X_train_xgb, y_train_xgb, encoder_xgb = prep_data_for_model(
    df_train_7030, "xgboost", t_type='train'
)
X_test_xgb, y_test_xgb = prep_data_for_model(
    test_df, "xgboost", t_type='test', encoder=encoder_xgb
)

In [17]:
# model parameters
num_neg = (y_train_xgb == 0).sum()
num_pos = (y_train_xgb == 1).sum()
scale_weight = num_neg / num_pos

xgb_params = { \
    'n_estimators' : 100,
    'max_depth' : 6,
    'learning_rate' : 0.1,
    'scale_pos_weight' : scale_weight,
    'tree_method' : 'hist',      
    'enable_categorical' : True, 
    'random_state' : 42,
    'use_label_encoder' : False,
    'eval_metric' : 'logloss'}

In [19]:
# Build the classifier from the shared parameter dict.
# `use_label_encoder` is filtered out: the installed XGBoost reports it as
# unused ('Parameters: { "use_label_encoder" } are not used.'), so passing it
# only produces a warning.
model_kwargs = {
    name: value
    for name, value in xgb_params.items()
    if name != 'use_label_encoder'
}
xgb_model = xgb.XGBClassifier(**model_kwargs)

# Fit on the rebalanced, target-encoded training set.
xgb_model.fit(X_train_xgb, y_train_xgb)

print("Model Training Complete!")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Model Training Complete!


In [20]:
# Hard class predictions and positive-class probabilities on the hold-out set.
# (The previous self-assignment of X_test_xgb was a no-op and has been removed.)
y_pred_xgb = xgb_model.predict(X_test_xgb)
y_probs_xgb = xgb_model.predict_proba(X_test_xgb)[:, 1]

## KNN

In [42]:
# Fit the encoder and scaler on the rebalanced training set only, then apply
# the same fitted transformers to the hold-out set.
X_train_knn, y_train_knn, encoder_knn, scaler_knn, knn_cols = prep_data_for_model(
    df_train_7030, "knn", t_type='train'
)
X_test_knn, y_test_knn = prep_data_for_model(
    test_df, "knn", t_type='test', encoder=encoder_knn, scaler=scaler_knn
)

In [30]:
# KNeighborsClassifier, accuracy_score and classification_report come from the
# import cell at the top of the notebook; the duplicate imports that used to
# sit here were removed so all dependencies are declared in one place.

# Fit a k-nearest-neighbours classifier; k=5 is a starting point, not a tuned value.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_knn, y_train_knn)

# Predict on the hold-out set and report the metrics.
y_pred = knn.predict(X_test_knn)

print(f"Accuracy: {accuracy_score(y_test_knn, y_pred):.2f}")
print("\nClassification Report:")
print(classification_report(y_test_knn, y_pred))

Accuracy: 0.89

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.89      0.94    171008
           1       0.14      0.82      0.24      3727

    accuracy                           0.89    174735
   macro avg       0.57      0.85      0.59    174735
weighted avg       0.98      0.89      0.93    174735



# Output

## SVM

In [36]:
# Only the top-level output folders are created earlier in the notebook; the
# per-model "svm" sub-folders must exist before anything is written into them,
# otherwise the dump / parquet writes fail on a fresh checkout.
Path(output_path, 'svm').mkdir(parents=True, exist_ok=True)
Path(output_path_shap, 'svm').mkdir(parents=True, exist_ok=True)

x_train_svm_path_parquet = os.path.join(output_path_shap, 'svm/x_train.parquet')
y_train_svm_path_parquet = os.path.join(output_path_shap, 'svm/y_train.parquet')
x_test_svm_path_parquet = os.path.join(output_path_shap, 'svm/x_test.parquet')
y_test_svm_path_parquet = os.path.join(output_path_shap, 'svm/y_test.parquet')

# Persist the fitted pipeline (scaler + Nystroem + SGD classifier).
joblib.dump(poly_svm_balanced, output_path + "svm/model.pkl")

# Persist the exact feature/target tables the model was trained and tested on.
X_train_svm.to_parquet(x_train_svm_path_parquet, index=False)
y_train_svm.to_frame(name='anomaly').to_parquet(y_train_svm_path_parquet, index=False)
X_test_svm.to_parquet(x_test_svm_path_parquet, index=False)
y_test_svm.to_frame(name='anomaly').to_parquet(y_test_svm_path_parquet, index=False)

## XGBoost

In [37]:
# Only the top-level output folders are created earlier in the notebook; the
# per-model "xgb" sub-folders must exist before anything is written into them,
# otherwise the dump / parquet writes fail on a fresh checkout.
Path(output_path, 'xgb').mkdir(parents=True, exist_ok=True)
Path(output_path_shap, 'xgb').mkdir(parents=True, exist_ok=True)

x_train_xgb_path_parquet = os.path.join(output_path_shap, 'xgb/x_train.parquet')
y_train_xgb_path_parquet = os.path.join(output_path_shap, 'xgb/y_train.parquet')
x_test_xgb_path_parquet = os.path.join(output_path_shap, 'xgb/x_test.parquet')
y_test_xgb_path_parquet = os.path.join(output_path_shap, 'xgb/y_test.parquet')

# Persist the fitted classifier.
joblib.dump(xgb_model, output_path + "xgb/model.pkl")

# Persist the target-encoded feature tables and their labels.
X_train_xgb.to_parquet(x_train_xgb_path_parquet, index=False)
y_train_xgb.to_frame(name='anomaly').to_parquet(y_train_xgb_path_parquet, index=False)
X_test_xgb.to_parquet(x_test_xgb_path_parquet, index=False)
y_test_xgb.to_frame(name='anomaly').to_parquet(y_test_xgb_path_parquet, index=False)

## KNN

In [43]:
# Only the top-level output folders are created earlier in the notebook; the
# per-model "knn" sub-folders must exist before anything is written into them,
# otherwise the dump / parquet writes fail on a fresh checkout.
Path(output_path, 'knn').mkdir(parents=True, exist_ok=True)
Path(output_path_shap, 'knn').mkdir(parents=True, exist_ok=True)

x_train_knn_path_parquet = os.path.join(output_path_shap, 'knn/x_train.parquet')
y_train_knn_path_parquet = os.path.join(output_path_shap, 'knn/y_train.parquet')
x_test_knn_path_parquet = os.path.join(output_path_shap, 'knn/x_test.parquet')
y_test_knn_path_parquet = os.path.join(output_path_shap, 'knn/y_test.parquet')

# Persist the fitted classifier.
joblib.dump(knn, output_path + "knn/model.pkl")

# The scaled feature matrices are plain arrays, so the column names returned
# by the preprocessing step are re-attached before writing.
X_train_knn_df = pd.DataFrame(X_train_knn, columns=knn_cols)
X_train_knn_df.to_parquet(x_train_knn_path_parquet, index=False)

# Labels are written the same way as in the SVM and XGBoost output cells.
y_train_knn_df = y_train_knn.to_frame(name='anomaly')
y_train_knn_df.to_parquet(y_train_knn_path_parquet, index=False)

X_test_knn_df = pd.DataFrame(X_test_knn, columns=knn_cols)
X_test_knn_df.to_parquet(x_test_knn_path_parquet, index=False)

y_test_knn_df = y_test_knn.to_frame(name='anomaly')
y_test_knn_df.to_parquet(y_test_knn_path_parquet, index=False)