# Inputs

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from pathlib import Path
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from category_encoders import TargetEncoder
import xgboost as xgb
import mlflow
import mlflow.pyfunc
from sklearn.metrics import recall_score, precision_score, roc_auc_score
import joblib

In [None]:
input_path = "../data/platinum/"
output_path = "../data/platinum/"

Path(output_path).mkdir(parents=True, exist_ok=True)

In [None]:
xgb_model = joblib.load(input_path + "xgb/model.pkl")
svm_model = joblib.load(input_path + "svm/model.pkl")
lstm_model = joblib.load(input_path + "lstm/model.pkl")

# Processing

In [4]:
train_df, test_df = train_test_split(all_data, test_size=0.10, random_state=42, stratify=all_data['anomaly'])

# Verify the results
print(f"Total rows: {len(all_data)}")
print(f"Training rows: {len(train_df)} (90%)")
print(f"Testing rows: {len(test_df)} (10%)")

Total rows: 1749494
Training rows: 1574544 (90%)
Testing rows: 174950 (10%)


In [5]:
df_anomaly = train_df[train_df['anomaly'] == 1] # The 40,000 anomalies
df_normal = train_df[train_df['anomaly'] == 0]  # The 1.7M normal cases

# 2. Calculate the required number of normal samples for a 70/30 split
# If 40k is 30%, then X is 70% -> (40,000 / 0.3) * 0.7
num_normal_needed = int((len(df_anomaly) / 0.3) * 0.7)

# 3. Downsample Class B (Normal)
df_normal_downsampled = df_normal.sample(n=num_normal_needed, random_state=42)

# 4. Combine them into your new training set
df_train_7030 = pd.concat([df_anomaly, df_normal_downsampled])

# 5. Shuffle the resulting dataframe
df_train_7030 = df_train_7030.sample(frac=1, random_state=42).reset_index(drop=True)

# Verification
print(f"New Training Set Size: {len(df_train_7030)}")
print(f"Class 1 (Anomaly): {len(df_anomaly)} ({len(df_anomaly)/len(df_train_7030):.1%})")
print(f"Class B (Normal): {len(df_normal_downsampled)} ({len(df_normal_downsampled)/len(df_train_7030):.1%})")

New Training Set Size: 111886
Class 1 (Anomaly): 33566 (30.0%)
Class B (Normal): 78320 (70.0%)


## SVM

In [13]:
train_svm = df_train_7030.dropna(inplace=False)
test_svm = test_df.dropna(inplace=False)
print(f"% of records droped for having NaN values: {(1 - (len(train_svm) / len(df_train_7030))) * 100:.2f}%")

% of records droped for having NaN values: 0.11%


In [None]:
# 1. Prepare the data (handling the numeric selection as you did)
X = train_svm.select_dtypes(include=['int64', 'float64']).drop(['anomaly', 'timestamp'], axis=1, errors='ignore')
y = train_svm['anomaly']

# 2. Define the Nystroem transformer 
# n_components=300 is a good balance between speed and accuracy. 
# Increasing this improves accuracy but slows down training.
nystroem_stage = Nystroem(kernel='poly', degree=3, coef0=1, n_components=300, random_state=42)

# 3. Use a fast linear solver
# SGDClassifier is optimized for large datasets (100k+ rows)
clf_stage = SGDClassifier(
    loss='hinge', 
    class_weight='balanced',  # <--- THE MAGIC FIX
    alpha=0.01, 
    max_iter=1000, 
    tol=1e-3,
    random_state=42
)

# 2. Re-build the same pipeline structure
poly_svm_balanced = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('nystroem', nystroem_stage),
    ('clf', clf_stage)
])

# 3. Re-train (This will still be fast!)
poly_svm_balanced.fit(X, y)

# 4. test
X_test = test_svm.select_dtypes(include=['int64', 'float64']).drop(['anomaly'], axis=1, errors='ignore')
y_test = test_svm['anomaly']
y_pred = poly_svm_balanced.predict(X_test)
print(classification_report(y_test, y_pred))

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.4f}")

y_scores = poly_svm_balanced.decision_function(X_test)
print(f"ROC AUC Score: {roc_auc_score(y_test, y_scores):.4f}")

              precision    recall  f1-score   support

           0       0.98      0.79      0.88    171015
           1       0.04      0.44      0.08      3728

    accuracy                           0.78    174743
   macro avg       0.51      0.61      0.48    174743
weighted avg       0.96      0.78      0.86    174743

Accuracy Score: 0.7811
ROC AUC Score: 0.6544


## XG Boost

In [20]:
X_train = df_train_7030.drop(['anomaly','timestamp'], axis=1)
y_train = df_train_7030['anomaly']

X_test = test_df.drop(['anomaly','timestamp'], axis=1)
y_test = test_df['anomaly']

In [21]:
# model parameters
num_neg = (y_train == 0).sum()
num_pos = (y_train == 1).sum()
scale_weight = num_neg / num_pos

xgb_params = { \
    'n_estimators' : 100,
    'max_depth' : 6,
    'learning_rate' : 0.1,
    'scale_pos_weight' : scale_weight,
    'tree_method' : 'hist',      
    'enable_categorical' : True, 
    'random_state' : 42,
    'use_label_encoder' : False,
    'eval_metric' : 'logloss'}

In [22]:
# 1. Identify categorical columns
cat_cols = ['building_month', 'building_hour', 'building_week_day_hour', 'primary_use']

# 2. Instantiate and Fit the Encoder
# 'smoothing' helps when a category only appears a few times
encoder = TargetEncoder(cols=cat_cols, smoothing=1.0)

# 3. Transform your data
# This converts the strings into 'anomaly probability' floats
X_train_encoded = encoder.fit_transform(X_train, y_train)
X_test_encoded = encoder.transform(X_test)

num_neg = (y == 0).sum()
num_pos = (y == 1).sum()
scale_weight = num_neg / num_pos

xgb_model = xgb.XGBClassifier(
    n_estimators= xgb_params['n_estimators'],
    max_depth= xgb_params['max_depth'],
    learning_rate= xgb_params['learning_rate'],
    scale_pos_weight= xgb_params['scale_pos_weight'], # From our previous calculation
    tree_method= xgb_params['tree_method'],      
    enable_categorical= xgb_params['enable_categorical'], 
    random_state= xgb_params['random_state'],
    use_label_encoder= xgb_params['use_label_encoder'],
    eval_metric= xgb_params['eval_metric']
)

# 4. Fit the model
# On 100k rows, this should take anywhere from 10 to 60 seconds
xgb_model.fit(X_train_encoded, y_train)

print("Model Training Complete!")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Model Training Complete!


In [24]:
X_test_xgb = X_test_encoded
y_pred_xgb = xgb_model.predict(X_test_xgb)
y_probs_xgb = xgb_model.predict_proba(X_test_xgb)[:, 1]

# Output

## SVM

In [35]:
joblib.dump(poly_svm_balanced, output_path + "svm/model.pkl")

['../data/platinum/svm/model.pkl']

## XG-Boost

In [None]:
joblib.dump(xgb_model, output_path + "xgb/model.pkl")

['../data/platinum/xgb/model.pkl']

## LSTM