In [30]:
import pandas as pd
import numpy as np
from sklearn.metrics import recall_score
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.mixture import GaussianMixture
import anomaly as a
from anomaly import DISCRETE_COLUMNS, CONTINUOUS_COLUMNS
import datetime

In [26]:
def introduce_anomalies(df, n_anomalies=100, random_state=None):
    """Return a copy of ``df`` with synthetic anomalies injected, plus labels.

    A random subset of rows is selected (without replacement) and modified:

    * every numeric column gets a large offset drawn from ``N(1000, 5)``;
    * every non-numeric column gets a value drawn uniformly (with
      replacement) from the values already present in that column.  Note
      that these are not necessarily *rare* values, and a row may by chance
      keep its original value.

    Rows are selected and labelled by *position*, so the function works for
    any index (non-contiguous, non-integer or duplicated labels).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    n_anomalies : int, default 100
        Number of rows to corrupt. Capped at ``len(df)`` so that sampling
        without replacement cannot fail on small frames.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. When ``None`` the
        global ``numpy.random`` state is used, so a preceding
        ``np.random.seed(...)`` still controls the result.

    Returns
    -------
    data_anomalous : pandas.DataFrame
        Copy of ``df`` with anomalies injected. Integer columns are returned
        as ``float64`` because the injected offsets are fractional.
    labels : numpy.ndarray
        Float array of length ``len(df)``; 1 marks an anomalous row and 0 a
        normal row, in positional order.
    """
    data_anomalous = df.copy()
    n_samples = df.shape[0]
    n_anomalies = max(0, min(int(n_anomalies), n_samples))

    # Create labels for the anomalies (1 = anomaly, 0 = normal)
    labels = np.zeros(n_samples)
    if n_anomalies == 0:
        return data_anomalous, labels

    # ``np.random`` (module) and ``RandomState`` expose the same
    # ``choice`` / ``normal`` API, so either can act as the generator.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Sample row *positions*, not index labels, so that labelling below is
    # correct regardless of what the DataFrame index looks like.
    anomaly_positions = rng.choice(n_samples, n_anomalies, replace=False)

    # Introduce anomalies in continuous features by adding large values
    continuous_cols = df.select_dtypes(include=[np.number]).columns
    for col in continuous_cols:
        # Fractional noise cannot be stored in an integer column; upcast
        # explicitly instead of relying on implicit (deprecated) upcasting.
        if pd.api.types.is_integer_dtype(data_anomalous[col].dtype):
            data_anomalous[col] = data_anomalous[col].astype("float64")
        col_pos = data_anomalous.columns.get_loc(col)
        data_anomalous.iloc[anomaly_positions, col_pos] += rng.normal(
            1000, 5, size=n_anomalies
        )

    # Introduce anomalies in discrete features by re-drawing the category
    # uniformly from the values observed in the original column
    discrete_cols = df.select_dtypes(exclude=[np.number]).columns
    for col in discrete_cols:
        unique_values = df[col].unique()
        anomalous_values = rng.choice(unique_values, size=n_anomalies, replace=True)
        col_pos = data_anomalous.columns.get_loc(col)
        data_anomalous.iloc[anomaly_positions, col_pos] = anomalous_values

    labels[anomaly_positions] = 1
    return data_anomalous, labels

In [93]:
# Load data and filter to one day of data
df = a.load_excel("final_data.xlsx")
df = df.loc[df["CreateDate"].dt.date == datetime.date(2024, 10, 28)]
# Reset to a 0..n-1 index so row labels and row positions coincide downstream
df = df.reset_index(drop=True)

# Introduce anomalies
# Anomaly injection draws from NumPy's global random state; seed it so that
# the injected rows (and therefore every recall value reported below) are
# reproducible under Restart Kernel -> Run All.
np.random.seed(42)
anomalous_df, true_labels = introduce_anomalies(df)
# Filter to relevant columns
all_columns = DISCRETE_COLUMNS + CONTINUOUS_COLUMNS
subset = anomalous_df[all_columns]

# One hot encoding for categorical features
features = pd.get_dummies(subset, columns=DISCRETE_COLUMNS)
# Scale continuous features (the scaler is fitted on the data that already
# contains the injected anomalies)
scaler = StandardScaler()
features[CONTINUOUS_COLUMNS] = scaler.fit_transform(features[CONTINUOUS_COLUMNS])

## Isolation Forest

In [None]:
# Fit an Isolation Forest on the full feature matrix and score the same rows.
model = IsolationForest(random_state=42, contamination=0.2)
# fit_predict fits the estimator and then predicts on the same data;
# the estimator flags outliers with -1.
predictions = model.fit_predict(features)
# Re-encode to the label convention used by true_labels (1 = anomaly, 0 = normal)
predictions = (predictions == -1).astype(int)

# Recall: share of injected anomalies that the model flagged
if_recall = recall_score(true_labels, predictions)
print(f"Model Recall: {if_recall:.2f}")

Model Recall: 0.88


## One-class SVM

In [95]:
# Fit a One-Class SVM (RBF kernel) on the full feature matrix and score the same rows.
model = svm.OneClassSVM(kernel="rbf", gamma=0.01, nu=0.1)
# fit_predict fits the estimator and then predicts on the same data;
# the estimator flags outliers with -1.
predictions = model.fit_predict(features)
# Re-encode to the label convention used by true_labels (1 = anomaly, 0 = normal)
predictions = (predictions == -1).astype(int)

# Recall: share of injected anomalies that the model flagged
ocsvm_recall = recall_score(true_labels, predictions)
print(f"Model Recall: {ocsvm_recall:.2f}")

Model Recall: 1.00


## Gaussian Mixture Model

In [96]:
# Fit a 3-component, full-covariance Gaussian mixture to the feature matrix.
model = GaussianMixture(
    n_components=3,
    covariance_type='full',
    random_state=42,
)
model.fit(features)

# Per-row log-likelihood under the fitted mixture; low values are unusual rows.
log_likelihood = model.score_samples(features)
# Flag rows whose log-likelihood falls below the 5th percentile.
threshold = np.percentile(log_likelihood, 5)
# 1 = anomaly, 0 = normal (same convention as true_labels)
predictions = (log_likelihood < threshold).astype(int)

# Recall: share of injected anomalies that the model flagged
gmm_recall = recall_score(true_labels, predictions)
print(f"Model Recall: {gmm_recall:.2f}")

Model Recall: 1.00


In [None]:
# Show results
print(f"Model Recall for Isolation Forest: {if_recall:.2f}")
print(f"Model Recall for One-Class SVM: {ocsvm_recall:.2f}")
print(f"Model Recall for Gaussian Mixture Model: {gmm_recall:.2f}")

Model Recall for Isolation Forest: 0.88
Model Recall for One-Class SVM: 1.00
Model Recall for Gaussian Mixture Model: 1.00
