In [138]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import requests
import pickle
from datetime import datetime
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [139]:
def generate_synthetic_anomalies(normal_data, n_anomalies=None, method='gaussian'):
    """
    Generate synthetic anomalies from the per-feature statistics of normal data.

    Parameters:
    normal_data: 2-D numpy array or pandas DataFrame of normal data
        (rows are samples, columns are features)
    n_anomalies: number of anomalies to generate (default: 10% of normal data)
    method: one of
        'gaussian' - draw from a normal distribution centred on the feature
                     mean with 3x the feature standard deviation
        'uniform'  - draw uniformly from [min - 2*std, max + 2*std]; note that
                     this interval also contains the normal range
        'extreme'  - place every feature exactly 3 standard deviations above
                     or below its mean, with the sign chosen at random

    Returns:
    numpy array of shape (n_anomalies, n_features)

    Raises:
    ValueError: if `method` is not recognised, or if `normal_data` is empty
        or not 2-dimensional

    Sampling uses NumPy's global random state, so call np.random.seed(...)
    beforehand when reproducible output is needed.
    """
    # Reject typos up front instead of silently falling back to 'extreme'.
    valid_methods = ('gaussian', 'uniform', 'extreme')
    if method not in valid_methods:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {valid_methods}"
        )

    # Convert to numpy array if it's a DataFrame
    if isinstance(normal_data, pd.DataFrame):
        normal_data = normal_data.to_numpy()
    normal_data = np.asarray(normal_data, dtype=float)

    if normal_data.ndim != 2 or normal_data.shape[0] == 0:
        raise ValueError(
            "normal_data must be a non-empty 2-D array of shape "
            f"(n_samples, n_features); got shape {normal_data.shape}"
        )

    if n_anomalies is None:
        n_anomalies = len(normal_data) // 10

    shape = (n_anomalies, normal_data.shape[1])
    mean = np.mean(normal_data, axis=0)
    std = np.std(normal_data, axis=0)

    if method == 'gaussian':
        # Same centre as the normal data, but 3x the spread.
        return np.random.normal(mean, 3 * std, size=shape)

    if method == 'uniform':
        # Uniform over the observed range widened by 2 std on each side.
        min_vals = np.min(normal_data, axis=0)
        max_vals = np.max(normal_data, axis=0)
        return np.random.uniform(
            low=min_vals - 2 * std,
            high=max_vals + 2 * std,
            size=shape,
        )

    # 'extreme': every feature sits exactly 3 std away from its mean.
    multipliers = np.random.choice([-3, 3], size=shape)
    return mean + multipliers * std


In [140]:
def evaluate_with_synthetic_anomalies(normal_data, n_estimators=100, contamination=0.1, random_state=42):
    """
    Fit an Isolation Forest on normal data and score it against synthetic anomalies.

    The model is trained once on `normal_data`. For each generation method
    ('gaussian', 'uniform', 'extreme') a batch of synthetic anomalies is
    appended to that same normal data, and the model's predictions on the
    combined set are compared with the known labels.

    Parameters:
    normal_data: numpy array or pandas DataFrame of normal data
    n_estimators: number of trees passed to IsolationForest
    contamination: expected proportion of outliers; also sets the number of
        synthetic anomalies to int(len(normal_data) * contamination)
    random_state: seed applied to NumPy's global RNG and to the forest

    Returns:
    clf: trained IsolationForest model
    metrics: dict mapping each method name to a dict with the keys
        'accuracy', 'precision', 'recall' and 'f1'
    """
    # Seed NumPy's global generator so the synthetic anomalies are repeatable.
    np.random.seed(random_state)

    features = normal_data.to_numpy() if isinstance(normal_data, pd.DataFrame) else normal_data
    n_normal = len(features)
    anomaly_count = int(n_normal * contamination)

    clf = IsolationForest(
        n_estimators=n_estimators,
        contamination=contamination,
        random_state=random_state,
    )
    clf.fit(features)

    # Scorers that share the zero_division=0 convention.
    extra_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )

    metrics = {}
    for method in ('gaussian', 'uniform', 'extreme'):
        anomalies = generate_synthetic_anomalies(features, anomaly_count, method)
        combined = np.vstack([features, anomalies])

        # Ground truth: 0 for the original rows, 1 for the appended anomalies.
        expected = np.concatenate(
            [np.zeros(n_normal), np.ones(len(combined) - n_normal)]
        )

        # predict() yields -1 for outliers and 1 for inliers; remap to 1 / 0.
        flagged = np.where(clf.predict(combined) == -1, 1, 0)

        scores = {'accuracy': accuracy_score(expected, flagged)}
        for label, scorer in extra_scorers:
            scores[label] = scorer(expected, flagged, zero_division=0)
        metrics[method] = scores

    return clf, metrics

In [141]:
def fetch_transactions(url="http://localhost:3000/api/past_transactions", timeout=10):
    """
    Fetch past transactions from the backend API.

    Parameters:
    url: endpoint to query (defaults to the local backend)
    timeout: seconds to wait for the backend before giving up; without a
        timeout `requests.get` can block indefinitely

    Returns:
    The decoded JSON payload on HTTP 200, otherwise an empty list. Network
    failures (connection refused, timeout, ...) are reported the same way
    as non-200 responses: a message is printed and [] is returned.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching data from backend: {exc}")
        return []

    if response.status_code == 200:
        return response.json()

    print(f"Error fetching data from backend (HTTP {response.status_code})")
    return []

In [142]:
# Load past transactions from the backend into a DataFrame.
data = fetch_transactions()
df = pd.DataFrame(data)

# fetch_transactions() returns [] on failure; fail here with a clear message
# rather than an opaque KeyError on the 'date' column below.
if 'date' not in df.columns:
    raise ValueError("No transactions with a 'date' field were returned by the backend")

# "NaN-NaN-NaN" is the placeholder used for a missing date; it and any
# non-string value become missing so the row is dropped below. Wrapping in
# pd.to_datetime guarantees a datetime64 column for the `.dt` accessor.
df['date'] = pd.to_datetime(
    df['date'].apply(
        lambda x: datetime.strptime(x, '%Y-%m-%d')
        if isinstance(x, str) and x != "NaN-NaN-NaN"
        else None
    )
)

# Drop incomplete rows and rebuild a contiguous index so later positional
# joins (e.g. the one-hot encoded category columns) line up row-for-row.
df = df.dropna().reset_index(drop=True)

In [143]:
# Feature Engineering
# Calendar features derived from the parsed transaction date.
# NOTE(review): dates are parsed with '%Y-%m-%d' (no time component), so
# 'hour' is 0 for every row and carries no signal - confirm whether the
# backend can supply timestamps, otherwise consider dropping this feature.
df['hour'] = df['date'].dt.hour
df['day_of_week'] = df['date'].dt.dayofweek
df['time_of_month'] = df['date'].dt.day
# dayofweek is 0=Monday ... 6=Sunday, so 5 and 6 are the weekend.
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

# df['Type'] = df['Type'].map({"Debit": 0, "Credit": 1})

# Normalize 'amount'
scaler = StandardScaler()
df['scaled_amount'] = scaler.fit_transform(df[['amount']])

# Category labels in order of first appearance; used below as feature names.
categories = df["category"].unique()

encoder = OneHotEncoder(sparse_output=False)

# Reshape the category column into a 2D array as required by the encoder
category_reshaped = df['category'].values.reshape(-1, 1)

category_encoded = encoder.fit_transform(category_reshaped)

# Reuse df's index: rows were dropped earlier, so a fresh RangeIndex would
# make the column-wise concat misalign rows and introduce NaNs.
encoded_df = pd.DataFrame(
    category_encoded,
    columns=encoder.categories_[0],
    index=df.index,
)

df = pd.concat([df, encoded_df], axis=1).drop(columns=['category'])

# Select features for training
features = ['scaled_amount', 'hour', 'day_of_week', 'time_of_month', 'is_weekend']
features.extend(categories)
X = df[features]

# Guard against silent misalignment or missing values reaching the model.
assert len(X) == len(encoded_df), "feature matrix row count changed during encoding"
assert not X.isna().any().any(), "feature matrix contains missing values"

In [144]:
# Hyperparameter values to sweep.
n_estimators_list = [30, 50, 100, 150, 200, 250]
contamination_list = [0.05, 0.07, 0.1]

# Every (n_estimators, contamination) pair, with contamination varying fastest.
param_grid = [(n, c) for n in n_estimators_list for c in contamination_list]

for n_estimators, contamination in param_grid:
    print(f'For n_estimators: {n_estimators} and contamination of {contamination}')
    clf, metrics = evaluate_with_synthetic_anomalies(X, n_estimators, contamination)
    # Report accuracy for each anomaly-generation method.
    for key in metrics:
        print(key)
        print(f'accuracy: {metrics[key]["accuracy"]}')
        print()
    print()

For n_estimators: 30 and contamination of 0.05
gaussian
accuracy: 0.9463087248322147

uniform
accuracy: 0.9463087248322147

extreme
accuracy: 0.9463087248322147


For n_estimators: 30 and contamination of 0.07
gaussian
accuracy: 0.9337748344370861

uniform
accuracy: 0.9337748344370861

extreme
accuracy: 0.9337748344370861


For n_estimators: 30 and contamination of 0.1
gaussian
accuracy: 0.9038461538461539

uniform
accuracy: 0.9038461538461539

extreme
accuracy: 0.9038461538461539


For n_estimators: 50 and contamination of 0.05
gaussian
accuracy: 0.9463087248322147

uniform
accuracy: 0.9463087248322147

extreme
accuracy: 0.9463087248322147


For n_estimators: 50 and contamination of 0.07
gaussian
accuracy: 0.9337748344370861

uniform
accuracy: 0.9337748344370861

extreme
accuracy: 0.9337748344370861


For n_estimators: 50 and contamination of 0.1
gaussian
accuracy: 0.9038461538461539

uniform
accuracy: 0.9038461538461539

extreme
accuracy: 0.9038461538461539


For n_estimators: 100 an

**Selected hyperparameters:** `n_estimators = 100`, `contamination = 0.05` <br>
**Accuracy** (synthetic-anomaly evaluation): 94.63%