# 2. Modelling

# 2.1 Robust Principal Component Analysis (rPCA)

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

from pyod.models.hbos import HBOS
from sklearn.ensemble import RandomForestClassifier

In [54]:
# Show the dtype of every column in final_df
print(final_df.dtypes)

date                            datetime64[ns]
time                            datetime64[ns]
trx_id                                  object
trx_type                                object
trx_status                              object
category                                object
business_type_class                     object
wallet_number_from                     float64
wallet_number_to                         int64
amount                                 float64
description                             object
currency                                object
invoice_details                         object
ip                                      object
browser_environment                     object
amount_bin                              object
datetime                        datetime64[ns]
hour                                     int64
day_of_week                              int64
Wallet_to_change_frequency             float64
IP Change Frequency                    float64
First Transac

In [55]:
# Read the prepared transaction table from disk
final_df = pd.read_csv("Final_DF.csv")

# Parse the 'date' and 'time' columns into datetime values
for dt_col in ('date', 'time'):
    final_df[dt_col] = pd.to_datetime(final_df[dt_col])

# Keep only the numeric columns
numerical_features = final_df.select_dtypes(include=[np.number])

# Impute missing values with the mean of their own column
column_means = numerical_features.mean()
numerical_features.fillna(column_means, inplace=True)

# Standardise the numeric features
scaler = StandardScaler()
scaled_data = scaler.fit(numerical_features).transform(numerical_features)

  final_df['time'] = pd.to_datetime(final_df['time'])


In [None]:
# Build the feature table without the 'date' and 'time' columns
features = final_df.drop(columns=['date', 'time'])

# Integer-encode every object-typed column; the encoder is refitted for each column
label_encoder = LabelEncoder()
non_numeric_cols = features.select_dtypes(include=['object']).columns
for column_name in non_numeric_cols:
    encoded_values = label_encoder.fit_transform(features[column_name])
    features[column_name] = encoded_values

# Separate the 'flagged' target from the predictors
y = features['flagged']
X = features.drop(columns='flagged')

# Fit a random forest with its default settings
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X, y)

# Tabulate the per-feature importances, largest first
feature_importances = rf_classifier.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

print(importance_df)

# Horizontal bar chart of the importances
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.title('Feature Importance')
plt.ylabel('Feature')
plt.xlabel('Importance')
plt.show()

In [None]:
# Convert 'datetime' and 'First Transaction Date' columns to datetime type
final_df['datetime'] = pd.to_datetime(final_df['datetime'])
final_df['First Transaction Date'] = pd.to_datetime(final_df['First Transaction Date'])

# Convert datetime to Unix timestamps (whole seconds since 1970-01-01).
# Epoch arithmetic is used instead of `.astype(int) // 10**9` because the
# integer cast silently assumes nanosecond storage and a 64-bit `int`;
# subtracting the epoch and dividing by one second gives the same value
# whatever resolution pandas chose for the column.
# NOTE(review): assumes both columns are timezone-naive -- confirm.
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta(seconds=1)
final_df['datetime_unix'] = (final_df['datetime'] - unix_epoch) // one_second
final_df['first_transaction_unix'] = (final_df['First Transaction Date'] - unix_epoch) // one_second

In [None]:
# Print the dtype of each column in final_df
print(final_df.dtypes)

In [None]:
# RandomizedSearchCV is not part of the notebook-level imports, so it is
# imported in this cell (without it the search below raises NameError).
from sklearn.model_selection import RandomizedSearchCV

# Selection of features
selected_features = ['amount', 'datetime_unix', 'Wallet_to_change_frequency', 'Transaction Count', 'first_transaction_unix', 'Days Since First Transaction',
                     'Transaction Frequency','Adjusted Account Age','IP Change Frequency']

# Extract selected features
X = final_df[selected_features]

# Define scaler
scaler = StandardScaler()

# Define PCA
pca = PCA()

# Create pipeline: standardise first, then project with PCA
pipeline = Pipeline([
    ('scaler', scaler),
    ('pca', pca)
])

# Define parameter distribution for random search.
# The component count stops one short of the number of features: the 'arpack'
# solver only accepts n_components strictly below the feature count, and
# keeping every component would leave nothing to reconstruct anyway.
param_dist = {
    'pca__n_components': np.arange(2, len(selected_features)),  # Range of number of principal components
    'pca__svd_solver': ['auto', 'full', 'arpack', 'randomized']  # Solver for SVD
}

# Perform random search. No `scoring` is given, so candidates are ranked with
# the pipeline's own score method (for PCA: average log-likelihood).
random_search = RandomizedSearchCV(pipeline, param_dist, cv=5, n_iter=10, random_state=42)
random_search.fit(X)

# Output best parameters
print("Best parameters found by random search:")
print(random_search.best_params_)

In [None]:
# Selection of features
selected_features = ['amount', 'datetime_unix', 'Wallet_to_change_frequency', 'Transaction Count', 'first_transaction_unix', 'Days Since First Transaction',
                     'Transaction Frequency','Adjusted Account Age','IP Change Frequency']

# Extract selected features
X = final_df[selected_features]

# Standardise the features before PCA. The hyper-parameter search ran on a
# scaler + PCA pipeline, so the final model must see scaled data as well;
# on raw values the large-magnitude columns (e.g. the Unix timestamps)
# would dominate both the components and the reconstruction error.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA with the chosen hyperparameters.
# NOTE: sklearn's PCA is ordinary PCA, not a robust-PCA decomposition; the
# `rpca` name is kept so later cells that refer to it keep working.
rpca = PCA(n_components=6, svd_solver='full')
principal_components = rpca.fit_transform(X_scaled)

# Squared reconstruction error per row, measured in the scaled feature space.
# Wrapped in a Series on X's index so the outlier mask aligns with final_df.
reconstructed = rpca.inverse_transform(principal_components)
reconstruction_errors = pd.Series(
    np.square(X_scaled - reconstructed).sum(axis=1),
    index=X.index,
)

# Set a threshold for anomaly detection
threshold = np.percentile(reconstruction_errors, 95)  # for example, 95th percentile

# Label data points as outliers if their reconstruction error is above the threshold
is_outlier = reconstruction_errors > threshold

# Keep the first two components and the outlier flag on final_df: the scatter
# plot of the principal components reads exactly these three columns.
final_df['principal_component_1'] = principal_components[:, 0]
final_df['principal_component_2'] = principal_components[:, 1]
final_df['is_outlier'] = is_outlier

# Output the counts of outliers and inliers
num_outliers = np.sum(is_outlier)
num_inliers = X.shape[0] - num_outliers
print("Number of outliers:", num_outliers)
print("Number of inliers:", num_inliers)

# Generate classification report against the 'flagged' ground truth
y_true = final_df['flagged']
y_pred = is_outlier.astype(int)
report = classification_report(y_true, y_pred)
print(report)

In [None]:
import matplotlib.pyplot as plt

# Scatter of the first two principal components, coloured by the outlier flag
ax = plt.gca()
points = ax.scatter(
    final_df['principal_component_1'],
    final_df['principal_component_2'],
    c=final_df['is_outlier'],
    cmap='viridis',
)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('Outlier Detection')
plt.colorbar(points, ax=ax, label='Flagged as Outlier')
plt.show()

# 2.2 HBOS (Histogram-Based Outlier Score)

In [None]:
# Feature columns fed to the HBOS model
features = [
    'amount', 'datetime_unix', 'Wallet_to_change_frequency', 'Transaction Count',
    'first_transaction_unix', 'Days Since First Transaction',
    'Transaction Frequency', 'Adjusted Account Age', 'IP Change Frequency',
]

# HBOS detector configured with a contamination of 0.05
hbos_model = HBOS(contamination=0.05)

# Standardise the selected features
scaler = StandardScaler()
scaled_data = scaler.fit_transform(final_df[features])

# Fit the detector on the standardised data
hbos_model.fit(scaled_data)

# Anomaly score per row (higher values indicate more anomalous)
final_df['hbos_score'] = hbos_model.decision_function(scaled_data)

# Show the scores next to the features they were computed from
print(final_df[['hbos_score', *features]])

# Cut-off: the 95th percentile of the scores
threshold = final_df['hbos_score'].quantile(0.95)

# Rows scoring strictly above the cut-off are predicted as anomalies
above_threshold = final_df['hbos_score'] > threshold
final_df['predicted_label'] = above_threshold.astype('int')

# Ground truth from the existing 'flagged' column, cast to the same integer format
final_df['true_label'] = final_df['flagged'].astype('int')

report = classification_report(
    final_df['true_label'],
    final_df['predicted_label'],
    target_names=['False', 'True'],
)
print(report)

# Split the table into rows above and rows at or below the cut-off
outliers = final_df[above_threshold]
inliers = final_df[final_df['hbos_score'] <= threshold]

# Sizes of the two groups
num_outliers = len(outliers)
num_inliers = len(inliers)

# Tally of the predicted labels
class_counts = final_df['predicted_label'].value_counts()

# Report how many rows fell into each predicted class
print("Number of outliers:", class_counts[1])
print("Number of inliers:", class_counts[0])

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import average_precision_score
from pyod.models.hbos import HBOS

# Define the features used by the HBOS model
features = ['amount', 'datetime_unix', 'Wallet_to_change_frequency', 'Transaction Count', 'first_transaction_unix', 'Days Since First Transaction',
            'Transaction Frequency','Adjusted Account Age','IP Change Frequency']

# Initialize HBOS model
hbos_model = HBOS()

# Define parameter grid
param_grid = {
    'contamination': [0.01, 0.05, 0.1, 0.15]  # Adjust contamination factor
}

# 'average_precision' compares scores against true labels, so the ground
# truth has to be handed to fit(); without it no candidate can be scored.
cv_labels = final_df['flagged'].astype('int')

# Stratify the folds explicitly so every validation fold keeps some flagged
# rows, regardless of how sklearn would pick a splitter for this estimator.
cv_splitter = StratifiedKFold(n_splits=5)

# NOTE(review): in pyod, `contamination` appears to move only the label
# threshold, not the decision_function scores that average precision ranks,
# so all candidates may tie -- confirm, and consider a label-based metric
# such as 'f1' if they do.

# Perform grid search
grid_search = GridSearchCV(hbos_model, param_grid, cv=cv_splitter, scoring='average_precision')
grid_search.fit(final_df[features], cv_labels)

# Output best parameters
print("Best parameters found by grid search:")
print(grid_search.best_params_)

In [None]:
from pyod.models.hbos import HBOS
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Define the features used by the HBOS model
features = ['amount', 'datetime_unix', 'Wallet_to_change_frequency', 'Transaction Count', 'first_transaction_unix', 'Days Since First Transaction',
            'Transaction Frequency','Adjusted Account Age','IP Change Frequency']

# Initialize HBOS model with best parameters.
# The contamination is kept in a variable so the labelling threshold below
# is derived from the same value instead of a separate hard-coded quantile.
contamination = 0.01
hbos_model = HBOS(contamination=contamination)

# Standardized selected features
scaler = StandardScaler()
scaled_data = scaler.fit_transform(final_df[features])

# Fit the HBOS model to the standardized data
hbos_model.fit(scaled_data)

# Get the model's decision score (higher values indicate more anomalous)
final_df['hbos_score'] = hbos_model.decision_function(scaled_data)

# Print data frame with HBOS scores
print(final_df[['hbos_score'] + features])

# Determine the threshold for flagging outliers: the top `contamination`
# share of scores. A fixed 0.95 quantile here would ignore the chosen
# contamination and flag 5% of rows whatever value was selected.
threshold = final_df['hbos_score'].quantile(1 - contamination)

# Create a new column that marks scores above the threshold as exceptions
final_df['predicted_label'] = (final_df['hbos_score'] > threshold).astype('int')

# The real label column 'flagged' should already exist in final_df
# Convert them to the same format as the predicted labels
final_df['true_label'] = final_df['flagged'].astype('int')

# Generate classification report
report = classification_report(final_df['true_label'], final_df['predicted_label'], target_names=['False', 'True'])
print(report)

# Filter outliers and non-outliers based on thresholds
outliers = final_df[final_df['hbos_score'] > threshold]
inliers = final_df[final_df['hbos_score'] <= threshold]

# Count the number of each class
class_counts = final_df['predicted_label'].value_counts()

# Output the counts of each class; .get() reports 0 instead of raising
# KeyError when one of the two labels never occurs.
print("Number of outliers:", class_counts.get(1, 0))
print("Number of inliers:", class_counts.get(0, 0))

In [None]:
import matplotlib.pyplot as plt

# Distribution of the HBOS anomaly scores, with the cut-off drawn as a dashed line
ax = plt.gca()
ax.hist(final_df['hbos_score'], bins=20, edgecolor='black')
ax.axvline(x=threshold, color='red', linestyle='--', label='Threshold')
ax.set_xlabel('HBOS Score')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of HBOS Scores')
ax.legend()
plt.show()