# Introduction

This notebook aims to detect anomalies in financial markets using the Isolation Forest model.

In [123]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report, confusion_matrix

# Data Cleaning

In [99]:
# Load the 'EWS' sheet of the market-data workbook.
dataset = pd.read_excel('../data/FinancialMarketData.xlsx', sheet_name='EWS')

# Correlate numeric columns only. Calling corr() on the raw frame raises on
# pandas >= 2.0 when a non-numeric column (presumably 'Data', a date column —
# TODO confirm) is present; older versions silently skipped such columns, so
# the resulting matrix is the same.
correlation_matrix = dataset.select_dtypes(include=['number', 'bool']).corr()

# Columns whose absolute correlation with the target 'Y' is below 0.1.
# NaN correlations compare False here, so those columns are kept.
corr_with_target = correlation_matrix['Y']
low_cor_cols = corr_with_target[corr_with_target.abs() < 0.1].index

# Cleaning data: drop the weakly correlated columns, then 'Data' if it survived.
dataset_cleaned = dataset.drop(columns=low_cor_cols)
dataset_cleaned = dataset_cleaned.drop(columns=['Data'], errors='ignore')

# Compute Q1, Q3 and the IQR for each column.
Q1 = dataset_cleaned.quantile(0.25)
Q3 = dataset_cleaned.quantile(0.75)
IQR = Q3 - Q1

# Flag outliers in each column with the 1.5 * IQR rule.
# NOTE(review): the label column 'Y' is still part of dataset_cleaned here. If
# fewer than 25% of rows have Y == 1, its IQR is 0 and every Y == 1 row is
# counted as an outlier, which inflates the estimate below — confirm intended.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = (dataset_cleaned < lower_fence) | (dataset_cleaned > upper_fence)

# Count outliers per column, then express them as a share of all cells.
# This is a per-cell fraction, not a per-row (per-sample) fraction.
outliers_count = outliers.sum()
total_outliers = outliers_count.sum()
total_entries = dataset_cleaned.size  # rows * columns
contamination_estimation = total_outliers / total_entries
print(contamination_estimation)


0.016263695335050746


# Data Preparation

In [107]:
# Separate the predictors from the target: work on a copy so dataset_cleaned
# is left intact, and pull the 'Y' column out of that copy.
X = dataset_cleaned.copy()
y = X.pop('Y')
print(y)

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

0       0
1       0
2       0
3       0
4       1
       ..
1106    0
1107    0
1108    0
1109    0
1110    0
Name: Y, Length: 1111, dtype: int64
Shape of X_train: (777, 28)
Shape of X_test: (334, 28)


# Model Training

In [127]:
# Initialize the Isolation Forest model. It is fit on features only; the labels
# in y_test are used solely to score the predictions afterwards.
# NOTE(review): contamination=0.25 is hard-coded and is not derived from the
# contamination_estimation computed in the data-cleaning cell — confirm intended.
model = IsolationForest(n_estimators=700, contamination=0.25, random_state=40)

# Training the model on the training set
model.fit(X_train)

# Making predictions on the test set: predict() yields -1 for anomalies and
# +1 for inliers.
raw_pred = model.predict(X_test)

# Re-encode to the 0/1 scheme used by the labels (-1 -> 1 anomaly, +1 -> 0 normal).
y_pred = np.where(raw_pred == -1, 1, 0)

# Pin labels explicitly so the two target_names always line up with classes
# 0 and 1. Without it, classification_report raises ValueError when only one
# class appears across y_test and y_pred.
report = classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=["Normal", "Anomaly"],
)
print(report)

              precision    recall  f1-score   support

      Normal       0.87      0.83      0.85       266
     Anomaly       0.44      0.51      0.47        68

    accuracy                           0.77       334
   macro avg       0.65      0.67      0.66       334
weighted avg       0.78      0.77      0.77       334

