In [None]:
pip install catboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from keras.models import Model
from keras.layers import Input, Dense
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, f1_score, precision_score, matthews_corrcoef, accuracy_score, classification_report, precision_recall_curve
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import IsolationForest
import lightgbm as lgb
from sklearn.svm import OneClassSVM
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Location of the anonymised AMARETTO dataset CSV inside the Kaggle input directory.
filename = "/kaggle/input/dataset/amaretto_dataset_anon.csv.csv"

# Read the whole file into memory as a single DataFrame.
df = pd.read_csv(filepath_or_buffer=filename)

In [None]:
# Shuffle every row with a fixed seed and renumber the index 0..n-1, so the
# positional split done further down is random but reproducible.
df = (
    df.sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

In [None]:
# Number of rows (transactions) in the shuffled frame.
len(df)

In [None]:
# 70/30 positional split of the already-shuffled frame.
# The cut point is derived from the actual number of rows instead of a
# hard-coded row count, and the two parts are complementary by construction:
# no row can be dropped or appear in both parts, whatever the dataset size.
TRAIN_FRACTION = 0.7
n_train_rows = int(len(df) * TRAIN_FRACTION)

df1 = df.iloc[:n_train_rows]   # training part (first 70 %)
df2 = df.iloc[n_train_rows:]   # test part (remaining rows)

In [None]:
# Relative frequency of each 'Anomaly' label, first for the training part and
# then for the test part, to check that both splits have a similar class balance.
for split_frame in (df1, df2):
    print(split_frame['Anomaly'].value_counts(normalize=True))

In [None]:
total_transactions_df1 = df1['Transaction ID'].count()
df1['num_transactions_proportion'] = 1 / total_transactions_df1

df1['EntryDate'] = pd.to_datetime(df1['EntryDate'])
df1['Weekday'] = df1['EntryDate'].dt.weekday
df1['Hour'] = df1['EntryDate'].dt.hour

df1['Morning'] = df1['Hour'].apply(lambda x: 1 if 6 <= x < 12 else 0)  # Mañana (6:00 - 11:59)
df1['Evening'] = df1['Hour'].apply(lambda x: 1 if 12 <= x < 18 else 0)  # Tarde (12:00 - 17:59)
df1['Night'] = df1['Hour'].apply(lambda x: 1 if 18 <= x < 24 or 0 <= x < 6 else 0)  # Noche (18:00 - 23:59), (00:00 - 5:59)

condition_round_amount = ((df1['Normalized Amount'].apply(lambda x: str(x).count('000.0')) >= 1) & (df1['Normalized Amount'] == df1['Normalized Amount'].round()))
df1['Round_Amount_Condition'] = condition_round_amount.astype(int)

condition_small_amount = ((df1['Normalized Amount'] < 1177.44) & (df1['Normalized Amount'] > 420.65) & (df1['Product Class'] == 'Cash in / out (withdrawal), Security in / out'))
df1['Small_Amount_Condition'] = condition_small_amount

condition_sell = df1['InputOutput'] == 'Sell'
df1['Sell_Condition'] = condition_sell
condition_buy = df1['InputOutput'] == 'Buy'
df1['Buy_Condition'] = condition_buy

inoutdelta = (df1['InputOutput'] == 'Buy').astype(int) - (df1['InputOutput'] == 'Sell').astype(int)
df1['InputOutput_Delta'] = inoutdelta

condition_sell_cash = df1['Normalized Amount'].where(condition_sell & (df1['Product Class'] == 'Cash in / out (withdrawal), Security in / out'), 0)
df1['condition_sell_cash'] = condition_sell_cash

condition_sell_cash2 = df1['Normalized Amount'].where(condition_sell & (df1['Product Type'] == 'SimpleTransfer'), 0)
df1['condition_sell_cash2'] = condition_sell_cash2

aggregated_dftr = df1.groupby(['Originator', 'Weekday', 'Hour', 'Morning', 'Evening', 'Night', 'Anomaly']).agg(
    num_transactions=('num_transactions_proportion', 'sum'),
    total_amount_traded=('Normalized Amount', 'mean'),
    transactions_count_small_amount=('Small_Amount_Condition', 'mean'),
    transactions_count_round_amount=('Round_Amount_Condition', 'mean'),
    transactions_count_amount_sell=('Sell_Condition', 'mean'),
    transactions_count_amount_buy=('Buy_Condition', 'mean'),
    cash_out_withdrawal_security_out=('condition_sell_cash', 'mean'),
    simpletranfer=('condition_sell_cash2', 'mean'),
    inputoutput_delta=('InputOutput_Delta', 'mean')
).reset_index()

df1.drop(columns=['Weekday', 'Hour', 'Morning', 'Evening', 'Night', 'Round_Amount_Condition', 'Small_Amount_Condition', 'Sell_Condition', 'Buy_Condition', 'InputOutput_Delta', 'condition_sell_cash', 'condition_sell_cash2','num_transactions_proportion'], inplace=True)

In [None]:
total_transactions_df2 = df2['Transaction ID'].count()
df2['num_transactions_proportion'] = 1 / total_transactions_df2

df2['EntryDate'] = pd.to_datetime(df2['EntryDate'])
df2['Weekday'] = df2['EntryDate'].dt.weekday
df2['Hour'] = df2['EntryDate'].dt.hour

df2['Morning'] = df2['Hour'].apply(lambda x: 1 if 6 <= x < 12 else 0)  # Mañana (6:00 - 11:59)
df2['Evening'] = df2['Hour'].apply(lambda x: 1 if 12 <= x < 18 else 0)  # Tarde (12:00 - 17:59)
df2['Night'] = df2['Hour'].apply(lambda x: 1 if 18 <= x < 24 or 0 <= x < 6 else 0)  # Noche (18:00 - 23:59), (00:00 - 5:59)

condition_round_amount = ((df2['Normalized Amount'].apply(lambda x: str(x).count('000.0')) >= 1) & (df2['Normalized Amount'] == df2['Normalized Amount'].round()))
df2['Round_Amount_Condition'] = condition_round_amount.astype(int)

condition_small_amount = ((df2['Normalized Amount'] < 1177.44) & (df2['Normalized Amount'] > 420.65) & (df2['Product Class'] == 'Cash in / out (withdrawal), Security in / out'))
df2['Small_Amount_Condition'] = condition_small_amount

condition_sell = df2['InputOutput'] == 'Sell'
df2['Sell_Condition'] = condition_sell
condition_buy = df2['InputOutput'] == 'Buy'
df2['Buy_Condition'] = condition_buy

inoutdelta = (df2['InputOutput'] == 'Buy').astype(int) - (df2['InputOutput'] == 'Sell').astype(int)
df2['InputOutput_Delta'] = inoutdelta

condition_sell_cash = df2['Normalized Amount'].where(condition_sell & (df2['Product Class'] == 'Cash in / out (withdrawal), Security in / out'), 0)
df2['condition_sell_cash'] = condition_sell_cash

condition_sell_cash2 = df2['Normalized Amount'].where(condition_sell & (df2['Product Type'] == 'SimpleTransfer'), 0)
df2['condition_sell_cash2'] = condition_sell_cash2

aggregated_dfte = df2.groupby(['Originator', 'Weekday', 'Hour', 'Morning', 'Evening', 'Night', 'Anomaly']).agg(
    num_transactions=('num_transactions_proportion', 'sum'),
    total_amount_traded=('Normalized Amount', 'mean'),
    transactions_count_small_amount=('Small_Amount_Condition', 'mean'),
    transactions_count_round_amount=('Round_Amount_Condition', 'mean'),
    transactions_count_amount_sell=('Sell_Condition', 'mean'),
    transactions_count_amount_buy=('Buy_Condition', 'mean'),
    cash_out_withdrawal_security_out=('condition_sell_cash', 'mean'),
    simpletranfer=('condition_sell_cash2', 'mean'),
    inputoutput_delta=('InputOutput_Delta', 'mean')
).reset_index()

df2.drop(columns=['Weekday', 'Hour', 'Morning', 'Evening', 'Night', 'Round_Amount_Condition', 'Small_Amount_Condition', 'Sell_Condition', 'Buy_Condition', 'InputOutput_Delta', 'condition_sell_cash', 'condition_sell_cash2','num_transactions_proportion'], inplace=True)

In [None]:
import gc

# The raw frame and its two splits are no longer needed once the aggregated
# tables exist; drop the references and ask the collector to reclaim the memory.
del df, df1, df2

gc.collect()

In [None]:
# 'Originator' is an identifier and 'Anomaly' is the target, so neither is used as a feature.
NON_FEATURE_COLUMNS = ['Originator', 'Anomaly']

# Training set
X_train = aggregated_dftr.drop(columns=NON_FEATURE_COLUMNS)
y_train = aggregated_dftr['Anomaly']

# Test set
X_test = aggregated_dfte.drop(columns=NON_FEATURE_COLUMNS)
y_test = aggregated_dfte['Anomaly']

# **ALGORITMOS SUPERVISADOS**

**RANDOM FOREST**

In [None]:
# Train a class-weighted random forest; only the fit is timed.
start_time = time.time()
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=30,class_weight='balanced')
rf_classifier.fit(X_train, y_train)
end_time = time.time()
y_pred = rf_classifier.predict(X_test)

# Headline metrics; precision and F1 are macro-averaged (every class weighs the same).
accuracy = accuracy_score(y_test, y_pred)

precision = precision_score(y_test, y_pred, average='macro')

f1 = f1_score(y_test, y_pred, average='macro')

# Confusion matrix: rows are the actual classes, columns the predicted ones.
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Matriz de Confusión')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
report = classification_report(y_test, y_pred)
print("Reporte de clasificación:\n", report)

# Pool the one-vs-rest TP/FP/FN/TN counts of every class (micro-averaging).
total_tp = total_fp = total_fn = total_tn = 0

# Iterate over the classes actually present in the matrix instead of a
# hard-coded count: a fixed range raises IndexError when fewer classes occur
# and silently ignores the extra ones when more occur.
for i in range(conf_matrix.shape[0]):
    TP = conf_matrix[i, i]
    FP = conf_matrix[:, i].sum() - TP
    FN = conf_matrix[i, :].sum() - TP
    TN = conf_matrix.sum() - (TP + FP + FN)

    total_tp += TP
    total_fp += FP
    total_fn += FN
    total_tn += TN

# Pooled rates. Because every sample is a TP or FN of exactly one class,
# the pooled true positive rate equals the overall accuracy.
TPR_global = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
FPR_global = total_fp / (total_fp + total_tn) if (total_fp + total_tn) > 0 else 0
FNR_global = total_fn / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
TNR_global = total_tn / (total_tn + total_fp) if (total_tn + total_fp) > 0 else 0

mcc_sklearn = matthews_corrcoef(y_test, y_pred)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('F1-Score: ', f1)
print('True Positive Rate: ', TPR_global)
print('False Positive Rate: ', FPR_global)
print('False Negative Rate: ', FNR_global)
print('True Negative Rate: ', TNR_global)
print('MCC: ', mcc_sklearn)
print('Training time (in seconds): ', end_time - start_time)

**CATBOOST**

In [None]:
# Distinct target labels in the training set (sorted) and how many rows carry each one.
classes, class_counts = np.unique(
    y_train,
    return_counts=True,
)
print("Clases:", classes)
print("Distribución de clases:", class_counts)

In [None]:
# 'balanced' weights, one per entry of `classes` and in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)

# Label -> weight lookup, built only for display.
class_weights_dict = {label: weight for label, weight in zip(classes, class_weights)}

print("Pesos de las clases:", class_weights_dict)

In [None]:
# CatBoost with the 'balanced' class weights computed in the previous cell
# (passed as a list ordered like the sorted class labels).
catboost_classifier = CatBoostClassifier(
    iterations=1000, 
    learning_rate=0.01, 
    depth=10, 
    random_seed=30, 
    class_weights=class_weights.tolist(),
    verbose=100
)

# Only the fit is timed.
start_time = time.time()
catboost_classifier.fit(X_train, y_train)
end_time = time.time()
y_pred = catboost_classifier.predict(X_test)

# Headline metrics; precision and F1 are macro-averaged (every class weighs the same).
accuracy = accuracy_score(y_test, y_pred)

precision = precision_score(y_test, y_pred, average='macro')

f1 = f1_score(y_test, y_pred, average='macro')

# Confusion matrix: rows are the actual classes, columns the predicted ones.
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Matriz de Confusión')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
report = classification_report(y_test, y_pred)
print("Reporte de clasificación:\n", report)

# Pool the one-vs-rest TP/FP/FN/TN counts of every class (micro-averaging).
total_tp = total_fp = total_fn = total_tn = 0

# Iterate over the classes actually present in the matrix instead of a
# hard-coded count: a fixed range raises IndexError when fewer classes occur
# and silently ignores the extra ones when more occur.
for i in range(conf_matrix.shape[0]):
    TP = conf_matrix[i, i]
    FP = conf_matrix[:, i].sum() - TP
    FN = conf_matrix[i, :].sum() - TP
    TN = conf_matrix.sum() - (TP + FP + FN)

    total_tp += TP
    total_fp += FP
    total_fn += FN
    total_tn += TN

# Pooled rates. Because every sample is a TP or FN of exactly one class,
# the pooled true positive rate equals the overall accuracy.
TPR_global = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
FPR_global = total_fp / (total_fp + total_tn) if (total_fp + total_tn) > 0 else 0
FNR_global = total_fn / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
TNR_global = total_tn / (total_tn + total_fp) if (total_tn + total_fp) > 0 else 0

mcc_sklearn = matthews_corrcoef(y_test, y_pred)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('F1-Score: ', f1)
print('True Positive Rate: ', TPR_global)
print('False Positive Rate: ', FPR_global)
print('False Negative Rate: ', FNR_global)
print('True Negative Rate: ', TNR_global)
print('MCC: ', mcc_sklearn)
print('Training time (in seconds): ', end_time - start_time)

**DECISION TREE**

In [None]:
# Train a class-weighted decision tree; only the fit is timed.
start_time = time.time()
tree_classifier = DecisionTreeClassifier(random_state=30, class_weight='balanced')
tree_classifier.fit(X_train, y_train)
end_time = time.time()
y_pred = tree_classifier.predict(X_test)

# Headline metrics; precision and F1 are macro-averaged (every class weighs the same).
accuracy = accuracy_score(y_test, y_pred)

precision = precision_score(y_test, y_pred, average='macro')

f1 = f1_score(y_test, y_pred, average='macro')

# Confusion matrix: rows are the actual classes, columns the predicted ones.
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Matriz de Confusión')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
report = classification_report(y_test, y_pred)
print("Reporte de clasificación:\n", report)

# Pool the one-vs-rest TP/FP/FN/TN counts of every class (micro-averaging).
total_tp = total_fp = total_fn = total_tn = 0

# Iterate over the classes actually present in the matrix instead of a
# hard-coded count: a fixed range raises IndexError when fewer classes occur
# and silently ignores the extra ones when more occur.
for i in range(conf_matrix.shape[0]):
    TP = conf_matrix[i, i]
    FP = conf_matrix[:, i].sum() - TP
    FN = conf_matrix[i, :].sum() - TP
    TN = conf_matrix.sum() - (TP + FP + FN)

    total_tp += TP
    total_fp += FP
    total_fn += FN
    total_tn += TN

# Pooled rates. Because every sample is a TP or FN of exactly one class,
# the pooled true positive rate equals the overall accuracy.
TPR_global = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
FPR_global = total_fp / (total_fp + total_tn) if (total_fp + total_tn) > 0 else 0
FNR_global = total_fn / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
TNR_global = total_tn / (total_tn + total_fp) if (total_tn + total_fp) > 0 else 0

mcc_sklearn = matthews_corrcoef(y_test, y_pred)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('F1-Score: ', f1)
print('True Positive Rate: ', TPR_global)
print('False Positive Rate: ', FPR_global)
print('False Negative Rate: ', FNR_global)
print('True Negative Rate: ', TNR_global)
print('MCC: ', mcc_sklearn)
print('Training time (in seconds): ', end_time - start_time)

**NAIVE BAYES**

In [None]:
# Train a Gaussian naive Bayes classifier; only the fit is timed.
start_time=time.time()
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(X_train, y_train)
end_time=time.time()
y_pred = naive_bayes_classifier.predict(X_test)

# Headline metrics; precision and F1 are macro-averaged (every class weighs the same).
accuracy = accuracy_score(y_test, y_pred)

precision = precision_score(y_test, y_pred, average='macro')

f1 = f1_score(y_test, y_pred, average='macro')

# Confusion matrix: rows are the actual classes, columns the predicted ones.
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Matriz de Confusión')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
report = classification_report(y_test, y_pred)
print("Reporte de clasificación:\n", report)

# Pool the one-vs-rest TP/FP/FN/TN counts of every class (micro-averaging).
total_tp = total_fp = total_fn = total_tn = 0

# Iterate over the classes actually present in the matrix instead of a
# hard-coded count: a fixed range raises IndexError when fewer classes occur
# and silently ignores the extra ones when more occur.
for i in range(conf_matrix.shape[0]):
    TP = conf_matrix[i, i]
    FP = conf_matrix[:, i].sum() - TP
    FN = conf_matrix[i, :].sum() - TP
    TN = conf_matrix.sum() - (TP + FP + FN)

    total_tp += TP
    total_fp += FP
    total_fn += FN
    total_tn += TN

# Pooled rates. Because every sample is a TP or FN of exactly one class,
# the pooled true positive rate equals the overall accuracy.
TPR_global = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
FPR_global = total_fp / (total_fp + total_tn) if (total_fp + total_tn) > 0 else 0
FNR_global = total_fn / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
TNR_global = total_tn / (total_tn + total_fp) if (total_tn + total_fp) > 0 else 0

mcc_sklearn = matthews_corrcoef(y_test, y_pred)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('F1-Score: ', f1)
print('True Positive Rate: ', TPR_global)
print('False Positive Rate: ', FPR_global)
print('False Negative Rate: ', FNR_global)
print('True Negative Rate: ', TNR_global)
print('MCC: ', mcc_sklearn)
print('Training time (in seconds): ', end_time - start_time)

**LGBM**

In [None]:
# Train a LightGBM classifier with default hyper-parameters; only the fit is timed.
# NOTE(review): unlike the random forest, CatBoost and decision tree cells, no
# class weighting is applied here - confirm this is intentional before comparing results.
start_time = time.time()
lgb_classifier = lgb.LGBMClassifier(random_state=30)
lgb_classifier.fit(X_train, y_train)
end_time = time.time()

y_pred = lgb_classifier.predict(X_test)

# Headline metrics; precision and F1 are macro-averaged (every class weighs the same).
accuracy = accuracy_score(y_test, y_pred)

precision = precision_score(y_test, y_pred, average='macro')

f1 = f1_score(y_test, y_pred, average='macro')

# Confusion matrix: rows are the actual classes, columns the predicted ones.
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Matriz de Confusión')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
report = classification_report(y_test, y_pred)
print("Reporte de clasificación:\n", report)

# Pool the one-vs-rest TP/FP/FN/TN counts of every class (micro-averaging).
total_tp = total_fp = total_fn = total_tn = 0

# Iterate over the classes actually present in the matrix instead of a
# hard-coded count: a fixed range raises IndexError when fewer classes occur
# and silently ignores the extra ones when more occur.
for i in range(conf_matrix.shape[0]):
    TP = conf_matrix[i, i]
    FP = conf_matrix[:, i].sum() - TP
    FN = conf_matrix[i, :].sum() - TP
    TN = conf_matrix.sum() - (TP + FP + FN)

    total_tp += TP
    total_fp += FP
    total_fn += FN
    total_tn += TN

# Pooled rates. Because every sample is a TP or FN of exactly one class,
# the pooled true positive rate equals the overall accuracy.
TPR_global = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
FPR_global = total_fp / (total_fp + total_tn) if (total_fp + total_tn) > 0 else 0
FNR_global = total_fn / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
TNR_global = total_tn / (total_tn + total_fp) if (total_tn + total_fp) > 0 else 0

mcc_sklearn = matthews_corrcoef(y_test, y_pred)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('F1-Score: ', f1)
print('True Positive Rate: ', TPR_global)
print('False Positive Rate: ', FPR_global)
print('False Negative Rate: ', FNR_global)
print('True Negative Rate: ', TNR_global)
print('MCC: ', mcc_sklearn)
print('Training time (in seconds): ', end_time - start_time)

# **ALGORITMOS NO SUPERVISADOS**

**ISOLATION FOREST**

In [None]:
# Collapse the multi-class target to binary: any label above 0 counts as an
# anomaly (1), everything else as normal (0). Stored as plain Python lists.
y_train_binary = (y_train > 0).astype(int).tolist()
y_test_binary = (y_test > 0).astype(int).tolist()

# Fit the isolation forest on the unlabelled training features; only the fit is timed.
start_time = time.time()
iso_forest = IsolationForest(contamination=0.030937, random_state=30).fit(X_train)
end_time = time.time()

# The detector marks outliers with -1; recode to 1 = anomaly, 0 = normal.
y_pred = iso_forest.predict(X_test)
y_pred_binary = np.where(y_pred == -1, 1, 0).tolist()

# Headline metrics (precision and F1 macro-averaged over the two classes).
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary, average='macro')
f1 = f1_score(y_test_binary, y_pred_binary, average='macro')

# Confusion matrix heat map (rows: actual, columns: predicted).
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Matriz de Confusión - Normal vs Anomalías')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

report = classification_report(y_test_binary, y_pred_binary)
print("Reporte de clasificación (normal vs anomalías):\n", report)

# Row-major layout of the 2x2 matrix is [[tn, fp], [fn, tp]].
tn, fp, fn, tp = conf_matrix.ravel()

tpr_iso = tp / (tp + fn)
fpr_iso = fp / (fp + tn)
if (fn + tp) > 0:
    fnr_iso = fn / (fn + tp)
else:
    fnr_iso = 0
if (tn + fp) > 0:
    tnr_iso = tn / (tn + fp)
else:
    tnr_iso = 0

mcc_sklearn = matthews_corrcoef(y_test_binary, y_pred_binary)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('F1-Score: ', f1)
print('True Positive Rate: ', tpr_iso)
print('False Positive Rate: ', fpr_iso)
print('False Negative Rate: ', fnr_iso)
print('True Negative Rate: ', tnr_iso)
print('MCC: ', mcc_sklearn)
print('Training time (in seconds): ', end_time - start_time)

**AUTOENCODER**

In [None]:
# Standardise the features; the scaler is fitted on the training set only and
# then applied unchanged to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Symmetric dense autoencoder: input -> 32 -> 16 -> 32 -> input.
input_dim = X_train_scaled.shape[1]
input_layer = Input(shape=(input_dim,))
encoder = Dense(32, activation='relu')(input_layer)
encoder = Dense(16, activation='relu')(encoder)
decoder = Dense(32, activation='relu')(encoder)
# Linear output layer: the reconstruction targets are standardised features,
# which take negative values and values above 1. A sigmoid output (range 0..1)
# can never reproduce those, which inflates the reconstruction error of
# perfectly normal rows.
output_layer = Dense(input_dim, activation='linear')(decoder)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Time the fit, as every other model cell does.
start_time = time.time()
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, shuffle=True)
end_time = time.time()

# Anomaly score = mean squared reconstruction error of each test row.
X_test_reconstructed = autoencoder.predict(X_test_scaled)
reconstruction_errors = np.mean(np.power(X_test_scaled - X_test_reconstructed, 2), axis=1)

# Flag the 5 % of test rows with the largest error as anomalies.
# NOTE(review): the cut-off is derived from the test scores themselves; consider
# deriving it from training errors instead.
threshold = np.percentile(reconstruction_errors, 95)

y_pred_binary = (reconstruction_errors > threshold).astype(int).tolist()

# y_test_binary is the binary target built in the Isolation Forest cell.
accuracy = accuracy_score(y_test_binary, y_pred_binary)

precision = precision_score(y_test_binary, y_pred_binary, average='macro')

f1 = f1_score(y_test_binary, y_pred_binary, average='macro')

# Confusion matrix heat map (rows: actual, columns: predicted).
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Matriz de Confusión - Normal vs Anomalías')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
report = classification_report(y_test_binary, y_pred_binary)
print("Reporte de clasificación (normal vs anomalías):\n", report)

tp = conf_matrix[1, 1]
fn = conf_matrix[1, 0]
fp = conf_matrix[0, 1]
tn = conf_matrix[0, 0]

tpr_auto = tp / (tp + fn)
fpr_auto = fp / (fp + tn)
fnr_auto = fn / (fn + tp) if (fn + tp) > 0 else 0
tnr_auto = tn / (tn + fp) if (tn + fp) > 0 else 0

mcc_sklearn = matthews_corrcoef(y_test_binary, y_pred_binary)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('F1-Score: ', f1)
print('True Positive Rate: ', tpr_auto)
print('False Positive Rate: ', fpr_auto)
print('False Negative Rate: ', fnr_auto)
print('True Negative Rate: ', tnr_auto)
print('MCC: ', mcc_sklearn)
print('Training time (in seconds): ', end_time - start_time)

**ONE CLASS SVM**

In [None]:
# Fit a One-Class SVM (RBF kernel) on the unlabelled training features; only the fit is timed.
start_time = time.time()
one_class_svm = OneClassSVM(kernel='rbf', gamma='scale', nu=0.1).fit(X_train)
end_time = time.time()

# The detector marks outliers with -1; recode to 1 = anomaly, 0 = normal.
y_pred = one_class_svm.predict(X_test)
y_pred_binary = np.where(y_pred == -1, 1, 0).tolist()

# y_test_binary is the binary target built in the Isolation Forest cell.
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary, average='macro')
f1 = f1_score(y_test_binary, y_pred_binary, average='macro')

# Confusion matrix heat map (rows: actual, columns: predicted).
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Matriz de Confusión - Normal vs Anomalías (One-Class SVM)')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

report = classification_report(y_test_binary, y_pred_binary)
print("Reporte de clasificación (normal vs anomalías):\n", report)

# Row-major layout of the 2x2 matrix is [[tn, fp], [fn, tp]].
tn, fp, fn, tp = conf_matrix.ravel()

tpr_svm = tp / (tp + fn)
fpr_svm = fp / (fp + tn)
if (fn + tp) > 0:
    fnr_svm = fn / (fn + tp)
else:
    fnr_svm = 0
if (tn + fp) > 0:
    tnr_svm = tn / (tn + fp)
else:
    tnr_svm = 0

mcc_svm = matthews_corrcoef(y_test_binary, y_pred_binary)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('F1-Score: ', f1)
print('True Positive Rate: ', tpr_svm)
print('False Positive Rate: ', fpr_svm)
print('False Negative Rate: ', fnr_svm)
print('True Negative Rate: ', tnr_svm)
print('MCC: ', mcc_svm)
print('Training time (in seconds): ', end_time - start_time)

**K-NEAREST NEIGHBORS**

In [None]:
# Number of neighbours used for the distance-based anomaly score.
k = 7

# Index the training features; only the fit is timed.
start_time = time.time()
knn = NearestNeighbors(n_neighbors=k).fit(X_train)
end_time = time.time()

# Anomaly score of a test row = mean distance to its k nearest training rows.
distances, indices = knn.kneighbors(X_test)
anomaly_scores = distances.mean(axis=1)

# Flag the 5 % of test rows with the largest score as anomalies.
threshold = np.percentile(anomaly_scores, 95)
y_pred_binary = (anomaly_scores > threshold).astype(int).tolist()

# y_test_binary is the binary target built in the Isolation Forest cell.
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary, average='macro')
f1 = f1_score(y_test_binary, y_pred_binary, average='macro')

# Confusion matrix heat map (rows: actual, columns: predicted).
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Matriz de Confusión - Normal vs Anomalías (KNN)')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

report = classification_report(y_test_binary, y_pred_binary)
print("Reporte de clasificación (normal vs anomalías):\n", report)

# Row-major layout of the 2x2 matrix is [[tn, fp], [fn, tp]].
tn, fp, fn, tp = conf_matrix.ravel()

tpr_knn = tp / (tp + fn)
fpr_knn = fp / (fp + tn)
if (fn + tp) > 0:
    fnr_knn = fn / (fn + tp)
else:
    fnr_knn = 0
if (tn + fp) > 0:
    tnr_knn = tn / (tn + fp)
else:
    tnr_knn = 0

mcc_knn = matthews_corrcoef(y_test_binary, y_pred_binary)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('F1-Score: ', f1)
print('True Positive Rate: ', tpr_knn)
print('False Positive Rate: ', fpr_knn)
print('False Negative Rate: ', fnr_knn)
print('True Negative Rate: ', tnr_knn)
print('MCC: ', mcc_knn)
print('Training time (in seconds): ', end_time - start_time)

**LOCAL OUTLIER FACTOR**

In [None]:
# Local Outlier Factor is fitted and scored on the test features in one step,
# so the measured time covers both fitting and prediction.
# NOTE(review): every other detector in this notebook is fitted on X_train;
# confirm that fitting LOF directly on X_test is intended.
start_time = time.time()
lof = LocalOutlierFactor(n_neighbors=20)
y_pred = lof.fit_predict(X_test)
end_time = time.time()

# The detector marks outliers with -1; recode to 1 = anomaly, 0 = normal.
y_pred_binary = np.where(y_pred == -1, 1, 0).tolist()

# y_test_binary is the binary target built in the Isolation Forest cell.
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary, average='macro', zero_division=0)
f1 = f1_score(y_test_binary, y_pred_binary, average='macro', zero_division=0)

# Confusion matrix heat map (rows: actual, columns: predicted).
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Matriz de Confusión - Normal vs Anomalías (LOF)')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

report = classification_report(y_test_binary, y_pred_binary)
print("Reporte de clasificación (normal vs anomalías - LOF):\n", report)

# Row-major layout of the 2x2 matrix is [[tn, fp], [fn, tp]].
tn, fp, fn, tp = conf_matrix.ravel()

# Every rate falls back to 0 when its denominator is empty.
actual_positives = tp + fn
actual_negatives = fp + tn
tpr_lof = tp / actual_positives if actual_positives > 0 else 0
fpr_lof = fp / actual_negatives if actual_negatives > 0 else 0
fnr_lof = fn / actual_positives if actual_positives > 0 else 0
tnr_lof = tn / actual_negatives if actual_negatives > 0 else 0

mcc_lof = matthews_corrcoef(y_test_binary, y_pred_binary)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('F1-Score: ', f1)
print('True Positive Rate: ', tpr_lof)
print('False Positive Rate: ', fpr_lof)
print('False Negative Rate: ', fnr_lof)
print('True Negative Rate: ', tnr_lof)
print('MCC: ', mcc_lof)
print('Training time (in seconds): ', end_time - start_time)

In [None]:
# ROC curves and AUC for the five unsupervised detectors.
# Every score is oriented so that a HIGHER value means "more anomalous";
# decision_function outputs are therefore negated before use.
# Note: the scalar fpr_*/tpr_* rates computed in the model cells are replaced
# here by the full ROC curve arrays of the same name.

# Isolation Forest
y_scores_iso = iso_forest.decision_function(X_test)
fpr_iso, tpr_iso, _ = roc_curve(y_test_binary, -y_scores_iso)
auc_iso = roc_auc_score(y_test_binary, -y_scores_iso)

# Autoencoder: mean squared reconstruction error per test row
reconstruction_errors = np.square(X_test_scaled - X_test_reconstructed).mean(axis=1)
fpr_auto, tpr_auto, _ = roc_curve(y_test_binary, reconstruction_errors)
auc_auto = roc_auc_score(y_test_binary, reconstruction_errors)

# One-Class SVM
y_scores_svm = one_class_svm.decision_function(X_test)
fpr_svm, tpr_svm, _ = roc_curve(y_test_binary, -y_scores_svm)
auc_svm = roc_auc_score(y_test_binary, -y_scores_svm)

# KNN: mean distance to the k nearest training rows
fpr_knn, tpr_knn, _ = roc_curve(y_test_binary, anomaly_scores)
auc_knn = roc_auc_score(y_test_binary, anomaly_scores)

# LOF: sign-flipped negative_outlier_factor_ of the fitted estimator
y_scores_lof = -lof.negative_outlier_factor_
fpr_lof, tpr_lof, _ = roc_curve(y_test_binary, y_scores_lof)
auc_lof = roc_auc_score(y_test_binary, y_scores_lof)

In [None]:
sns.set_style("darkgrid")

# (false positive rates, true positive rates, colour, line style, legend label),
# listed in drawing order.
roc_curves = [
    (fpr_iso, tpr_iso, 'blue', '-', 'Isolation Forest (AUC = %0.2f)' % auc_iso),
    (fpr_auto, tpr_auto, 'orange', '-', 'Autoencoder (AUC = %0.2f)' % auc_auto),
    (fpr_svm, tpr_svm, 'blue', '--', 'One-Class SVM (AUC = %0.2f)' % auc_svm),
    (fpr_lof, tpr_lof, 'gray', '-', 'LOF (AUC = %0.2f)' % auc_lof),
    (fpr_knn, tpr_knn, 'orange', '--', 'KNN (AUC = %0.2f)' % auc_knn),
]

fig, ax = plt.subplots(figsize=(10, 6))
for curve_fpr, curve_tpr, curve_color, curve_style, curve_label in roc_curves:
    ax.plot(curve_fpr, curve_tpr, color=curve_color, linestyle=curve_style, lw=1.5, label=curve_label)

# Diagonal reference line of a random classifier.
ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate', fontsize=10)
ax.set_ylabel('True Positive Rate', fontsize=10)
ax.set_title('Curva ROC - Comparación de Modelos')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Precision-recall curves of the five detectors, using the same
# "higher = more anomalous" scores as the ROC cell (decision_function outputs negated).
precision_iso, recall_iso, _ = precision_recall_curve(y_test_binary, np.negative(y_scores_iso))
precision_auto, recall_auto, _ = precision_recall_curve(y_test_binary, reconstruction_errors)
precision_svm, recall_svm, _ = precision_recall_curve(y_test_binary, np.negative(y_scores_svm))
precision_knn, recall_knn, _ = precision_recall_curve(y_test_binary, anomaly_scores)
precision_lof, recall_lof, _ = precision_recall_curve(y_test_binary, y_scores_lof)

In [None]:
sns.set_style("darkgrid")

# (recall values, precision values, colour, line style, legend label),
# listed in drawing order.
pr_curves = [
    (recall_iso, precision_iso, 'blue', '-', 'Isolation Forest'),
    (recall_auto, precision_auto, 'orange', '-', 'Autoencoder'),
    (recall_svm, precision_svm, 'blue', '--', 'One-Class SVM'),
    (recall_knn, precision_knn, 'orange', '--', 'KNN'),
    (recall_lof, precision_lof, 'gray', '-', 'LOF'),
]

fig, ax = plt.subplots(figsize=(10, 6))
for curve_recall, curve_precision, curve_color, curve_style, curve_label in pr_curves:
    ax.plot(curve_recall, curve_precision, color=curve_color, linestyle=curve_style, lw=1.5, label=curve_label)

ax.set_xlabel('Recall', fontsize=10)
ax.set_ylabel('Precision', fontsize=10)
ax.set_title('Precision-Recall Curves - Comparación de Modelos')
ax.legend(loc="upper right")
plt.show()