In [1]:
import numpy as np
import pandas as pd
import seaborn as sns


from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef, precision_score, roc_auc_score, recall_score, average_precision_score
from tqdm import tqdm
from matplotlib import pyplot as plt

In [2]:
# Read the train/test label CSVs (no header row) and flatten each one
# into a 1-D NumPy array.
y_train, y_test = (
    pd.read_csv(csv_path, header=None).to_numpy().ravel()
    for csv_path in (
        '../fraud_detection/active_datasets/y_train.csv',
        '../fraud_detection/active_datasets/y_test.csv',
    )
)

In [3]:
# Read the train/test feature CSVs (no header row) as 2-D NumPy arrays.
X_train, X_test = (
    pd.read_csv(csv_path, header=None).to_numpy()
    for csv_path in (
        '../fraud_detection/active_datasets/fff_train.csv',
        '../fraud_detection/active_datasets/fff_test.csv',
    )
)

In [4]:
# Test-set sanity check: (number of positive labels, number of samples, positive rate).
# ndarray reductions replace the builtin sum(), which walked the array
# element by element in Python and was evaluated twice.
y_test.sum(), len(y_test), y_test.mean()

(1750, 148661, 0.011771749147389026)

In [5]:
# Fraction of positive labels in the training set.
# ndarray.mean() gives the same value as sum(y)/len(y) without looping
# over the array in Python.
train_fraud_rate = y_train.mean()

In [6]:
# Size each batch so that, at the training fraud rate, it is expected to
# contain about TARGET_FRAUDS_PER_BATCH positive labels.
TARGET_FRAUDS_PER_BATCH = 20
# Deliberately left as a float: the test-split cell floor-divides by this
# same value, so rounding it here would change that result.
batch_size = TARGET_FRAUDS_PER_BATCH / train_fraud_rate

# Floor division by a float yields a float; np.array_split documents an
# integer section count, so cast explicitly instead of relying on coercion.
batch_num = int(len(X_train) // batch_size) + 1
print(batch_num)

273.0


In [7]:
# Cut features and labels into the same number of near-equal batches so
# that batch k of X lines up with batch k of y.
X_training_batches, y_training_batches = (
    np.array_split(arr, batch_num) for arr in (X_train, y_train)
)

# Average number of positive labels per training batch.
positives_per_batch = [labels.sum() for labels in y_training_batches]
np.mean(positives_per_batch)

19.963369963369964

In [8]:
# Fit one LocalOutlierFactor novelty detector per training batch; the
# fitted models together form the ensemble used for prediction.
ensemble = []

# Iterating the list directly (rather than enumerate(...)) lets tqdm read
# its length and show a total and ETA. Labels are not used for fitting.
for X_batch in tqdm(X_training_batches):
    # contamination reuses the training fraud rate computed once earlier
    # instead of re-summing y_train on every iteration.
    lof = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=train_fraud_rate)
    lof.fit(X_batch)
    ensemble.append(lof)


273it [00:20, 13.14it/s]


In [9]:
# Split the test set using the same target batch size as the training split.
# Floor division by the float batch_size yields a float; cast it so
# np.array_split receives the integer section count it documents.
# NOTE(review): this rebinds batch_num, so re-running the training split
# cell after this one would split the training data by the test batch count.
batch_num = int(len(X_test) // batch_size) + 1

X_testing_batches = np.array_split(X_test, batch_num)
y_testing_batches = np.array_split(y_test, batch_num)

# Average number of positive labels per test batch.
np.mean([y.sum() for y in y_testing_batches])

19.23076923076923

In [10]:
# Majority vote of the ensemble, computed one test batch at a time.
batch_flags = []

for X_batch in tqdm(X_testing_batches):
    # Each model's predict() labels a sample +1 (inlier) or -1 (outlier);
    # stacking gives one row per model and one column per sample.
    votes = np.array([lof.predict(X_batch) for lof in ensemble])
    # A negative mean vote means more models said "outlier" than "inlier".
    batch_flags.append(votes.mean(axis=0) < 0)

# Keep the result as one boolean ndarray rather than a Python list of
# scalars: the trailing expression then shows NumPy's truncated repr
# instead of printing every element.
preds = np.concatenate(batch_flags)
preds

 68%|██████▊   | 62/91 [23:40<38:15, 79.14s/it]   

In [None]:
# Metrics that compare hard predicted labels against the true labels.
evaluation_funcs = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
    'MCC': matthews_corrcoef,
}
scores = {
    metric_name: metric_fn(y_true=y_test, y_pred=preds)
    for metric_name, metric_fn in evaluation_funcs.items()
}

# NOTE(review): preds holds the thresholded vote (booleans), so these two
# ranking metrics are evaluated on a binary score rather than a continuous
# one — confirm this is intended.
scores['ROC-AUC'] = roc_auc_score(y_true=y_test, y_score=preds)
scores['AUPR'] = average_precision_score(y_true=y_test, y_score=preds)
scores

{'accuracy': 0.9774991423439907,
 'precision': 0.028942705256940343,
 'recall': 0.028,
 'f1': 0.028463549230322394,
 'MCC': 0.017086866769703047,
 'ROC-AUC': 0.508404775680514,
 'AUPR': 0.012252535918456462}

In [None]:
# Number of test samples the ensemble flagged (True entries in preds).
np.sum(preds)

1693