In [2]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef, precision_score, roc_auc_score, recall_score
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.neural_network import MLPClassifier
from sklearn import svm

In [3]:
fraud_df = pd.read_csv('banksim_dataset/bs140513_032310.csv')
fraud_df.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0


In [4]:
fraud_df['age'].replace({"'0'":0, "'1'":1, "'2'":2,"'3'":3, "'4'":4,"'5'":5,"'6'":6,"'U'":np.nan}, inplace=True)
fraud_df['age'].fillna(fraud_df['age'].mean(), inplace=True)

fraud_df['gender'].replace({"'M'":0,"'F'":1, "'E'":2, "'U'":2}, inplace=True)


fraud_df.drop(columns = ['zipcodeOri', 'zipMerchant'], inplace=True)

fraud_df['customer'] = fraud_df['customer'].str[1:-1]
fraud_df['merchant'] = fraud_df['merchant'].str[1:-1]
fraud_df['category'] = fraud_df['category'].str[1:-1]

fraud_np = fraud_df.to_numpy()
fraud_df.head()

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud
0,0,C1093826151,4.0,0,M348934600,es_transportation,4.55,0
1,0,C352968107,2.0,0,M348934600,es_transportation,39.68,0
2,0,C2054744914,4.0,1,M1823072687,es_transportation,26.89,0
3,0,C1760612790,3.0,0,M348934600,es_transportation,17.25,0
4,0,C757503768,5.0,0,M348934600,es_transportation,35.72,0


In [5]:
ordinal_enc = preprocessing.OrdinalEncoder()
fraud_np[:, 1] = ordinal_enc.fit_transform(fraud_np[:, 1].reshape(-1, 1)).reshape(-1)
fraud_np[:, 4] = ordinal_enc.fit_transform(fraud_np[:, 4].reshape(-1, 1)).reshape(-1)
fraud_np[:, 5] = ordinal_enc.fit_transform(fraud_np[:, 5].reshape(-1, 1)).reshape(-1)

In [6]:
X = fraud_np[:, :-1]
y = fraud_np[:, -1].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=1)

In [7]:
evaluation_funcs = [accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score]

In [8]:
# desc_tree = tree.DecisionTreeClassifier()
# desc_tree = desc_tree.fit(X_train, y_train)
# desc_tree_preds = desc_tree.predict(X_test)
# print(f"descision tree scores: {[x(desc_tree_preds, y_test) for x in evaluation_funcs]}")

In [9]:
# random_forest = RandomForestClassifier()
# random_forest = random_forest.fit(X_train, y_train)
# random_forest_preds = random_forest.predict(X_test)
# print(f"random forest scores: {[x(random_forest_preds, y_test) for x in evaluation_funcs]}")

In [10]:
# KNN = KNeighborsClassifier()
# KNN = KNN.fit(X_train, y_train)
# KNN_preds = KNN.predict(X_test)
# print(f"KNN scores: {[x(KNN_preds, y_test) for x in evaluation_funcs]}")

In [11]:
# MLP = MLPClassifier([4, 4, 4])
# MLP = MLP.fit(X_train, y_train)
# MLP_preds = MLP.predict(X_test)
# print(f"MLP scores: {[x(MLP_preds, y_test) for x in evaluation_funcs]}")

In [12]:
# SVM = svm.SVC()
# SVM = SVM.fit(X_train, y_train)
# SVM_preds = SVM.predict(X_test)
# print(f"SVM scores: {[x(SVM_preds, y_test) for x in evaluation_funcs]}")

In [34]:
all_data = fraud_np[:, :-1]
all_labels = fraud_np[:, -1].astype(int)

LOF = LocalOutlierFactor(n_neighbors=20, contamination=7200/587443)
LOF_preds = LOF.fit_predict(all_data)
LOF_map = {1:0, -1:1}
LOF_preds = [LOF_map[i] for i in LOF_preds]
print((np.count_nonzero(LOF_preds)/len(LOF_preds)), 7200/587443)
print(f"LOF scores: {[x(LOF_preds, all_labels) for x in evaluation_funcs]}")

0.012257774832967007 0.012256508291017171
LOF scores: [0.9759469126854263, 0.012916666666666667, 0.012758951845246261, 0.01283732486714059, 0.0006629333644480935, 0.5003294617914518]


In [None]:
IF = IsolationForest()
IF = IF.fit(X_train, y_train)
IF_preds = IF.predict(X_test)
print(f"IF scores: {[x(IF_preds, y_test) for x in evaluation_funcs]}")