In [5]:
import pandas as pd
import numpy as np

In [6]:
# Imports Local Outlier Factor code
%run Pipeline_Code.ipynb

In [48]:
#file path to credit card csv file
file_path = os.path.join(os.path.expanduser("~"), "Desktop", "archive(2)", "creditcard.csv")
df = pd.read_csv(file_path) #read csv file as pandas object
CC_data = df.to_numpy() #CC_data will contain the Credit Card Fraud detection dataset as a numpy object

#Getting a features-only dataset
features = CC_data[0:5000]
features = np.array([arr[:-1] for arr in features])  

labels = np.array(df["Class"][0:5000]) # labels

fraud_instances = []
for i in range(len(features)):
    if features[i][-1] == 1:
        fraud_instances.append(i)
        
features2 = CC_data[5000:10000]
features2 = np.array([arr[:-1] for arr in features])  

labels2 = np.array(df["Class"][5000:10000]) # labels

In [49]:
lof = Local_Outlier_Factor(5, "euclidean", len(features))
lof.createOutlierFactor(features, 5)
lof.findThreshold(labels)
print(lof.threshold)
predictedLabels = lof.predict(features, False, 10)
f1Score = lof.matrixScores(lof.confusionMatrix(labels, predictedLabels), "f1score")
print(f1Score)
print(lof.confusionMatrix(labels, predictedLabels))
predictedLabelsTest = lof.predict(features2, True, 10)
testf1Score = lof.matrixScores(lof.confusionMatrix(labels2, predictedLabelsTest), "f1score")
print(testf1Score)
print(lof.confusionMatrix(labels2, predictedLabelsTest))

1.3459768096420501
0
[[4596  401]
 [   3    0]]
0.012396694214876032
[[4519  446]
 [  32    3]]


In [15]:
import math
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from numpy import random


# tuning is a function that splits the dataset into 4 folds and returns the predicted labels and actual labels for both the training and testing sets
# n times if n is the number of parameters. 
# parameters: data - feature values of dataset, labels - label values of dataset
# return: list of tuples containing fold, train data predicted and actual labels, and test data predicted and actual labels
def tuning3(features, labels):
    res = []
    skf = StratifiedKFold(n_splits=4)
    splits = skf.split(X=features, y=labels)
    
    for i, (train_index, test_index) in enumerate(splits):
        trainFeatures = [features[index] for index in train_index]
        trainLabels = [labels[index] for index in train_index]
        
        testFeatures = [features[index] for index in test_index]
        testLabels = [labels[index] for index in test_index]
        
        # Will return the predicted and actual labels for both training and testing sets
        lof = Local_Outlier_Factor(5, "euclidean", len(features))
        lof.createOutlierFactor(trainFeatures, 10)
        lof.findThreshold(trainLabels)
        predictedTrainLabels = lof.predict(trainFeatures, False, 10)
        predictedTestLabels = lof.predict(testFeatures, True, 10)

        res.append([i, (predictedTrainLabels, trainLabels), (predictedTestLabels, testLabels)])
    return res
   

    # runPipeline loads specified dataset in and passes the dataset into another function which will run ML classifiers on the dataset
    # return: None, prints out results from classifiers which are run through k-fold cross-validation and hyperparameter tuning
def pipeline3(dataSetName, classifier):
    if dataSetName == "credit_card":
        #file path to credit card csv file
        file_path = os.path.join(os.path.expanduser("~"), "Desktop", "archive(2)", "creditcard.csv")
        df = pd.read_csv(file_path) #read csv file as pandas object
        
        #Randomly gets 80000 examples out of the 284807 available examples
        randomIndices = random.randint(284807, size=(1000))
        data = df.take(randomIndices)

        data = np.array(data)
        
        fraud_instances = []
        for i in range(len(data)):
            if data[i][-1] == 1:
                fraud_instances.append(i)
        
        #prints out the number of fraud instances so we can ensure enough positive examples are in the dataset
        print("Number of positive examples: ", len(fraud_instances))
        
        # Gets features
        features = np.array([arr[:-1] for arr in data])
        
        #Gets labels
        labels = np.array(df["Class"][randomIndices])
    else: 
        print("Invalid data set, use one of the following data sets:")
        return

    # Local Outlier Factor Classifier
    if classifier == "local_outlier_factor":
        results = tuning3(features, labels)
        return results


In [16]:
results2 = pipeline3("credit_card", "local_outlier_factor")

Number of positive examples:  2




In [17]:
print(results2)

[[0, ([1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 