Initializing and cleaning the [Credit Card Fraud Detection Dataset](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud?select=creditcard.csv) from download directory that was downloaded from Kaggle. 

In [2]:
import pandas as pd
import numpy as np
#file path to credit card csv file
file_path = os.path.join(os.path.expanduser("~"), "Desktop", "archive(2)", "creditcard.csv")
df = pd.read_csv(file_path) #read csv file as pandas object
CC_data = df.to_numpy() #CC_data will contain the Credit Card Fraud detection dataset as a numpy object
NaN_instances = []
#checking for NaN values in the dataset
for instance in CC_data:
    for datapoint in instance:
        if np.isnan(datapoint):
            NaN_instances.append(datapoint)
print(NaN_instances) #No NaN values so no need to clean up creditcard.csv")

[]


# Local Outlier Factor (LOF)
We will be using the local outlier factor algorithm to detect anomalies in our Credit Card Fraud Detection Dataset. To do this, we will be following these steps: 
<strong>distance:</strong>
- Define a Local Outlier Factor class that will hold data about a dataset such as the number of neighbors, distance measurement type, information about the datapoints' neighbors, and an array of local reachability factors for each datapoint. 

<strong>distance:</strong>
- Define a Euclidean distance function to calculate distance for use in KNN algorithm functions.

<strong>kNeighbors:</strong>
- Gets the k closest neighbors of a query point. Returns a tuple (distance, index) of the k closest neighbors. 
- The last element in the returned list is the kDistance.

<strong>reachabilityDistance:</strong>
- Compares the distance between a query point and the comparison point and the kDistance of the comparison point. 
- The kDistance of the comparison point is the distance between the comparison point and its kth neighbor
- Equation<br />

Reachability Distance = max(Distance from Point A to Point B, Distance to Kth Neighbor of Point B)

<strong>localReachabilityDensity:</strong>
- Define a Local Reachability Distance function for our LOF algorithm.
- Takes in a list tuples describing the neighbors of a query point that are stuctured like (distanceToNeighbor, index of neighbor). 
- Calculates the density of local datapoints points based on the distances of the data points surrounding it.
- Equation<br />

Reachability Density = 1 / Average Reachability Distance of Neighbors of a Point

<strong>localOutlierFactor:</strong>
- Define a Local Outlier Factor function to find the LOF values for query points. 
- Equation<br />

Local Outlier Factor = Average Reachability Density of Neighbors of a Point / Reachability Density of Point

<strong>getChunks:</strong>
- Splits up dataset for parallelization

<strong>parallel_method_call:</strong>
- Method that is called so that the kNeighbors calculations for each datapoint can be done in parallel

<strong>createOutlierFactor:</strong>
- Utilizes parallelization to find outlier factors for all datapoints. 

<strong>findThreshold:</strong>
- Finds the best Threshold according to the F1 score 




In [8]:
feature_CC= CC_data[0:1100]
fraud_instances = []
for i in range(len(feature_CC)):
    if feature_CC[i][-1] == 1:
        fraud_instances.append(i)
feature_CC = np.array([arr[:-1] for arr in feature_CC])  

In [13]:
import math
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from joblib import Parallel, delayed

class Local_Outlier_Factor:
    def __init__(self, n_neighbors, metric, datasize):
        self.k = n_neighbors
        self.distanceType = metric
        self.lofScores = []
        self.threshold = None
        self.neighborInfo = [0] * datasize  # contains list of tuples with information about neighbors for each datapoint [(neigbor1Distance, neighbor1Index), 
                                            # (neigbor2Distance, neighbor2Index) ...]

    def distance(self, a, b):
        if self.distanceType == "euclidean":
            try: 
                s = 0
                if type(a) != type(b) or len(a) != len(b):
                    raise Exception("The inputs a and b need to be same type and equal length") 
                for i in range(len(a)):
                    s += (a[i] - b[i])**2
                return math.sqrt(s)
            except Exception as e:
                print(f"Exception: {e}")
        elif self.distanceType == "manhattan":
            try: 
                s = 0
                if type(a) != type(b) or len(a) != len(b):
                    raise Exception("The inputs a and b need to be same type and equal length") 
                for i in range(len(a)):
                    s += abs(a[i] - b[i])
                return s
            except Exception as e:
                print(f"Exception: {e}")
        else:
            raise Exception("Please input a valid distance type into the LocalOutlierFactor object.")

    def kNeighbors(self, datapoints, query):
        neighbors = [0] * (self.k+1)
        neighbor = 0
        worstBest = float('inf')
        for i in range(len(datapoints)):
            d = self.distance(datapoints[i], query)
            if d < worstBest:
                if neighbor < self.k+1:
                    neighbors[neighbor] = (d, i)
                    neighbor += 1
                else:
                    maximum = float('-inf')
                    maxIndex = 0
                    for j in range(len(neighbors)):
                        if neighbors[j][0] > maximum:
                            maximum = neighbors[j][0]
                            maxIndex = j
                    worstBest = maximum
                    if d < worstBest:
                        neighbors[maxIndex] = (d, i)
        neighbors.sort()
        return neighbors[1:]

    def reachabilityDistance(self, distance, kDistance):
        return max(distance, kDistance)

    def localReachabilityDensity(self, datapoints, neighbors):
        foundNeighbors = set()
        reachabilityDistanceSum = 0
        # Calculates local reachability density
        for i in range(len(neighbors)):
            distanceToNeighbor = neighbors[i][0]
            neighborIndex = neighbors[i][1]
            reachabilityDistanceSum += self.reachabilityDistance(distanceToNeighbor, self.neighborInfo[neighborIndex][-1][0])
        avgReachabilityDistance = reachabilityDistanceSum / self.k
        return 1.0 / (avgReachabilityDistance + 1e-10)

    def localOutlierFactor(self, datapoints, query, neighbors):
        lrdQuery = self.localReachabilityDensity(datapoints, self.neighborInfo[query])
        
        lrdNeighborsSum = 0
        for i in range(len(neighbors)):
            neighborIndex = neighbors[i][1]
            lrdNeighborsSum += self.localReachabilityDensity(datapoints, self.neighborInfo[neighborIndex])
        avgLRDNeighbors = lrdNeighborsSum / self.k
        
        LOF = avgLRDNeighbors / lrdQuery
        return LOF
    
    # returns start and end indices of chunks of data
    # ex. [[0, 10], [10, 20]] if the number of datapoints is 20 and 2 chunks are needed
    def getChunks(self, numDatapoints, numChunks):
        chunkSize = numDatapoints//numChunks
        res = []
        startIndex = 0
        if numDatapoints%numChunks == 0: 
            while startIndex < numDatapoints:
                res.append([startIndex, startIndex+chunkSize])
                startIndex += chunkSize
        else:
            while startIndex < numDatapoints-chunkSize:
                res.append([startIndex, startIndex+chunkSize])
                startIndex += chunkSize
            res[-1][1] = numDatapoints
        return res
        
    def parallel_method_call(self, datapoints, indices):
        res = []
        for i in range(indices[0], indices[1]):
            res.append([i, self.kNeighbors(datapoints, datapoints[i])])
        return res
        
    def createOutlierFactor(self, datapoints, cores = 1):
        chunkIndices = self.getChunks(len(datapoints), cores) # indices of data for each core
        
        # Calculates information for each neighbor based on the n_neighbors specified within the Local_Outlier_Factor object.
        results = Parallel(n_jobs=-1)(delayed(self.parallel_method_call)(datapoints, chunkIndices[i]) for i in range(len(chunkIndices)))
        
        for i in range(len(results)):
            for j in range(len(results[i])):
                self.neighborInfo[results[i][j][0]] = results[i][j][1]
        
        # Calculates the local reachability factors for each datapoint and puts it within a list
        for i in range(len(datapoints)):
            self.lofScores.append(self.localOutlierFactor(datapoints, i, self.neighborInfo[i]))
        return np.array(self.lofScores)
    
    def findThreshold(self, trueLabels):
        if not self.lofScores:
            raise Exception("Please calculate local outlier factors first before trying to find a threshold value for them.")
        highestLOF = max(self.lofScores)
        curThreshold = min(self.lofScores)
        bestThresholdF1 = float('-inf')
        bestThreshold = curThreshold
        while bestThreshold < highestLOF:
            predicted = []
            for lof in self.lofScores:
                if lof < curThreshold:
                    predicted.append(1)
                else:
                    predicted.append(0)
            tn, fp, fn, tp = confusion_matrix(trueLabels, predicted).ravel()
            precision = tp/(tp+fp)
            recall = tp/(tp+fn)
            f1Score = 2 * ((precision * recall) / (precision + recall))
            if f1Score > bestThresholdF1:
                bestThresholdF1 = f1Score
                bestThreshold = curThreshold
            curThreshold += 0.1
        self.threshold = bestThreshold
    
    def predict(self, datapoints, newData, cores = 1):
        if newData:
            self.lofScores = []
            self.neighborInfo = [0] * len(datapoints)
            self.createOutlierFactor(datapoints, cores)
            predicted = []
            for lof in self.lofScores:
                if lof > self.threshold:
                    predicted.append(1)
                else:
                    predicted.append(0)
            return predicted
        else:
            predicted = []
            for lof in self.lofScores:
                if lof > self.threshold:
                    predicted.append(1)
                else:
                    predicted.append(0)
            return predicted
        
scikitLOF = LocalOutlierFactor(n_neighbors=5, metric="euclidean")
scikitLOF.fit_predict(feature_CC)
print(scikitLOF.negative_outlier_factor_)

myLOF = Local_Outlier_Factor(5, "euclidean", len(feature_CC))
print(myLOF.createOutlierFactor(feature_CC, 10))

[-1.14947773 -1.12016309 -1.60504217 ... -1.05122482 -1.09622799
 -1.07488108]
[1.14947773 1.12016309 1.60504217 ... 1.05122482 1.09622799 1.07488108]


In [14]:
print(np.mean([myLOF.lofScores[index] for index in fraud_instances]))
print(np.mean([myLOF.lofScores[index] for index in range(len(feature_CC))]))

1.0794735682820022
1.0767394204335707


In [16]:
from sklearn.metrics import confusion_matrix

# Getting predicted labels
y_pred = []
for lof in scikitLOF.negative_outlier_factor_:
    if lof < -1.7:
        y_pred.append(1)
    else:
        y_pred.append(0)

y_true = np.array(df["Class"][0:1100])

print(fraud_instances)
print(confusion_matrix(y_true, y_pred))
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)
print("Precision: ", tp/(tp+fp))
print("Recall: ", tp/(tp+fn))

[541, 623]
[[1091    7]
 [   2    0]]
True Negative:  1091
False Positive:  7
False Negative:  2
True Positive:  0
Precision:  0.0
Recall:  0.0


In [18]:
# Threshold search
threshold = 0.9
y_true = np.array(df["Class"][0:1100])
maxPrecision = 0
maxPrecisionThreshold = 0
maxRecall = 0
maxRecallThreshold = 0
while threshold < 1.5:
    y_pred = []
    for lof in myLOF.lofScores:
        if lof > 1.0:
            y_pred.append(1)
        else:
            y_pred.append(0)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    if tp/(tp+fp) > maxPrecision:
        maxPrecision = tp/(tp+fp)
        maxPrecisionThreshold = threshold
    if tp/(tp+fn) > maxRecall:
        maxRecall = tp/(tp+fn)
        maxRecallThreshold = threshold
    threshold += 0.1
print(maxPrecision)
print(maxPrecisionThreshold)
print(maxRecall)
print(maxRecallThreshold)

0.0012547051442910915
0.9
0.5
0.9
