Loading and cleaning the [Credit Card Fraud Detection Dataset](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud?select=creditcard.csv), which was downloaded from Kaggle into the local Downloads directory.

In [3]:
import pandas as pd
import numpy as np

import os  # needed for the path helpers used below

# Path to the Kaggle credit card CSV inside the current user's Downloads folder
file_path = os.path.join(os.path.expanduser("~"), "Downloads", "creditcard.csv")
df = pd.read_csv(file_path)  # read the CSV file into a pandas DataFrame
CC_data = df.to_numpy()  # the Credit Card Fraud Detection dataset as a NumPy array

# Agglomerative Hierarchical Clustering

<strong>Main Idea:</strong> 
- Continuously merge clusters that are similar to each other
- Once there are no clusters that are similar to one another, observe each defined cluster
- Clusters with very few datapoints are most likely to be deemed anomalies

<strong>Similar Clusters:</strong>
- The similarity between clusters will be defined as the distance between clusters
- There will be three different methods of calculating the distance between clusters: Single Linkage, Complete Linkage, Average Linkage
- Single Linkage: the distance between two clusters is defined as the smallest distance between two points in each cluster
- Complete Linkage: the distance between two clusters is defined as the longest distance between two points in each cluster
- Average Linkage: the distance between clusters is defined as the average distance between each point in one cluster to every point in the other cluster
- The two clusters with the smallest distance between them will be deemed most similar

<strong>Threshold Value:</strong>
- Now that we have our most similar clusters, we want to merge them only if the distance between them is less than a given threshold value
- We add this condition because eventually, the closest cluster might be an anomaly cluster, so we want to prevent merging with the anomaly cluster
- So if the algorithm ever reaches a point where the most similar cluster is extremely far away (at or beyond our threshold value), we break out of the loop
       
<strong>Finding Outliers:</strong>
- We now have all our clusters
- We iterate through our clusters, and we know that if a cluster is big, then that cluster has homogeneous datapoints
- So clusters with a small size are more likely to be an anomaly
- In our algorithm, we say clusters with size 1 will be deemed an anomaly

  
  
<strong>Pseudocode:</strong><br>
Initialize n clusters in C (n is size of training set)<br>
while number of clusters is greater than 1:<br>
>for each cluster i in C:
>>for each cluster j in C after the current cluster i:
>>>find the distance between cluster i and j
>>>if distance is smallest we've calculated and is less than the threshold distance:
>>>>store the indexes of the cluster  

>if no indexes are stored, we know that closest distance does not meet threshold:
>>break


>merge the two clusters with closest distance

cluster sizes that are 1 can be deemed outliers

<strong>Hyperparameters:</strong>
- Threshold
- Distance Metric
- Linkage


In [23]:
import math
class AgglomerativeCluster:
    """Threshold-limited agglomerative clustering that flags singleton clusters.

    Every datapoint starts in its own cluster.  The two closest clusters are
    merged repeatedly while their distance stays strictly below ``threshold``.
    Clusters still holding a single datapoint once merging stops are reported
    as anomalies.

    Attributes:
        data: Indexable collection of datapoints; each datapoint is an
            indexable sequence of numeric features.
        distance_metric: ``"euclidian"`` (``"euclidean"`` is accepted too),
            ``"chebyshev"`` or ``"manhattan"``.
        linkage: ``"single"``, ``"complete"`` or ``"average"``.
        threshold: Merging stops once the closest pair of clusters is at
            least this far apart.
        predictions: Result of the most recent ``find_clusters`` call.
    """

    def __init__(self, data, distance_metric="euclidian", linkage="single", threshold=5500):
        self.distance_metric = distance_metric
        self.linkage = linkage
        self.data = data
        self.threshold = threshold
        self.predictions = []

    def find_clusters(self):
        """Cluster ``self.data`` and return the singleton clusters.

        Returns:
            A list of sets.  Each set holds the index (into ``self.data``) of
            one datapoint that was never merged with another cluster.  The
            same list is stored on ``self.predictions``.

        Raises:
            ValueError: If ``linkage`` or ``distance_metric`` is unsupported.
        """
        # Start with one cluster per datapoint; clusters hold indices into self.data.
        clusters = [{i} for i in range(len(self.data))]
        while len(clusters) > 1:
            min_distance = float('inf')  # distance of the closest pair seen in this pass
            closest_pair = None  # (i, j) of that pair; stays None if nothing is in range
            # All pairwise cluster distances are recomputed from scratch on every pass.
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    cur_distance = self.distance_cluster(clusters, clusters[i], clusters[j])
                    # Strict comparisons: ties keep the first pair found, and a pair
                    # exactly at the threshold is not merged.
                    if cur_distance < min_distance and cur_distance < self.threshold:
                        closest_pair = (i, j)
                        min_distance = cur_distance
            # No pair was closer than the threshold, so clustering is finished.
            if closest_pair is None:
                break
            # Merge the two most similar clusters.  absorbed > kept always holds,
            # so deleting the absorbed cluster does not shift the kept one.
            kept, absorbed = closest_pair
            clusters[kept].update(clusters[absorbed])
            del clusters[absorbed]
        # Every cluster that still has a single datapoint is considered an anomaly.
        predictions = [cluster for cluster in clusters if len(cluster) <= 1]
        self.predictions = predictions
        return predictions

    def distance_cluster(self, clusters, cluster1, cluster2):
        """Return the distance between two clusters under ``self.linkage``.

        Args:
            clusters: The full cluster list.  Unused; kept so existing callers
                keep working.
            cluster1: Set of datapoint indices.
            cluster2: Set of datapoint indices.

        Returns:
            The smallest (single), largest (complete) or mean (average)
            pairwise distance between the members of the two clusters.

        Raises:
            ValueError: If ``self.linkage`` is not a supported linkage.
        """
        # Smallest distance between any two members.
        if self.linkage == "single":
            minDistance = float('inf')
            for datapoint1 in cluster1:
                for datapoint2 in cluster2:
                    curDistance = self.find_distance(datapoint1, datapoint2)
                    if curDistance < minDistance:
                        minDistance = curDistance
            return minDistance
        # Largest distance between any two members.
        if self.linkage == "complete":
            maxDistance = float('-inf')
            for datapoint1 in cluster1:
                for datapoint2 in cluster2:
                    curDistance = self.find_distance(datapoint1, datapoint2)
                    if curDistance > maxDistance:
                        maxDistance = curDistance
            return maxDistance
        # Mean distance over every cross-cluster pair.
        if self.linkage == "average":
            totalDistance = 0
            for datapoint1 in cluster1:
                for datapoint2 in cluster2:
                    totalDistance += self.find_distance(datapoint1, datapoint2)
            return totalDistance / (len(cluster1) * len(cluster2))
        # Fail loudly instead of returning None and breaking the comparison
        # in find_clusters with an unrelated TypeError.
        raise ValueError(
            f"Unsupported linkage {self.linkage!r}; "
            "expected 'single', 'complete' or 'average'"
        )

    def find_distance(self, datapoint1, datapoint2):
        """Return the distance between two datapoints under ``self.distance_metric``.

        Args:
            datapoint1: Index of the first datapoint in ``self.data``.
            datapoint2: Index of the second datapoint in ``self.data``.

        Raises:
            ValueError: If ``self.distance_metric`` is not a supported metric.
        """
        first = self.data[datapoint1]
        second = self.data[datapoint2]
        n_features = len(first)
        # Euclidean distance; the historical spelling "euclidian" remains the default.
        if self.distance_metric in ("euclidian", "euclidean"):
            acc = 0
            for f in range(n_features):
                acc += (second[f] - first[f]) ** 2
            return math.sqrt(acc)
        # Chebyshev distance: the largest absolute difference over all features.
        if self.distance_metric == "chebyshev":
            acc = 0
            for f in range(n_features):
                acc = max(acc, abs(second[f] - first[f]))
            return acc
        # Manhattan distance: the sum of absolute differences over all features.
        if self.distance_metric == "manhattan":
            acc = 0
            for f in range(n_features):
                acc += abs(second[f] - first[f])
            return acc
        raise ValueError(
            f"Unsupported distance metric {self.distance_metric!r}; "
            "expected 'euclidian', 'chebyshev' or 'manhattan'"
        )

    def recall(self, outliers, predictions):
        """Return the fraction of true ``outliers`` that appear in ``predictions``.

        Membership uses ``in``, so the items need not be hashable.
        Returns 0.0 when ``outliers`` is empty.
        """
        total = 0
        TruePositive = 0
        for i in outliers:
            total += 1
            if i in predictions:
                TruePositive += 1
        return TruePositive / total if total else 0.0

    def precision(self, outliers, predictions):
        """Return the fraction of ``predictions`` that are true ``outliers``.

        Membership uses ``in``, so the items need not be hashable.
        Returns 0.0 when ``predictions`` is empty.
        """
        total = 0
        TruePositive = 0
        for i in predictions:
            total += 1
            # A prediction is a true positive only when it is a real outlier.
            if i in outliers:
                TruePositive += 1
        return TruePositive / total if total else 0.0

    def f1Score(self, outliers, predictions):
        """Return the harmonic mean of precision and recall.

        Returns 0.0 when both precision and recall are zero.
        """
        recall = self.recall(outliers, predictions)
        precision = self.precision(outliers, predictions)
        if precision + recall == 0:
            return 0.0
        return 2 * (precision * recall) / (precision + recall)
            
            

<strong>Parallelization:</strong>
- Because the Agglomerative Clustering algorithm has a time complexity of O(n^3), we split the data into smaller partitions and run one model per partition in parallel

In [25]:
#function that will be called in parallel
from joblib import Parallel, delayed
def parallel_method_call(model):
    """Run ``find_clusters`` on ``model`` and hand back whatever it returns.

    Exists as a plain module-level function so it can be wrapped with
    ``delayed`` and dispatched by ``Parallel``.
    """
    predicted_anomalies = model.find_clusters()
    return predicted_anomalies


n_cores = 10  # number of data partitions, i.e. models that are run side by side
chunk_size = 10000 // n_cores  # number of rows sampled for each model
all_models = []  # will keep all Agglomerative Clustering models
partitioned_data = []  # sampled rows (label column included) per model, index-aligned with 'all_models'

for i in range(n_cores):
    # NOTE(review): rows are drawn with replacement, so a chunk may contain the
    # same transaction more than once -- confirm this is intended.
    feature_CC = CC_data[np.random.choice(CC_data.shape[0], chunk_size, replace=True)]
    original = feature_CC
    partitioned_data.append(original)
    # Drop the last column (the label) so the model only sees the features.
    feature_CC = feature_CC[:, :-1]
    model = AgglomerativeCluster(feature_CC, linkage="average", distance_metric="chebyshev")
    all_models.append(model)

# outliers will contain the indices of the true outliers of the dataset:
# in each (i, j) pair, i is the model and j is the row inside model i's chunk
outliers = []
for i, partition in enumerate(partitioned_data):
    for j, row in enumerate(partition):
        # a last-column value of 1 marks the row as a true outlier
        if row[-1] == 1:
            outliers.append((i, j))
# call find_clusters for each model in parallel and store the results in 'results'
# results will contain the indices of the predicted anomalies for each model
# NOTE(review): n_jobs=-1 lets joblib use every available core, independent of n_cores.
results = Parallel(n_jobs=-1)(delayed(parallel_method_call)(model) for model in all_models)
print(results)
print(outliers)


KeyboardInterrupt: 

Calculating Recall and Precision

In [29]:
def calc_precision(results, outliers):
    """Print and return the precision of the predicted anomalies.

    Args:
        results: One entry per model; each entry is a list of predicted
            anomaly clusters (collections of datapoint indices).  Only the
            first index of each cluster is checked.  Empty entries are
            skipped.  ``results`` is not modified.
        outliers: Hashable ``(model_index, datapoint_index)`` pairs for the
            true outliers.

    Returns:
        TruePositive / (TruePositive + FalsePositive), or 0.0 when nothing
        was predicted at all.
    """
    outlier_lookup = set(outliers)  # O(1) membership instead of scanning a list
    TruePositive = 0
    FalsePositive = 0
    for i, model_predictions in enumerate(results):
        if not model_predictions:
            continue
        for cluster in model_predictions:
            # Read the first index without converting the caller's set in place.
            datapoint = list(cluster)[0]
            if (i, datapoint) in outlier_lookup:
                TruePositive += 1
            else:
                FalsePositive += 1
    predicted = TruePositive + FalsePositive
    # Without any prediction the ratio is undefined; report 0.0 instead of crashing.
    res = TruePositive / predicted if predicted else 0.0
    print("Precision: " + str(res))
    return res
    
    

def calc_recall(results, outliers):
    """Print and return the recall of the predicted anomalies.

    Args:
        results: One entry per model; each entry is a list of predicted
            anomaly clusters (collections of datapoint indices).  Only the
            first index of each cluster is checked.  Empty entries are
            skipped.  ``results`` is not modified.
        outliers: Hashable ``(model_index, datapoint_index)`` pairs for the
            true outliers.

    Returns:
        TruePositive / len(outliers), or 0.0 when there are no true outliers.
    """
    outlier_lookup = set(outliers)  # O(1) membership instead of scanning a list
    true_positive = 0
    for i, model_predictions in enumerate(results):
        if not model_predictions:
            continue
        for cluster in model_predictions:
            # Read the first index without converting the caller's set in place.
            datapoint = list(cluster)[0]
            if (i, datapoint) in outlier_lookup:
                true_positive += 1
    # Recall = TP / (TP + FN); TP + FN is the number of true outliers.
    res = true_positive / len(outliers) if len(outliers) else 0.0
    print("Recall: " + str(res))
    return res

# Print the precision of the predicted anomalies against the labelled outliers
calc_precision(results, outliers)

Precision: 0.0
