# In [1]:
import math
class AgglomerativeCluster:
    """Find outliers with threshold-limited agglomerative clustering.

    Every datapoint starts in its own cluster.  The two closest clusters are
    merged repeatedly for as long as their linkage distance is strictly below
    ``threshold``.  Datapoints that are still alone in their cluster once no
    more merges are possible are reported as outliers.

    Datapoints are referred to by their index into ``data``; clusters are
    sets of such indices.
    """

    def __init__(self, data, distance_metric="euclidian", linkage="single", threshold=4):
        """Store the dataset and the clustering configuration.

        Args:
            data: Indexable collection of datapoints; each datapoint is an
                indexable collection of numeric features.
            distance_metric: ``"euclidian"`` (``"euclidean"`` is accepted as
                an alias), ``"chebyshev"`` or ``"manhattan"``.
            linkage: ``"single"``, ``"complete"`` or ``"average"``.
            threshold: Two clusters are merged only while their linkage
                distance is strictly less than this value.
        """
        self.distance_metric = distance_metric
        self.linkage = linkage
        self.data = data
        self.threshold = threshold
        self.outliers = []

    def find_clusters(self):
        """Run the clustering and return the single-point clusters.

        Returns:
            A list of sets, each holding the index of one datapoint that was
            never merged with any other.  The same list is stored in
            ``self.outliers``.

        Raises:
            ValueError: If the configured metric or linkage is not supported
                (raised by the distance helpers on first use).
        """
        # Start with one cluster per datapoint.
        clusters = [{i} for i in range(len(self.data))]
        while len(clusters) > 1:
            min_distance = float("inf")
            # None means "no pair within the threshold found yet"; an explicit
            # marker is clearer than relying on an impossible index pair.
            closest_pair = None
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    cur_distance = self.distance_cluster(clusters, clusters[i], clusters[j])
                    # Strict comparisons: ties keep the first pair found, and
                    # a distance equal to the threshold does not merge.
                    if cur_distance < min_distance and cur_distance < self.threshold:
                        closest_pair = (i, j)
                        min_distance = cur_distance
            if closest_pair is None:
                # Every remaining pair is at least `threshold` apart.
                break
            keep, absorb = closest_pair
            # absorb > keep, so deleting it does not shift the kept cluster.
            clusters[keep].update(clusters[absorb])
            del clusters[absorb]
        # Clusters that still hold a single datapoint are the outliers.
        outliers = [cluster for cluster in clusters if len(cluster) <= 1]
        self.outliers = outliers
        return outliers

    def _pairwise_distances(self, cluster1, cluster2):
        """Yield the distance for every (point in cluster1, point in cluster2) pair."""
        for datapoint1 in cluster1:
            for datapoint2 in cluster2:
                yield self.find_distance(datapoint1, datapoint2)

    def distance_cluster(self, clusters, cluster1, cluster2):
        """Return the linkage distance between two clusters.

        Args:
            clusters: Unused; kept so existing callers keep working.
            cluster1: Iterable of datapoint indices.
            cluster2: Iterable of datapoint indices.

        Returns:
            The minimum (single), maximum (complete) or mean (average) of the
            pairwise distances between the two clusters.

        Raises:
            ValueError: If ``self.linkage`` is not a supported linkage.
        """
        if self.linkage == "single":
            return min(self._pairwise_distances(cluster1, cluster2), default=float("inf"))
        if self.linkage == "complete":
            return max(self._pairwise_distances(cluster1, cluster2), default=float("-inf"))
        if self.linkage == "average":
            total_distance = sum(self._pairwise_distances(cluster1, cluster2))
            return total_distance / (len(cluster1) * len(cluster2))
        raise ValueError(
            "unsupported linkage %r; expected 'single', 'complete' or 'average'"
            % (self.linkage,)
        )

    def find_distance(self, datapoint1, datapoint2):
        """Return the distance between two datapoints given by index.

        Args:
            datapoint1: Index into ``self.data`` of the first point.
            datapoint2: Index into ``self.data`` of the second point.

        Returns:
            The distance under ``self.distance_metric``.

        Raises:
            ValueError: If ``self.distance_metric`` is not a supported metric.
        """
        first = self.data[datapoint1]
        second = self.data[datapoint2]
        # Feature count is taken from the first point, as before.
        differences = [second[f] - first[f] for f in range(len(first))]
        if self.distance_metric in ("euclidian", "euclidean"):
            return math.sqrt(sum(diff ** 2 for diff in differences))
        if self.distance_metric == "chebyshev":
            return max((abs(diff) for diff in differences), default=0)
        if self.distance_metric == "manhattan":
            return sum(abs(diff) for diff in differences)
        raise ValueError(
            "unsupported distance metric %r; expected 'euclidian', 'chebyshev' or 'manhattan'"
            % (self.distance_metric,)
        )

    def recall(self, outliers, predictions):
        """Return the fraction of true outliers that were predicted.

        Args:
            outliers: The ground-truth outliers.
            predictions: The predicted outliers.

        Returns:
            TP / (TP + FN), or 0.0 when ``outliers`` is empty.
        """
        true_positive = 0
        false_negative = 0
        for item in outliers:
            if item in predictions:
                true_positive += 1
            else:
                false_negative += 1
        total = true_positive + false_negative
        return true_positive / total if total else 0.0

    def precision(self, outliers, predictions):
        """Return the fraction of predictions that are true outliers.

        Args:
            outliers: The ground-truth outliers.
            predictions: The predicted outliers.

        Returns:
            TP / (TP + FP), or 0.0 when ``predictions`` is empty.
        """
        true_positive = 0
        false_positive = 0
        for item in predictions:
            # A prediction is a true positive only if it is a real outlier.
            if item in outliers:
                true_positive += 1
            else:
                false_positive += 1
        total = true_positive + false_positive
        return true_positive / total if total else 0.0

    def f1Score(self, outliers, predictions):
        """Return the harmonic mean of precision and recall.

        Returns:
            2 * P * R / (P + R), or 0.0 when both precision and recall are 0.
        """
        recall = self.recall(outliers, predictions)
        precision = self.precision(outliers, predictions)
        if precision + recall == 0:
            return 0.0
        return 2 * (precision * recall) / (precision + recall)
            
            