In [None]:
# Mount Google Drive at /content/drive (Colab-only); the dataset cells below
# read their input files from, and write their results to, paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install shapely

Collecting shapely
  Downloading shapely-2.0.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Downloading shapely-2.0.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/2.5 MB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/2.5 MB[0m [31m17.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.5/2.5 MB[0m [31m30.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: shapely
Successfully installed shapely-2.0.6


In [None]:
import csv
import sys
from shapely import wkt
from shapely.geometry import GeometryCollection, Polygon, MultiPolygon
from collections import deque

# Raise the csv module's per-field size limit as high as the platform accepts.
# Start at sys.maxsize and shrink by a factor of ten whenever the C layer
# rejects the value with OverflowError.
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt / 10)
        continue
    break

class CsvReader:
    """Static helpers for reading geometries and cluster scores from delimited files."""

    @staticmethod
    def _first_geometry(row):
        """Return the geometry parsed from the first WKT-parsable column of `row`.

        Columns are tried left to right; None is returned when no column parses.
        """
        for column in row:
            try:
                return wkt.loads(column)
            except Exception:
                # Deliberate best-effort: any column that is not valid WKT
                # (ids, names, numbers, ...) is simply skipped.
                continue
        return None

    @staticmethod
    def readAllEntities(delimiter, inputFilePath, batch_size=1000):
        """Load one geometry per row from a delimited text file.

        Args:
            delimiter: Field separator passed to `csv.reader`.
            inputFilePath: Path of the file to read.
            batch_size: Retained for backward compatibility only. Rows are now
                parsed as they are streamed from the file, so this value no
                longer has any effect (it never influenced the result).

        Returns:
            list: The parsed geometries in file order. Rows without any
            WKT-parsable column are skipped, and GeometryCollection rows are
            counted (and reported) but not returned.
        """
        loadedEntities = []
        geoCollections = 0

        with open(inputFilePath, newline='') as f:
            for row in csv.reader(f, delimiter=delimiter):
                geometry = CsvReader._first_geometry(row)
                if geometry is None:
                    continue

                if isinstance(geometry, GeometryCollection):
                    geoCollections += 1
                else:
                    loadedEntities.append(geometry)

        print(f"Total entities: {len(loadedEntities)}, Geometry collections: {geoCollections}")
        return loadedEntities

    @staticmethod
    def loadClusterDataToDeque(inputFilePath):
        """Load (cluster id, similarity index) pairs from a CSV file into a deque.

        The first row is treated as a header and skipped. Every following
        non-empty row must hold an int in column 0 and a float in column 1.
        Blank lines (which `csv.reader` yields as empty rows) are ignored
        instead of raising IndexError.

        Args:
            inputFilePath: Path of the CSV file to read.

        Returns:
            collections.deque: `(int, float)` tuples in file order.

        Raises:
            ValueError: If a column cannot be converted to int/float.
            IndexError: If a non-empty row has fewer than two columns.
        """
        cluster_data = deque()
        # newline='' lets the csv module do its own newline handling.
        with open(inputFilePath, 'r', newline='') as csv_file:
            reader = csv.reader(csv_file)
            next(reader, None)  # Skip the header row
            for row in reader:
                if not row:
                    continue  # blank line, e.g. trailing newline at end of file
                cluster_data.append((int(row[0]), float(row[1])))
        return cluster_data


In [None]:
import numpy as np
from shapely.geometry import Polygon
from shapely.affinity import translate
from shapely.set_operations import intersection_all, union_all
from shapely.strtree import STRtree

class ShapeSimilarity:
    """Scores how alike a group of polygons is using eight weighted shape measures.

    `calculate_similarity_all_pairs` is the entry point: it centres the polygons,
    caches per-polygon properties on the instance and averages the combined
    score over every unordered pair. The index-based methods read that cache,
    so they are only valid after `_precompute_properties` has run.
    """

    def __init__(self):
        pass

    def center_polygons(self, polygons):
        """Translate each polygon so that its centroid lies at the origin.

        Returns:
            numpy.ndarray: The translated polygons, in input order.
        """
        centered_polygons = []
        for polygon in polygons:
            centroid = polygon.centroid
            centered_polygons.append(translate(polygon, xoff=-centroid.x, yoff=-centroid.y))
        return np.array(centered_polygons)

    def _precompute_properties(self, polygons):
        """Centre `polygons` and cache area, perimeter, bounds and Fourier descriptor of each."""
        self.polygons = self.center_polygons(polygons)  # all measures work on centred shapes
        self.num_polygons = len(self.polygons)

        self.areas = np.array([polygon.area for polygon in self.polygons])
        self.perimeters = np.array([polygon.length for polygon in self.polygons])
        self.bboxes = np.array([polygon.bounds for polygon in self.polygons])
        self.fourier_descriptors = np.array([self._fourier_descriptor(polygon) for polygon in self.polygons])

    def _fourier_descriptor(self, polygon, num_points=128):
        """Return the magnitude spectrum of the polygon's exterior ring.

        The ring's vertices are resampled to `num_points` samples by linear
        interpolation over the vertex index (not over arc length), combined
        into complex numbers x + iy and passed through an FFT. Magnitudes are
        divided by the magnitude of the first harmonic.

        When the first harmonic is zero (e.g. a ring whose vertices all
        coincide) or not a number, the un-normalised magnitudes are returned
        instead of dividing by zero, which would inject NaN/inf values into
        every score that uses this descriptor.

        Args:
            polygon: Object exposing `exterior.coords` as (x, y) pairs.
            num_points: Number of samples (and length of the descriptor).

        Returns:
            numpy.ndarray: Array of `num_points` non-negative magnitudes.
        """
        coords = np.array(polygon.exterior.coords)
        t = np.linspace(0, 1, len(coords))
        resampled_t = np.linspace(0, 1, num_points)
        resampled_x = np.interp(resampled_t, t, coords[:, 0])
        resampled_y = np.interp(resampled_t, t, coords[:, 1])

        spectrum = np.fft.fft(resampled_x + 1j * resampled_y)
        scale = np.abs(spectrum[1])
        if not scale > 0:  # also true for NaN
            return np.abs(spectrum)
        return np.abs(spectrum / scale)  # Normalize

    def jaccard_similarity(self, A, B):
        """Return area(A intersection B) / area(A union B), or 0 when the union has no area."""
        intersection_area = A.intersection(B).area
        union_area = A.union(B).area
        return intersection_area / union_area if union_area != 0 else 0

    def area_similarity(self, idx_A, idx_B):
        """Return 2 * intersection area / (area A + area B), or 0 when both areas are 0."""
        intersection_area = self.polygons[idx_A].intersection(self.polygons[idx_B]).area
        total_area = self.areas[idx_A] + self.areas[idx_B]
        return (2 * intersection_area) / total_area if total_area > 0 else 0

    def curvature_similarity(self, idx_A, idx_B):
        """Return exp(-|nA - nB| / max(nA, nB)) for the exterior-ring coordinate counts nA, nB.

        Despite the name, only the number of exterior coordinates is compared.
        """
        num_vertices_A = len(self.polygons[idx_A].exterior.coords)
        num_vertices_B = len(self.polygons[idx_B].exterior.coords)
        return np.exp(-abs(num_vertices_A - num_vertices_B) / max(num_vertices_A, num_vertices_B))

    def fourier_descriptor_similarity(self, idx_A, idx_B):
        """Return 1 / (1 + Euclidean distance between the two cached Fourier descriptors)."""
        distance = np.linalg.norm(self.fourier_descriptors[idx_A] - self.fourier_descriptors[idx_B])
        return 1 / (1 + distance)

    def aspect_ratio_similarity(self, bbox_A, bbox_B):
        """Return 1 / (1 + |ratio A - ratio B|) for width/height ratios of two bounding boxes.

        Boxes are (minx, miny, maxx, maxy); a box of zero height has ratio 0.
        """
        aspect_ratio_A = (bbox_A[2] - bbox_A[0]) / (bbox_A[3] - bbox_A[1]) if bbox_A[3] != bbox_A[1] else 0
        aspect_ratio_B = (bbox_B[2] - bbox_B[0]) / (bbox_B[3] - bbox_B[1]) if bbox_B[3] != bbox_B[1] else 0
        return 1 / (1 + abs(aspect_ratio_A - aspect_ratio_B))

    def perimeter_similarity(self, idx_A, idx_B):
        """Return 1 / (1 + absolute difference of the two cached perimeters)."""
        return 1 / (1 + abs(self.perimeters[idx_A] - self.perimeters[idx_B]))

    def bounding_box_distance(self, idx_A, idx_B):
        """Return 1 / (1 + distance between the centres of the two cached bounding boxes)."""
        box_A, box_B = self.bboxes[idx_A], self.bboxes[idx_B]
        center_A = ((box_A[0] + box_A[2]) / 2, (box_A[1] + box_A[3]) / 2)
        center_B = ((box_B[0] + box_B[2]) / 2, (box_B[1] + box_B[3]) / 2)
        dist_centers = np.linalg.norm(np.array(center_A) - np.array(center_B))
        return 1 / (1 + dist_centers)

    def polygon_circularity_similarity(self, idx_A, idx_B):
        """Return 1 / (1 + |cA - cB|) where c = 4*pi*area / perimeter**2 (0 for zero perimeter)."""
        circularity_A = (4 * np.pi * self.areas[idx_A]) / (self.perimeters[idx_A] ** 2) if self.perimeters[idx_A] != 0 else 0
        circularity_B = (4 * np.pi * self.areas[idx_B]) / (self.perimeters[idx_B] ** 2) if self.perimeters[idx_B] != 0 else 0
        return 1 / (1 + abs(circularity_A - circularity_B))

    def combined_similarity(self, idx_A, idx_B, w_jaccard=0.125, w_area=0.125, w_curvature=0.125, w_fourier=0.125,
                            w_aspect_ratio=0.125, w_perimeter=0.125, w_bbox=0.125, w_circularity=0.125):
        """Return the weighted sum of all eight measures for one pair, multiplied by 100.

        Args:
            idx_A, idx_B: Indices into the cached (centred) polygons.
            w_*: Weight of each measure; the defaults give every measure 1/8.

        Returns:
            float: The combined score.
        """
        polygon_A, polygon_B = self.polygons[idx_A], self.polygons[idx_B]

        # The intersection is the costly geometric operation and feeds both the
        # Jaccard and the area measure, so it is computed once and reused. The
        # two formulas below are the same as in jaccard_similarity and
        # area_similarity.
        intersection_area = polygon_A.intersection(polygon_B).area
        union_area = polygon_A.union(polygon_B).area
        jaccard_sim = intersection_area / union_area if union_area != 0 else 0
        total_area = self.areas[idx_A] + self.areas[idx_B]
        area_sim = (2 * intersection_area) / total_area if total_area > 0 else 0

        curvature_sim = self.curvature_similarity(idx_A, idx_B)
        fourier_sim = self.fourier_descriptor_similarity(idx_A, idx_B)
        aspect_ratio_sim = self.aspect_ratio_similarity(self.bboxes[idx_A], self.bboxes[idx_B])
        perimeter_sim = self.perimeter_similarity(idx_A, idx_B)
        bbox_dist = self.bounding_box_distance(idx_A, idx_B)
        circularity_sim = self.polygon_circularity_similarity(idx_A, idx_B)

        return (w_jaccard * jaccard_sim +
                w_area * area_sim +
                w_curvature * curvature_sim +
                w_fourier * fourier_sim +
                w_aspect_ratio * aspect_ratio_sim +
                w_perimeter * perimeter_sim +
                w_bbox * bbox_dist +
                w_circularity * circularity_sim) * 100

    def calculate_similarity_all_pairs(self, polygons_array):
        """Return the mean combined similarity over all unordered pairs of polygons.

        Replaces the instance's cached properties with those of `polygons_array`.
        Returns 0 when fewer than two polygons are given.
        """
        self._precompute_properties(polygons_array)

        similarity_scores = []
        for i in range(self.num_polygons):
            for j in range(i + 1, self.num_polygons):
                similarity_scores.append(self.combined_similarity(i, j))

        return np.mean(similarity_scores) if similarity_scores else 0


In [None]:
from shapely import relate
from shapely import Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon
from shapely import get_num_coordinates

class RelatedGeometries:
    """Histogram of clusters by similarity score, in ten 10-point ranges over 0-100.

    Each range keeps the list of cluster ids that fell into it. Ranges are
    half-open ([0, 10), [10, 20), ...) except the last, which is [90, 100].
    """

    def __init__(self):
        # Number of clusters passed to verifyRelations since construction/reset.
        self.verifiedClusters = 0

        # Lists to store the clusters by similarity range (0-100%)
        self.similarity_0_10 = []
        self.similarity_10_20 = []
        self.similarity_20_30 = []
        self.similarity_30_40 = []
        self.similarity_40_50 = []
        self.similarity_50_60 = []
        self.similarity_60_70 = []
        self.similarity_70_80 = []
        self.similarity_80_90 = []
        self.similarity_90_100 = []

    def _buckets(self):
        """Return the ten range lists ordered from 0-10 up to 90-100."""
        return (
            self.similarity_0_10,
            self.similarity_10_20,
            self.similarity_20_30,
            self.similarity_30_40,
            self.similarity_40_50,
            self.similarity_50_60,
            self.similarity_60_70,
            self.similarity_70_80,
            self.similarity_80_90,
            self.similarity_90_100,
        )

    def addSimilarity(self, cluster, similarity):
        """Store `cluster` in the range that contains `similarity`.

        Args:
            cluster: Identifier to record.
            similarity: Score expected to lie in [0, 100].

        Returns:
            bool: True if the cluster was stored; False when `similarity` is
            outside [0, 100] or NaN, in which case nothing is stored.
        """
        if not 0 <= similarity <= 100:
            return False

        buckets = self._buckets()
        # Upper bounds 10, 20, ..., 90 pair with the first nine ranges; anything
        # not below 90 belongs to the closed last range [90, 100].
        for upper_bound, bucket in zip(range(10, 100, 10), buckets):
            if similarity < upper_bound:
                bucket.append(cluster)
                return True
        buckets[-1].append(cluster)
        return True

    def getNoOfClustersInRange(self, lower_bound, upper_bound):
        """Return how many clusters are stored in the range `lower_bound`-`upper_bound`.

        Only the exact pairs (0, 10), (10, 20), ..., (90, 100) are recognised;
        any other pair yields 0.
        """
        for index, bucket in enumerate(self._buckets()):
            if lower_bound == index * 10 and upper_bound == (index + 1) * 10:
                return len(bucket)
        return 0

    def reset(self):
        """Zero the verified-cluster counter and empty every range list in place."""
        self.verifiedClusters = 0
        for bucket in self._buckets():
            bucket.clear()

    def print(self):
        """Print the number of clusters in each range and the verified-cluster counter."""
        for index, bucket in enumerate(self._buckets()):
            low = index * 10
            print(f"Clusters in {low}-{low + 10}% similarity range:\t", str(len(bucket)))
        print("Verified Clusters", str(self.verifiedClusters))

    def verifyRelations(self, cluster, similarity):
        """Count `cluster` as verified and store it in its similarity range.

        The counter is incremented for every call, including calls whose
        `similarity` cannot be stored, so the counter can exceed the sum of
        the range sizes.

        Returns:
            bool: True if the cluster was stored in a range, False otherwise
            (see `addSimilarity`). Previously nothing was returned, so callers
            that test the result always saw a falsy value.
        """
        self.verifiedClusters += 1
        return self.addSimilarity(cluster, similarity)

In [None]:
import math
import numpy as np
import sys
import time
import shapely
from collections import defaultdict
import os
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
import multiprocessing
#from utilities import CsvReader
#from datamodel import RelatedGeometries

class Similarity_Results:
    """Groups source geometries per target via an equigrid index and scores each group.

    Source geometries are registered in every grid cell their bounding box
    touches. For each target geometry, the sources sharing at least one cell
    with it form a cluster whose average pairwise shape similarity is computed,
    binned and written to CSV.
    """

    def __init__(self, delimiter: str, sourceFilePath: str, targetFilePath: str, outputDirectoryPath: str):
        """Load both datasets and initialise the empty index and bookkeeping.

        Args:
            delimiter: Field separator of the input files.
            sourceFilePath: File holding the source geometries.
            targetFilePath: File holding the target geometries.
            outputDirectoryPath: Directory the result CSV files are written to.
        """
        # Read the two files concurrently, one worker process each. starmap
        # returns results in argument order: source first, then target.
        with multiprocessing.Pool(processes=2) as pool:
            results = pool.starmap(CsvReader.readAllEntities, [
                (delimiter, sourceFilePath),
                (delimiter, targetFilePath)
            ])

        self.sourceData, self.targetData = results

        print('Source geometries:', len(self.sourceData))
        print('Target geometries:', len(self.targetData))

        self.relations = RelatedGeometries()
        self.similarity_calculator = ShapeSimilarity()
        # spatialIndex[column][row] -> list of source ids registered in that cell
        self.spatialIndex = defaultdict(lambda: defaultdict(list))
        self.verifiedClusters = set()
        # Per source id: the last target that reached it, and in how many cells.
        self.flag = [-1] * len(self.sourceData)
        self.frequency = [-1] * len(self.sourceData)
        self.output_dir = outputDirectoryPath

    def applyProcessing(self):
        """
        Execute the complete algorithm pipeline:
        1. Compute the grid cell size and index the source geometries.
        2. Verify all clusters and find their similarity index.
        Prints both phase durations in milliseconds and the range histogram.
        """
        time1 = int(time.time() * 1000)
        self.setThetas()
        self.indexSource()
        time2 = int(time.time() * 1000)
        self.verification()
        time3 = int(time.time() * 1000)

        print("Indexing Time\t:\t" + str(time2 - time1))
        print("Verification Time\t:\t" + str(time3 - time2))
        self.relations.print()

    def indexSource(self):
        """Register every source geometry in the grid under its position in sourceData."""
        for geometryId, sEntity in enumerate(self.sourceData):
            self.addToIndex(geometryId, sEntity.bounds)

    def _gridCells(self, bounds):
        """Return the (column range, row range) of grid cells touched by `bounds`.

        `bounds` is (minx, miny, maxx, maxy). The lower edge is floored and the
        upper edge is ceiled, and both resulting indices are included.
        """
        minX = math.floor(bounds[0] / self.thetaX)
        minY = math.floor(bounds[1] / self.thetaY)
        maxX = math.ceil(bounds[2] / self.thetaX)
        maxY = math.ceil(bounds[3] / self.thetaY)
        return range(minX, maxX + 1), range(minY, maxY + 1)

    def addToIndex(self, geometryId, envelope):
        """
        Adds a geometry to the spatial index.

        Args:
            geometryId: Id stored in each touched cell.
            envelope: Bounding box as (minx, miny, maxx, maxy).
        """
        columns, rows = self._gridCells(envelope)
        for latIndex in columns:
            for longIndex in rows:
                self.spatialIndex[latIndex][longIndex].append(geometryId)

    def getCandidates(self, targetId):
        """Return the set of source ids sharing at least one grid cell with a target.

        As a side effect, `frequency[sourceId]` is set to the number of shared
        cells for the current target and `flag[sourceId]` to `targetId`.

        Args:
            targetId: Position of the target geometry in targetData.

        Returns:
            set: Ids of the candidate source geometries (possibly empty).
        """
        candidates = set()
        columns, rows = self._gridCells(self.targetData[targetId].bounds)

        for latIndex in columns:
            # .get() instead of [] so that probing never-indexed cells does not
            # insert empty entries into the nested defaultdict; with millions
            # of targets those entries would keep growing the index.
            column = self.spatialIndex.get(latIndex)
            if column is None:
                continue
            for longIndex in rows:
                for sourceId in column.get(longIndex, ()):
                    if self.flag[sourceId] != targetId:
                        # First cell in which this target meets the source.
                        self.flag[sourceId] = targetId
                        self.frequency[sourceId] = 0
                    self.frequency[sourceId] += 1
                    candidates.add(sourceId)

        return candidates

    def setThetas(self):
        """
        Set the grid cell size (thetaX, thetaY) to the mean bounding-box width
        and height of the source geometries.

        With no source geometries the index stays empty and the cell size
        cannot affect any result, so a unit cell is used instead of dividing
        by zero.

        NOTE: if every source geometry has zero width or zero height, the
        corresponding theta is 0 and indexing divides by zero.
        """
        sourceCount = len(self.sourceData)
        if sourceCount == 0:
            self.thetaX, self.thetaY = 1.0, 1.0
        else:
            self.thetaX, self.thetaY = 0, 0
            for sEntity in self.sourceData:
                bounds = sEntity.bounds
                self.thetaX += bounds[2] - bounds[0]
                self.thetaY += bounds[3] - bounds[1]
            self.thetaX /= sourceCount
            self.thetaY /= sourceCount
        print("Dimensions of Equigrid", self.thetaX, "and", self.thetaY)

    @staticmethod
    def _writeCsv(path, header, rows):
        """Write `header` followed by `rows` to a new CSV file at `path`."""
        with open(path, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(header)
            writer.writerows(rows)

    def verification(self):
        """
        Performs cluster verification and calculates similarity metrics:
        - For each target with more than one candidate source geometry, computes
          the average pairwise similarity of those candidates (the cluster).
        - Records every cluster in the range histogram (self.relations).
        - Collects clusters whose average similarity is <= 10 separately.
        - Writes "similarity_results.csv" and "low_similarity_results.csv"
          into the output directory, which must already exist.
        """
        truePositiveDecisions = 0
        all_similarities = []  # average similarity of every cluster
        similarity_data = []   # (target id, average similarity)
        zero_to_ten_similarity = []  # same pairs, only those with average <= 10

        for targetId in tqdm(range(len(self.targetData)), desc="Calculating Recall", unit="prediction"):
            candidateMatches = self.getCandidates(targetId)
            if len(candidateMatches) <= 1:
                continue  # a pairwise average needs at least two polygons

            # Map candidateMatches indexes to their corresponding polygons
            candidatePolygons = [self.sourceData[idx] for idx in candidateMatches]
            average_similarity = self.similarity_calculator.calculate_similarity_all_pairs(candidatePolygons)

            if average_similarity <= 10:
                zero_to_ten_similarity.append((targetId, average_similarity))

            all_similarities.append(average_similarity)
            similarity_data.append((targetId, average_similarity))

            if self.relations.verifyRelations(targetId, average_similarity):
                truePositiveDecisions += 1

        overall_average_similarity = sum(all_similarities) / len(all_similarities) if all_similarities else 0
        print(f"Overall Average Similarity: {overall_average_similarity}")
        print("True Positive Decisions\t:\t" + str(truePositiveDecisions))

        csv_file = os.path.join(self.output_dir, "similarity_results.csv")
        zero_to_ten_csv_file = os.path.join(self.output_dir, "low_similarity_results.csv")

        self._writeCsv(csv_file, ["Cluster ID", "Average Similarity"], similarity_data)
        print(f"Similarity results saved to {csv_file}")

        self._writeCsv(zero_to_ten_csv_file, ["Cluster ID", "Average Similarity <= 10"], zero_to_ten_similarity)
        print(f"Low similarity results saved to {zero_to_ten_csv_file}")




In [None]:
# Dataset D1: inputs are read from, and both result CSVs are written to, main_dir.
main_dir = '../content/drive/MyDrive/hpml_final_project/D1/'

sg = Similarity_Results(
    delimiter='\t',
    sourceFilePath=f'{main_dir}SourceDataset.csv',
    targetFilePath=f'{main_dir}TargetDataset.csv',
    outputDirectoryPath=main_dir,
)
sg.applyProcessing()


Total entities: 583833, Geometry collections: 0
Total entities: 229276, Geometry collections: 0
Source geometries: 229276
Target geometries: 583833
Dimensions of Equigrid 0.009110405127444583 and 0.0068148931157206456


Calculating Recall: 100%|██████████| 583833/583833 [1:21:13<00:00, 119.79prediction/s]


Overall Average Similarity: 56.50621492994735
True Positive Decisions	:	0
Similarity results saved to ../content/drive/MyDrive/hpml_final_project/D1/similarity_results.csv
Low similarity results saved to ../content/drive/MyDrive/hpml_final_project/D1/low_similarity_results.csv
Indexing Time	:	11919
Verification Time	:	4874137
Clusters in 0-10% similarity range:	 0
Clusters in 10-20% similarity range:	 0
Clusters in 20-30% similarity range:	 68
Clusters in 30-40% similarity range:	 10864
Clusters in 40-50% similarity range:	 64553
Clusters in 50-60% similarity range:	 121251
Clusters in 60-70% similarity range:	 68206
Clusters in 70-80% similarity range:	 27246
Clusters in 80-90% similarity range:	 3215
Clusters in 90-100% similarity range:	 65
Verified Clusters 295481


In [None]:
# Dataset D2: inputs are read from, and both result CSVs are written to, main_dir.
main_dir = '../content/drive/MyDrive/hpml_final_project/D2/'

sg = Similarity_Results(
    delimiter='\t',
    sourceFilePath=f'{main_dir}SourceDataset.csv',
    targetFilePath=f'{main_dir}TargetDataset.csv',
    outputDirectoryPath=main_dir,
)
sg.applyProcessing()


Total entities: 210483, Geometry collections: 0
Total entities: 2898899, Geometry collections: 0
Source geometries: 210483
Target geometries: 2898899
Dimensions of Equigrid 0.02034925282944467 and 0.01278916771425737


Calculating Recall: 100%|██████████| 2898899/2898899 [3:27:57<00:00, 232.34prediction/s]


Overall Average Similarity: 60.030136776775194
True Positive Decisions	:	0
Similarity results saved to ../content/drive/MyDrive/hpml_final_project/D2/similarity_results.csv
Low similarity results saved to ../content/drive/MyDrive/hpml_final_project/D2/low_similarity_results.csv
Indexing Time	:	8905
Verification Time	:	12478341
Clusters in 0-10% similarity range:	 0
Clusters in 10-20% similarity range:	 0
Clusters in 20-30% similarity range:	 0
Clusters in 30-40% similarity range:	 3299
Clusters in 40-50% similarity range:	 49966
Clusters in 50-60% similarity range:	 289280
Clusters in 60-70% similarity range:	 254385
Clusters in 70-80% similarity range:	 45820
Clusters in 80-90% similarity range:	 9243
Clusters in 90-100% similarity range:	 2181
Verified Clusters 654196


In [None]:
# Dataset D3: inputs are read from, and both result CSVs are written to, main_dir.
main_dir = '../content/drive/MyDrive/hpml_final_project/D3/'

sg = Similarity_Results(
    delimiter='\t',
    sourceFilePath=f'{main_dir}SourceDataset.csv',
    targetFilePath=f'{main_dir}TargetDataset.csv',
    outputDirectoryPath=main_dir,
)
sg.applyProcessing()


Source geometries 200294
Target geometries 7392699
Source geometries: 200294
Target geometries: 7392699
Dimensions of Equigrid 0.0224352825786094 and 0.014045209125086408


Calculating Recall: 100%|██████████| 7392699/7392699 [21:42:24<00:00, 94.60prediction/s]  


Overall Average Similarity: 60.56998794994542
True Positive Decisions	:	1324980
Similarity results saved to ../content/drive/MyDrive/hpml_final_project/D3/similarity_results.csv
Low similarity results saved to ../content/drive/MyDrive/hpml_final_project/D3/low_similarity_results.csv
Indexing Time	:	9004
Verification Time	:	78147255
Clusters in 0-10% similarity range:	 0
Clusters in 10-20% similarity range:	 0
Clusters in 20-30% similarity range:	 0
Clusters in 30-40% similarity range:	 4143
Clusters in 40-50% similarity range:	 93512
Clusters in 50-60% similarity range:	 501648
Clusters in 60-70% similarity range:	 605463
Clusters in 70-80% similarity range:	 103164
Clusters in 80-90% similarity range:	 13195
Clusters in 90-100% similarity range:	 3582
Verified Clusters 1324980
