In [27]:
import pandas as pd
import sys
import numpy as np
import json
sys.path.append("../")
from modules.Deduplicator import Deduplicator
from modules.utils.NormalizedLevenshtein import get_normalized_levenshtein_dist

GEO_DATASET_PATH = '../datasets/geographicalSettelments'

In [28]:
def read_db():
    # Read the JSON lines file
    with open(f'{GEO_DATASET_PATH}/settlements.json', 'r') as f:
        data = [json.loads(line) for line in f]

    # Process the data and create a DataFrame
    records = []
    for entry in data:
        entry_data = entry.get('data', {})
        lat = entry_data.get('lat', float('inf'))
        lon = entry_data.get('lon', float('inf'))
        # ontology = entry_data.get('ontology', None) # not used
        label = entry_data.get('label', None)
        id_ = entry.get('id', None)
        records.append({'id': id_, 'lat': lat, 'lon': lon, 'label': label})

    df = pd.DataFrame(records)
    return df

In [29]:
from itertools import combinations

def generate_golden_standard_array():
  # Read the JSON lines file
  with open(f'{GEO_DATASET_PATH}/combinedSettlements(PerfectMatch).json', 'r') as f:
      data = [json.loads(line) for line in f]

  # Process the data and create a DataFrame
  clusters = []
  for entry in data:
      entry_data = entry.get('data', {})
      clusteredVertices = entry_data.get('clusteredVertices')
      clusters.append(clusteredVertices)

  gold_standard_pairs = []
  for cluster in clusters:
    gold_standard_pairs.extend(combinations(cluster, 2))
  
  return gold_standard_pairs

# Distance 

In [30]:
import math

label_w = 2
lat_w = 1
lon_w =1
def distance(item: dict, item2: dict):
  w_sum = 0

  label_dist = 0
  lat_dist = 0
  lon_dist = 0

  label_dist = get_normalized_levenshtein_dist(item['label'], item2['label'])
  w_sum += label_dist

  if ((item['lat'] != float('inf')) and (item2['lat'] != float('inf'))):
    diff = abs(item['lat'] - item2['lat'])
    
    if diff <= 100:
      lat_dist = 0
    else:
      lat_dist = 1
    # if diff > 0 and diff <= 150:
    #   lat_dist = 0
    # elif diff <= 500:
    #   lat_dist = .2
    # else:
    #   lat_dist = 1

    w_sum += lat_dist
 
  if ((item['lon'] != float('inf')) and (item2['lon'] != float('inf'))):
    diff = abs(item['lon'] - item2['lon'])

    if diff <= 100:
      lon_dist = 0
    else:
      lon_dist = 1
    # if diff > 0 and diff <= 150:
    #   lon_dist = 0
    # elif diff <= 500:
    #   lon_dist = .2
    # else:
    #   lon_dist = 1

    w_sum += lon_dist
  
  if (w_sum == 0): # don't divide by 0, it could happen
    return 0
 
  dist = ( (label_dist*label_w) + (lat_dist*lat_w) + (lon_dist*lon_w) ) / w_sum
  return dist
  

# Experiment

In [31]:
# DATASET 
df = read_db()
golden_standard_array = generate_golden_standard_array()

print('Base toda', len(df))
print("Gabarito: ", golden_standard_array)

Base toda 3054
Gabarito:  [(0, 1), (0, 6478), (1, 6478), (2, 3), (2, 5407), (2, 3775), (3, 5407), (3, 3775), (5407, 3775), (4, 5), (4, 6557), (4, 2911), (5, 6557), (5, 2911), (6557, 2911), (6285, 6), (6285, 7), (6285, 6745), (6, 7), (6, 6745), (7, 6745), (7459, 12), (7459, 13), (7459, 2272), (12, 13), (12, 2272), (13, 2272), (3811, 3252), (3811, 14), (3811, 15), (3252, 14), (3252, 15), (14, 15), (7101, 6859), (7101, 25), (7101, 24), (6859, 25), (6859, 24), (25, 24), (3861, 7148), (3861, 30), (3861, 31), (7148, 30), (7148, 31), (30, 31), (32, 33), (32, 5716), (32, 2587), (33, 5716), (33, 2587), (5716, 2587), (38, 39), (38, 6680), (38, 3710), (39, 6680), (39, 3710), (6680, 3710), (2934, 42), (2934, 43), (2934, 913), (42, 43), (42, 913), (43, 913), (50, 51), (50, 2069), (50, 6798), (51, 2069), (51, 6798), (2069, 6798), (6876, 53), (6876, 52), (6876, 2921), (53, 52), (53, 2921), (52, 2921), (5455, 2070), (5455, 57), (5455, 56), (2070, 57), (2070, 56), (57, 56), (415, 59), (415, 5340), (415

In [32]:
# partitions
total_rows = len(df)
first_partition_rows = int(total_rows * 0.50) # 50%
second_partition_rows = int(total_rows * 0.35) # 35%
third_partition_rows = total_rows - first_partition_rows - second_partition_rows # 15%

# Generate random indices for shuffling
np.random.seed(42)
indices = np.random.permutation(total_rows)

# Split the indices into three partitions
first_partition_indices = indices[:first_partition_rows]
second_partition_indices = indices[first_partition_rows:first_partition_rows+second_partition_rows]
third_partition_indices = indices[first_partition_rows+second_partition_rows:]

# Create the three partitions
df1 = df.loc[first_partition_indices]
df2 = df.loc[second_partition_indices]
df3 = df.loc[third_partition_indices]


print('Base particao 1', len(df1))
print('Base particao 2', len(df2))
print('Base particao 3', len(df3))

Base particao 1 1527
Base particao 2 1068
Base particao 3 459


### BATCH

In [33]:
deduplicator = Deduplicator('label', distance, 'id', 0.15)

clusters = deduplicator.run(df)

In [34]:
deduplicator.evaluate(golden_standard_array)

~~ EVALUATION ~~
  Precision: 0.9733581164807931
  Recall: 0.3577772716920975
  F-measure: 0.5232306411323898

  TP: 1571
  FP: 43
  TN: -1111
  FN: 2820

Comparações:  6986


### Incremental

In [35]:
deduplicator_inc = Deduplicator('label', distance, 'id', 0.2)

In [36]:
clusters_inc = deduplicator_inc.run(df1)

In [37]:
clusters_inc = deduplicator_inc.run(df2)

In [38]:
clusters_inc = deduplicator_inc.run(df3)

In [39]:
deduplicator_inc.evaluate(golden_standard_array)

~~ EVALUATION ~~
  Precision: 0.9733581164807931
  Recall: 0.3577772716920975
  F-measure: 0.5232306411323898

  TP: 1571
  FP: 43
  TN: -1111
  FN: 2820

Comparações:  6767
