In [4]:
import pandas as pd
import sys
sys.path.append("../")
from experiments.utils.cd_information import distance, clean_db
from modules.ProxCluster import ProxCluster
from modules.PhonexStaticBlocking import PhonexStaticBlocking
from modules.SoundexBlocking import SoundexBlocking
from modules.Evaluator import Evaluator

# Directory holding the CD dataset CSV files read by this notebook
# (cd.csv and cd_gold.csv); the path is relative to the working directory.
CD_DATASET_PATH = '../datasets/cd_information'

# Experiment

In [5]:
# Load the CD records (';'-separated; doublequote=False is handed to the pandas
# CSV parser) and pass them straight through the project's clean_db step.
df = clean_db(
    pd.read_csv(f'{CD_DATASET_PATH}/cd.csv', delimiter=';', doublequote=False)
)

# Gold standard used for evaluation: every row of cd_gold.csv becomes one tuple.
gold_standard_df = pd.read_csv(f'{CD_DATASET_PATH}/cd_gold.csv', delimiter=';')
gold_standard_pairs = [tuple(row) for row in gold_standard_df.values.tolist()]

# Report the number of rows left after cleaning.
print('Base toda', len(df))

Base toda 9763


In [6]:
# Single ProxCluster instance reused by every clustering run in this notebook.
# NOTE(review): 'pk' is presumably the record-id column and 0.25 a distance
# threshold -- confirm against ProxCluster's constructor.
prox_cluster = ProxCluster(distance, 'pk', 0.25)

In [7]:
# helper functions
def run_proxcluster(blocks):
  """Cluster each block with the shared ``prox_cluster`` and merge the results.

  Args:
    blocks: iterable of blocks; each one is passed unchanged to
      ``prox_cluster.run``.

  Returns:
    One dict built by ``dict.update``-ing the result returned for every block,
    in iteration order. An empty ``blocks`` yields an empty dict.

  NOTE(review): because the merge uses ``dict.update``, a key produced for a
  later block replaces the same key from an earlier block. This presumably
  relies on cluster keys being unique across blocks -- confirm against
  ProxCluster.run.
  """
  merged = {}
  for blk in blocks:
    merged.update(prox_cluster.run(blk))
  return merged


def evaluate_clusters(clusters):
  """Score ``clusters`` against the gold standard and print the report.

  A fresh ``Evaluator`` is created on every call. Metrics are computed from
  ``clusters``, the notebook-level ``gold_standard_pairs`` and the key
  ``'pk'``; the text returned by ``get_report()`` is then printed. Nothing is
  returned.

  NOTE(review): every report visible in this notebook shows ``TN: -1111``,
  which cannot be a real count -- presumably a placeholder inside Evaluator;
  confirm before quoting TN.
  """
  scorer = Evaluator()
  scorer.calculate_metrics(clusters, gold_standard_pairs, 'pk')
  report = scorer.get_report()
  print(report)

### PhonexStaticBlocking

#### BlockSize: 50

In [8]:
# Build blocks from df with PhonexStaticBlocking on the 'title' column.
# NOTE(review): the third argument (50) is presumably the block size, since it
# matches this section's "BlockSize" heading -- confirm against the class.
blocker = PhonexStaticBlocking(df, 'title', 50)
blocks = blocker.get_blocks()

# Print how many blocks were produced.
print(f"Quantidade de blocos: {len(blocks)}")

Quantidade de blocos: 196


In [15]:
# Cluster every block and merge the per-block results into one dict.
clusters = run_proxcluster(blocks)

In [11]:
# Print the evaluation report for the clusters computed in the previous cell.
# NOTE(review): this cell's execution count (In [11]) is lower than that of the
# clustering cell above (In [15]), so the report shown below predates the
# latest clustering run -- restart the kernel and run all cells to confirm it.
evaluate_clusters(clusters)

~~ EVALUATION ~~
  Precision: 0.9629629629629629
  Recall: 0.782608695652174
  F-measure: 0.8634686346863469

  TP: 234
  FP: 9
  TN: -1111
  FN: 65



#### BlockSize: 25

In [16]:
# Build blocks from df with PhonexStaticBlocking on the 'title' column.
# NOTE(review): the third argument (25) is presumably the block size, since it
# matches this section's "BlockSize" heading -- confirm against the class.
blocker = PhonexStaticBlocking(df, 'title', 25)
blocks = blocker.get_blocks()

# Print how many blocks were produced.
print(f"Quantidade de blocos: {len(blocks)}")

Quantidade de blocos: 391


In [17]:
# Cluster every block and merge the per-block results into one dict.
clusters = run_proxcluster(blocks)

In [18]:
# Print the evaluation report for the clusters computed in the previous cell.
evaluate_clusters(clusters)

~~ EVALUATION ~~
  Precision: 0.9617021276595744
  Recall: 0.7558528428093646
  F-measure: 0.846441947565543

  TP: 226
  FP: 9
  TN: -1111
  FN: 73



#### BlockSize: 15

In [19]:
# Build blocks from df with PhonexStaticBlocking on the 'title' column.
# NOTE(review): the third argument (15) is presumably the block size, since it
# matches this section's "BlockSize" heading -- confirm against the class.
blocker = PhonexStaticBlocking(df, 'title', 15)
blocks = blocker.get_blocks()

# Print how many blocks were produced.
print(f"Quantidade de blocos: {len(blocks)}")

Quantidade de blocos: 651


In [20]:
# Cluster every block and merge the per-block results into one dict.
clusters = run_proxcluster(blocks)

In [21]:
# Print the evaluation report for the clusters computed in the previous cell.
evaluate_clusters(clusters)

~~ EVALUATION ~~
  Precision: 0.9694323144104804
  Recall: 0.7424749163879598
  F-measure: 0.8409090909090908

  TP: 222
  FP: 7
  TN: -1111
  FN: 77



#### BlockSize: 5

In [28]:
# Build blocks from df with PhonexStaticBlocking on the 'title' column.
# NOTE(review): the third argument (5) is presumably the block size, since it
# matches this section's "BlockSize" heading -- confirm against the class.
blocker = PhonexStaticBlocking(df, 'title', 5)
blocks = blocker.get_blocks()

# Print how many blocks were produced.
print(f"Quantidade de blocos: {len(blocks)}")

Quantidade de blocos: 1953


In [29]:
# Cluster every block and merge the per-block results into one dict.
clusters = run_proxcluster(blocks)

In [24]:
# Print the evaluation report for the clusters computed in the previous cell.
# NOTE(review): this cell's execution count (In [24]) is lower than those of
# the blocking and clustering cells above (In [28], In [29]), so the report
# shown below predates their latest run -- restart the kernel and run all
# cells to confirm it.
evaluate_clusters(clusters)

~~ EVALUATION ~~
  Precision: 0.9613259668508287
  Recall: 0.5819397993311036
  F-measure: 0.7249999999999999

  TP: 174
  FP: 7
  TN: -1111
  FN: 125



### SoundexBlocking

In [25]:
# Build blocks with SoundexBlocking configured on 'title'; unlike the
# PhonexStaticBlocking cells, the DataFrame is passed to generate_blocks rather
# than to the constructor, and no size argument is given.
blocker = SoundexBlocking('title')
blocks = blocker.generate_blocks(df)

# Print how many blocks were produced.
print(f"Quantidade de blocos: {len(blocks)}")

Quantidade de blocos: 2628


In [26]:
# Cluster every Soundex block and merge the per-block results into one dict.
# Uses the shared run_proxcluster helper instead of a hand-copied loop, so this
# run follows exactly the same procedure as the PhonexStaticBlocking runs.
all_clusters = run_proxcluster(blocks)

In [27]:
# Evaluation: reuse the shared evaluate_clusters helper, which builds an
# Evaluator, scores the clusters against gold_standard_pairs on 'pk' and prints
# the report -- the same steps that were previously duplicated inline here.
evaluate_clusters(all_clusters)

~~ EVALUATION ~~
  Precision: 0.963855421686747
  Recall: 0.802675585284281
  F-measure: 0.8759124087591241

  TP: 240
  FP: 9
  TN: -1111
  FN: 59

