In [1]:
import pandas as pd

# Distance function

In [2]:
# STRING distance (title, album and artist...)
import Levenshtein as lev

def string_distance(string1: str, string2: str):
  """Return the edit distance between two strings, scaled by the longer one.

  The raw distance comes from `lev.distance`; it is divided by the length of
  the longer input so the result is comparable across string lengths.
  Identical strings (including two empty strings) yield 0.

  Args:
    string1: first string to compare.
    string2: second string to compare.

  Returns:
    0 when both strings are empty, otherwise a float where 0.0 means equal.
  """
  # Normalisation is currently disabled; inputs are compared exactly as given.
  # x = remove_non_alphanum(string1).lower()
  # y = remove_non_alphanum(string2).lower()
  edits = lev.distance(string1, string2)

  longest = max(len(string1), len(string2))
  if longest == 0:
    # Nothing to compare on either side: treat as identical.
    return 0

  # Kept as "1 - similarity" so the floating-point result matches exactly.
  similarity = (longest - edits) / longest
  return 1 - similarity

In [3]:
# Relative weight of each field in the weighted average computed by distance().
title_w = 1.2
album_w = 1
artist_w = 1
# NOTE(review): the two weights below are not referenced by distance() in this
# cell; kept because other code may rely on them.
number_track_w = .8
year_w = .5


def _has_text(value) -> bool:
  """Return True when `value` is neither missing (None/NaN/NA) nor empty."""
  # pd.notna is checked first so that bool() is never called on pd.NA.
  return bool(pd.notna(value)) and bool(value)


def distance(item: pd.Series, item2: pd.Series) -> float:
  """Weighted average of the string distances of title, album and artist.

  A field only takes part in the average when it is present (not missing and
  not empty) in BOTH records; its weight is then added to the denominator.

  Args:
    item: first record, indexable by 'title', 'album' and 'artist'.
    item2: second record, indexable by the same keys.

  Returns:
    The weighted mean of the per-field `string_distance` values. When no
    field is comparable in both records, 1.0 is returned (the records cannot
    be shown to be similar) instead of dividing by zero.
  """
  weighted_fields = (
    ('title', title_w),
    ('album', album_w),
    ('artist', artist_w),
  )

  w_sum = 0
  weighted_dist = 0
  for field, weight in weighted_fields:
    # Short-circuit keeps the original access pattern: item2 is only read
    # when item has a usable value for this field.
    if not (_has_text(item[field]) and _has_text(item2[field])):
      continue
    weighted_dist += string_distance(item[field], item2[field]) * weight
    w_sum += weight

  if w_sum == 0:
    # No field could be compared: avoid ZeroDivisionError and report the
    # maximum distance so the pair is never merged on missing evidence.
    return 1.0

  return weighted_dist / w_sum

# Data creation


In [4]:
# Base catalogue (TIDs 1-8); contains spelling variants of the same song/artist.
data = {
    'TID': ['1', '2', '3', '4', '5', '6', '7', '8'],
    'title': [
        'Boêhmian Rhapsody', 'Hotel California', 'Imagine', 'Shape of You',
        'Rolling in the Deep', 'Imagine', 'Bohemian Rhapsody', 'Shape of You',
    ],
    'artist': [
        'Queen', 'Eagles', 'John Lennon', 'Ed Sheran',
        'Adele', 'John Lennon', 'Queen', 'Ed Sheeran',
    ],
    'album': [
        'A Night at the Opera', 'Hotel California', 'Imagine', '÷',
        '21', 'Imagine Deluxe', 'Greatest Hits', '÷',
    ],
}

# First incremental batch (TIDs x1-x3).
data2 = {
    'TID': ['x1', 'x2', 'x3'],
    'title': ['Boehmian Rhapsodyyy', 'Boehmian Rhapsody', 'Sabreu Pega no Breu'],
    'artist': ['Queen', 'Cover Banda', 'Mandioca'],
    'album': ['A Night at the Opera', 'Covers artisticos', 'Duo 2010'],
}

# Second incremental batch (TIDs y1-y2); y2 has an empty album.
data3 = {
    'TID': ['y1', 'y2'],
    'title': ['A raposa e as uvas', 'Shap1e ofYou'],
    'artist': ['Reninaldo Rossi', 'Ed_sheran'],
    'album': ['Sucessos Vol3', ''],
}

# Third incremental batch (TID z1).
data4 = {
    'TID': ['z1'],
    'title': ['A raposa e as uvas'],
    'artist': ['Reginaldo Rossi'],
    'album': ['Sucessos Vol'],
}

df = pd.DataFrame(data)
df2 = pd.DataFrame(data2)
df3 = pd.DataFrame(data3)
df4 = pd.DataFrame(data4)

# One call, same bytes on stdout: frames separated by a blank line, "~~~", blank line.
print(df, df2, df3, df4, sep='\n\n~~~\n\n')

  TID                title       artist                 album
0   1    Boêhmian Rhapsody        Queen  A Night at the Opera
1   2     Hotel California       Eagles      Hotel California
2   3              Imagine  John Lennon               Imagine
3   4         Shape of You    Ed Sheran                     ÷
4   5  Rolling in the Deep        Adele                    21
5   6              Imagine  John Lennon        Imagine Deluxe
6   7    Bohemian Rhapsody        Queen         Greatest Hits
7   8         Shape of You   Ed Sheeran                     ÷

~~~

  TID                title       artist                 album
0  x1  Boehmian Rhapsodyyy        Queen  A Night at the Opera
1  x2    Boehmian Rhapsody  Cover Banda     Covers artisticos
2  x3  Sabreu Pega no Breu     Mandioca              Duo 2010

~~~

  TID               title           artist          album
0  y1  A raposa e as uvas  Reninaldo Rossi  Sucessos Vol3
1  y2        Shap1e ofYou        Ed_sheran               

~~~

  

# Helper functions

In [5]:
def print_clusters(clusters_blocks):
  """Print every cluster of every block in a human-readable layout.

  Args:
    clusters_blocks: iterable with one mapping per block; each mapping goes
      from a cluster key to an iterable of records that can be indexed by
      'TID', 'title', 'artist' and 'album'.

  Returns:
    None. Everything is written to stdout.
  """
  for block_index, block_clusters in enumerate(clusters_blocks):
    print("CLUSTERS DO BLOCK ", block_index)
    for cluster_key, members in block_clusters.items():
      print('>>>> CLUSTER: ', cluster_key)
      # A dedicated loop name avoids rebinding the block index.
      for item in members:
        print('>>>> ', item['TID'], item['title'], item['artist'], item['album'])
      print()
    print('-'*100)

In [6]:
from modules.PhonexStaticBlocking import PhonexStaticBlocking
from modules.SoundexBlocking import SoundexBlocking

# Shared blocker, configured with the 'title' column.
blocker = SoundexBlocking('title')

def block_data(df):
  """Return the blocks that the module-level `blocker` generates for `df`."""
  return blocker.generate_blocks(df)

def print_blocks(blocks):
  """Print each block, leaving one blank line after every block."""
  for current_block in blocks:
    print(current_block, end='\n\n')

# Clustering

In [7]:
# Block the base dataset; later cells look incremental records up against `blocks`.
blocks = block_data(df)

In [8]:
from modules.CustomKmeans import CustomKmeans 

# Single clusterer instance, reused by the incremental cells further down.
customKmeans = CustomKmeans(distanceFn=distance, uID='TID', threshold=0.35)

# Cluster every block on its own; results stay in the same order as `blocks`,
# so position i of `clusterized_blocks` belongs to position i of `blocks`.
clusterized_blocks = [customKmeans.run(block) for block in blocks]


In [9]:
# print_blocks(blocks)

In [10]:
# Show the clusters found in each block of the base dataset.
print_clusters(clusterized_blocks)

CLUSTERS DO BLOCK  0
>>>> CLUSTER:  1
>>>>  1 Boêhmian Rhapsody Queen A Night at the Opera
>>>>  7 Bohemian Rhapsody Queen Greatest Hits

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  1
>>>> CLUSTER:  2
>>>>  2 Hotel California Eagles Hotel California

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  2
>>>> CLUSTER:  3
>>>>  3 Imagine John Lennon Imagine
>>>>  6 Imagine John Lennon Imagine Deluxe

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  3
>>>> CLUSTER:  5
>>>>  5 Rolling in the Deep Adele 21

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  4
>>>> CLUSTER:  4
>>>>  4 Shape of You Ed Sheran ÷
>>>>  8 Shape of You Ed Sheeran ÷

----------------------------------------------------------------------

# Incremental Clustering

In [11]:
# Incremental pass for df2: block the new records, then route each new block
# either to an existing cluster block or to a brand-new one.
blocks2 = block_data(df2) # block incremental data
# blocks_incremental = blocker.merge_blocks(blocks, blocks_incremental) # merge blocks

for block_incremental in blocks2:
  # find index of the clusters block
  # (the lookup uses the first row of the new block against the base `blocks`)
  clusters_block_index = blocker.get_row_block_index(block_incremental.iloc[0], blocks)
  
#   print(clusters_block_index)
  # Presumably -1 means "no matching block" — confirm against get_row_block_index.
  if (clusters_block_index > -1):
    # NOTE(review): the returned value is not written back into
    # clusterized_blocks; the printed output (x1 joins cluster 1) suggests
    # run() updates the passed clusters in place — confirm.
    clusters = customKmeans.run(block_incremental, clusterized_blocks[clusters_block_index])
  else:
    # new blocks, new cluster block
    # NOTE(review): `blocks` is not extended here, so a later batch falling in
    # this same new block would not be found by the lookup above and
    # clusterized_blocks would no longer line up with `blocks` — verify.
    clusters = customKmeans.run(block_incremental)
    clusterized_blocks.append(clusters)



In [12]:
# Show the clusters again, now including the records from df2.
print_clusters(clusterized_blocks)

CLUSTERS DO BLOCK  0
>>>> CLUSTER:  1
>>>>  1 Boêhmian Rhapsody Queen A Night at the Opera
>>>>  7 Bohemian Rhapsody Queen Greatest Hits
>>>>  x1 Boehmian Rhapsodyyy Queen A Night at the Opera

>>>> CLUSTER:  x2
>>>>  x2 Boehmian Rhapsody Cover Banda Covers artisticos

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  1
>>>> CLUSTER:  2
>>>>  2 Hotel California Eagles Hotel California

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  2
>>>> CLUSTER:  3
>>>>  3 Imagine John Lennon Imagine
>>>>  6 Imagine John Lennon Imagine Deluxe

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  3
>>>> CLUSTER:  5
>>>>  5 Rolling in the Deep Adele 21

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  4
>>>> CLUSTER:  4
>>>>  

In [13]:
# Incremental pass for df3; same routing logic as the df2 pass.
blocks3 = block_data(df3) # block incremental data
# blocks_incremental = blocker.merge_blocks(blocks, blocks_incremental) # merge blocks

for block_incremental in blocks3:
  # find index of the clusters block
  # (the lookup uses the first row of the new block against the base `blocks`)
  clusters_block_index = blocker.get_row_block_index(block_incremental.iloc[0], blocks)
  
  # print(clusters_block_index)
  # Presumably -1 means "no matching block" — confirm against get_row_block_index.
  if (clusters_block_index > -1):
    # NOTE(review): the returned value is not written back into
    # clusterized_blocks; this relies on run() updating the passed clusters
    # in place — confirm.
    clusters = customKmeans.run(block_incremental, clusterized_blocks[clusters_block_index])
  else:
    # new blocks, new cluster block
    # NOTE(review): `blocks` is never extended with blocks created by earlier
    # incremental passes, so records matching one of those would open a
    # duplicate cluster block here — verify.
    clusters = customKmeans.run(block_incremental)
    clusterized_blocks.append(clusters)

In [14]:
# Print the blocks generated from df3.
print_blocks(blocks3)

  TID               title           artist          album blocking_key
0  y1  A raposa e as uvas  Reninaldo Rossi  Sucessos Vol3         A612

  TID         title     artist album blocking_key
1  y2  Shap1e ofYou  Ed_sheran               S100



In [15]:
# Show the clusters again after the df3 pass.
print_clusters(clusterized_blocks)

CLUSTERS DO BLOCK  0
>>>> CLUSTER:  1
>>>>  1 Boêhmian Rhapsody Queen A Night at the Opera
>>>>  7 Bohemian Rhapsody Queen Greatest Hits
>>>>  x1 Boehmian Rhapsodyyy Queen A Night at the Opera

>>>> CLUSTER:  x2
>>>>  x2 Boehmian Rhapsody Cover Banda Covers artisticos

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  1
>>>> CLUSTER:  2
>>>>  2 Hotel California Eagles Hotel California

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  2
>>>> CLUSTER:  3
>>>>  3 Imagine John Lennon Imagine
>>>>  6 Imagine John Lennon Imagine Deluxe

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  3
>>>> CLUSTER:  5
>>>>  5 Rolling in the Deep Adele 21

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  4
>>>> CLUSTER:  4
>>>>  

# Evaluation

In [16]:
from modules.Evaluator import Evaluator

# Flatten the per-block cluster mappings into a single mapping; on a repeated
# key the later block wins, as with successive dict.update calls.
all_clusters = {
  cluster_key: members
  for clusterized_block in clusterized_blocks
  for cluster_key, members in clusterized_block.items()
}

# Expected duplicate pairs, listed by TID.
gold_standard = [
  # Bohemian Rhapsody duplicates
  ['1', '7'], ['1', 'x1'], ['7', 'x1'],
  # Imagine
  ['3', '6'],
  # Shape of You
  ['4', '8'], ['4', 'y2'], ['8', 'y2'],
]

evaluator = Evaluator()
evaluator.calculate_metrics(all_clusters, gold_standard, 'TID')

report = evaluator.get_report()
print(report)

~~ EVALUATION ~~
  Precision: 1.0
  Recall: 1.0
  F-measure: 1.0



# Deduplicator (end-to-end pipeline)

In [17]:
from modules.Deduplicator import Deduplicator

# Arguments mirror the manual pipeline: 'title', the distance function,
# 'TID' and 0.35 (presumably blocking column, distance, id column, threshold).
deduplicator = Deduplicator('title', distance, 'TID', 0.35)

# Feed the base dataset first, then each incremental batch in order.
for batch in (df, df2, df3, df4):
  clusters = deduplicator.run(batch)

deduplicator.print_clusters_blocks()


CLUSTERS DO BLOCK  0
>>>> CLUSTER:  1
>>>>  1 Boêhmian Rhapsody Queen A Night at the Opera
>>>>  7 Bohemian Rhapsody Queen Greatest Hits
>>>>  x1 Boehmian Rhapsodyyy Queen A Night at the Opera

>>>> CLUSTER:  x2
>>>>  x2 Boehmian Rhapsody Cover Banda Covers artisticos

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  1
>>>> CLUSTER:  2
>>>>  2 Hotel California Eagles Hotel California

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  2
>>>> CLUSTER:  3
>>>>  3 Imagine John Lennon Imagine
>>>>  6 Imagine John Lennon Imagine Deluxe

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  3
>>>> CLUSTER:  5
>>>>  5 Rolling in the Deep Adele 21

----------------------------------------------------------------------------------------------------
CLUSTERS DO BLOCK  4
>>>> CLUSTER:  4
>>>>  

In [18]:
# Expected duplicate pairs by TID, now also covering the df4 record (z1).
gold_standard = [
  # Bohemian Rhapsody duplicates
  ['1', '7'], ['1', 'x1'], ['7', 'x1'],
  # Imagine
  ['3', '6'],
  # Shape of You
  ['4', '8'], ['4', 'y2'], ['8', 'y2'],
  # A raposa e as uvas
  ['z1', 'y1'],
]

# NOTE(review): the second argument looks like a verbose flag (pairs are
# printed before the report) — confirm against Deduplicator.evaluate.
deduplicator.evaluate(gold_standard, True)

[['1', '7'], ['1', 'x1'], ['7', 'x1']]
[['3', '6']]
[['4', '8'], ['4', 'y2'], ['8', 'y2']]
[['y1', 'z1']]
~~ EVALUATION ~~
  Precision: 1.0
  Recall: 1.0
  F-measure: 1.0

