In [3]:
import pandas as pd

# Distance function

In [21]:
# STRING distance (title, album and artist...)
import Levenshtein as lev

def string_distance(string1: str, string2: str) -> float:
  """Return the length-normalized Levenshtein distance between two strings.

  The raw edit distance is divided by the length of the longer string, so the
  result lies in [0.0, 1.0]: 0.0 for identical strings, 1.0 when the edit
  distance equals the length of the longer string. The inputs are compared
  exactly as given -- no case folding or character stripping is applied.

  Args:
    string1: First string to compare.
    string2: Second string to compare.

  Returns:
    The normalized distance as a float; 0.0 when both strings are empty.
  """
  max_len = max(len(string1), len(string2))
  if max_len == 0:
    # Two empty strings are identical; this also avoids dividing by zero.
    return 0.0

  # Divide directly instead of computing 1 - (max_len - dist) / max_len: both
  # are algebraically equal, but the round trip adds float rounding error
  # (e.g. 0.19999999999999996 instead of 0.2 for dist=1, max_len=5).
  return lev.distance(string1, string2) / max_len

In [22]:
# Relative weight of each field in the combined record distance.
title_w = 1.2
album_w = 1
artist_w = 1
# Not used by `distance` below.
number_track_w = .8
year_w = .5

def _is_comparable(value) -> bool:
  """Return True when `value` is a non-empty string."""
  # pandas missing values (NaN) are truthy floats, so a plain truthiness check
  # would let them through to the string comparison and crash it.
  return isinstance(value, str) and value != ''

def distance(item: pd.Series, item2: pd.Series) -> float:
  """Return the weighted average string distance between two records.

  The 'title', 'album' and 'artist' fields are compared with
  `string_distance` and averaged using `title_w`, `album_w` and `artist_w`.
  A field takes part only when it is a non-empty string in BOTH records; a
  field that is empty, None or NaN on either side is left out of both the
  weighted sum and the weight total.

  Args:
    item: First record; must provide 'title', 'album' and 'artist'.
    item2: Second record, with the same fields.

  Returns:
    The weighted mean of the per-field distances. When no field can be
    compared, 1.0 is returned: with no evidence of similarity the records are
    treated as maximally distant rather than raising ZeroDivisionError.

  Raises:
    KeyError: If either record lacks one of the three fields.
  """
  weighted_fields = (('title', title_w), ('album', album_w), ('artist', artist_w))

  weighted_sum = 0
  w_sum = 0
  for field, weight in weighted_fields:
    left, right = item[field], item2[field]
    if _is_comparable(left) and _is_comparable(right):
      weighted_sum += string_distance(left, right) * weight
      w_sum += weight

  if w_sum == 0:
    # Nothing was comparable, so there is no weight to normalize by.
    return 1.0
  return weighted_sum / w_sum

# Data creation


In [1]:
# Toy catalogue, one track per row. It contains deliberate near-duplicates
# (e.g. 'Ed Sheran' / 'Ed Sheeran', 'Boêhmian' / 'Bohemian') to exercise the
# blocking and clustering steps.
track_columns = ('TID', 'title', 'artist', 'album')
track_rows = [
    ('1', 'Boêhmian Rhapsody', 'Queen', 'A Night at the Opera'),
    ('2', 'Hotel California', 'Eagles', 'Hotel California'),
    ('3', 'Imagine', 'John Lennon', 'Imagine'),
    ('4', 'Shape of You', 'Ed Sheran', '÷'),
    ('5', 'Rolling in the Deep', 'Adele', '21'),
    ('6', 'Imagine', 'John Lennon', 'Imagine Deluxe'),
    ('7', 'Bohemian Rhapsody', 'Queen', 'Greatest Hits'),
    ('8', 'Shape of You', 'Ed Sheeran', '÷'),
]

# Transpose the rows into the column -> list-of-values mapping used later on.
data = {column: list(values) for column, values in zip(track_columns, zip(*track_rows))}
df = pd.DataFrame(data)
print(df)

NameError: name 'pd' is not defined

# Blocking

In [24]:

from modules.PhonexStaticBlocking import PhonexStaticBlocking

# Build a blocker over `df` keyed on the 'title' column.
# NOTE(review): the meaning of the third argument (4) is not visible in this
# notebook -- TODO confirm against modules.PhonexStaticBlocking.
blocker = PhonexStaticBlocking(df, 'title', 4)
# `blocks` is the collection iterated by the printing and clustering cells.
blocks = blocker.get_blocks()


In [25]:
# Show every block, each followed by one blank line.
for block in blocks:
    print(block, end='\n\n')

  TID              title      artist                 album  block_key
3   4       Shape of You   Ed Sheran                     ÷   0.210204
7   8       Shape of You  Ed Sheeran                     ÷   0.210204
0   1  Boehmian Rhapsody       Queen  A Night at the Opera   0.284223
6   7  Bohemian Rhapsody       Queen         Greatest Hits   0.284223

  TID                title       artist             album  block_key
1   2     Hotel California       Eagles  Hotel California   0.392022
2   3              Imagine  John Lennon           Imagine   0.435137
5   6              Imagine  John Lennon    Imagine Deluxe   0.435137
4   5  Rolling in the Deep        Adele                21   0.664270



# Clustering

In [26]:
from modules.CustomKmeans import CustomKmeans 

# One entry per block, in block order: the clusters found for that block.
clusterized_blocks = []
# NOTE(review): `threshold` is presumably the maximum `distance` at which two
# records still share a cluster -- confirm against modules.CustomKmeans.
customKmeans = CustomKmeans(distanceFn=distance, uID='TID', threshold=0.35)

for block_index, block in enumerate(blocks):
  print('CLUSTERIZAÇÃO DO BLOCK ', block_index)

  clusters = customKmeans.run(block)
  clusterized_blocks.append(clusters)

  for cluster_key in clusters.keys():
    print(' CLUSTER: ', cluster_key)
    # Iterate the members directly: the previous enumerate() index was unused
    # and shadowed the outer block index.
    for item in clusters[cluster_key]:
      print(' ', item['TID'], item['title'], item['artist'], item['album'])
    print()
  print('-----------------------------')


CLUSTERIZAÇÃO DO BLOCK  0
 CLUSTER:  4
  4 Shape of You Ed Sheran ÷
  8 Shape of You Ed Sheeran ÷

 CLUSTER:  1
  1 Boehmian Rhapsody Queen A Night at the Opera
  7 Bohemian Rhapsody Queen Greatest Hits

-----------------------------
CLUSTERIZAÇÃO DO BLOCK  1
 CLUSTER:  2
  2 Hotel California Eagles Hotel California

 CLUSTER:  3
  3 Imagine John Lennon Imagine
  6 Imagine John Lennon Imagine Deluxe

 CLUSTER:  5
  5 Rolling in the Deep Adele 21

-----------------------------


# Incremental Blocking

In [7]:
import unicodedata

x = 'Façanhú'

# Strip diacritics: NFKD splits each accented letter into its base letter plus
# a combining mark, and encoding to ASCII with errors='ignore' drops the marks.
x_ascii = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('ascii')
x_ascii



'Facanhu'

In [5]:
from modules.SoundexBlocking import SoundexBlocking

# Separate frame built from the same `data` dict, used only for this blocker.
df_block2 = pd.DataFrame(data)

# NOTE: this rebinds `blocker` and `blocks`, replacing whatever they held
# before this cell ran.
blocker = SoundexBlocking(df_block2, 'title')
blocks = blocker.get_blocks()

# Show every block, each followed by one blank line.
for block in blocks:
  print(block, end='\n\n')

  TID              title artist                 album blocking_key
0   1  Boehmian Rhapsody  Queen  A Night at the Opera         B561
6   7  Bohemian Rhapsody  Queen         Greatest Hits         B561

  TID             title  artist             album blocking_key
1   2  Hotel California  Eagles  Hotel California         H342

  TID    title       artist           album blocking_key
2   3  Imagine  John Lennon         Imagine         I525
5   6  Imagine  John Lennon  Imagine Deluxe         I525

  TID                title artist album blocking_key
4   5  Rolling in the Deep  Adele    21         R452

  TID         title      artist album blocking_key
3   4  Shape of You   Ed Sheran     ÷         S100
7   8  Shape of You  Ed Sheeran     ÷         S100



# Incremental Clustering

In [28]:
# Records that arrive after the initial clustering, one track per row.
incremental_columns = ('TID', 'title', 'artist', 'album')
incremental_rows = [
    ('x1', 'Boehmian Rhapsodyyy', 'Queen', 'A Night at the Opera'),
    ('x2', 'Boehmian Rhapsody', 'Cover Banda', 'Covers artisticos'),
    ('x3', 'Sabreu Pega no Breu', 'Mandioca', 'Duo 2010'),
]

# Transpose the rows into a column -> list-of-values mapping.
data_incremental = {
    column: list(values)
    for column, values in zip(incremental_columns, zip(*incremental_rows))
}
df_incremental = pd.DataFrame(data_incremental)

In [29]:
# Cluster the new records, passing the clusters already found for block 0 as
# the second argument.
# NOTE(review): whether `run` mutates `clusterized_blocks[0]` is not visible
# here -- confirm before re-running this cell on the same kernel.
clusters = customKmeans.run(df_incremental, clusterized_blocks[0])

for cluster_key in clusters.keys():
  print(' CLUSTER: ', cluster_key)
  # Iterate the members directly; the enumerate() index was never used.
  for item in clusters[cluster_key]:
    print(' ', item['TID'], item['title'], item['artist'], item['album'])
  print()


 CLUSTER:  4
  4 Shape of You Ed Sheran ÷
  8 Shape of You Ed Sheeran ÷

 CLUSTER:  1
  1 Boehmian Rhapsody Queen A Night at the Opera
  7 Bohemian Rhapsody Queen Greatest Hits
  x1 Boehmian Rhapsodyyy Queen A Night at the Opera

 CLUSTER:  x2
  x2 Boehmian Rhapsody Cover Banda Covers artisticos

 CLUSTER:  x3
  x3 Sabreu Pega no Breu Mandioca Duo 2010

