In [1]:
import pandas as pd

In [2]:
# STRING distance (title, album and artist...)
import Levenshtein as lev

def string_distance(string1: str, string2: str):
  """Return the edit distance between two strings, scaled by the longer length.

  The raw distance comes from ``lev.distance`` and is normalised against the
  length of the longer input, so identical strings give 0 and larger values
  mean the strings differ more. Two empty strings give 0.

  The inputs are compared exactly as given: no lower-casing and no removal of
  non-alphanumeric characters is applied.
  """
  edits = lev.distance(string1, string2)

  longest = max(len(string1), len(string2))
  if longest == 0:
    # Both inputs are empty: nothing differs, and there is nothing to divide by.
    return 0

  # Deliberately computed as "1 - similarity" rather than edits / longest so the
  # floating-point result stays exactly what callers have been receiving.
  similarity = (longest - edits) / longest
  return 1 - similarity

In [3]:
# Relative weight of each field in the combined record distance.
title_w = 1.2
album_w = 1
artist_w = 1
# Declared but not read by distance() below.
number_track_w = .8
year_w = .5


def _has_text(value) -> bool:
  """Return True only for a non-empty string (rejects None, NaN and '')."""
  return isinstance(value, str) and value != ''


def distance(item: pd.Series, item2: pd.Series):
  """Weighted average of the per-field string distances between two records.

  The 'title', 'album' and 'artist' fields are each compared with
  ``string_distance`` and averaged using ``title_w``, ``album_w`` and
  ``artist_w``. A field contributes only when BOTH records hold a non-empty
  string for it, so missing values do not drag the average in either
  direction.

  Args:
    item: first record; must support ``item['title']``, ``item['album']``
      and ``item['artist']``.
    item2: second record, same layout as ``item``.

  Returns:
    The weighted mean distance over the comparable fields, or ``1.0`` when
    no field is comparable on both sides.
  """
  weighted_sum = 0
  w_sum = 0

  for field, weight in (('title', title_w), ('album', album_w), ('artist', artist_w)):
    left, right = item[field], item2[field]
    # NaN is truthy, so a plain truthiness test would let missing pandas
    # values through to string_distance(), which then fails on len().
    if _has_text(left) and _has_text(right):
      weighted_sum += string_distance(left, right) * weight
      w_sum += weight

  if w_sum == 0:
    # Nothing to compare: report the largest possible distance instead of
    # dividing by zero, so such records are never considered similar.
    return 1.0

  return weighted_sum / w_sum

In [4]:
# Toy music catalogue fed to the blocking and clustering cells.
# The near-duplicate spellings (TIDs 1/7 and 4/8) look intentional: they are the
# records the clustering output groups together, so do not "fix" the typos.
# Written one record per row; the inner zip() transposes the rows into the
# column -> values mapping that pd.DataFrame receives.
data = {
    column: list(values)
    for column, values in zip(
        ('TID', 'title', 'artist', 'album'),
        zip(
            ('1', 'Boehmian Rhapsody', 'Queen', 'A Night at the Opera'),
            ('2', 'Hotel California', 'Eagles', 'Hotel California'),
            ('3', 'Imagine', 'John Lennon', 'Imagine'),
            ('4', 'Shape of You', 'Ed Sheran', '÷'),
            ('5', 'Rolling in the Deep', 'Adele', '21'),
            ('6', 'Imagine', 'John Lennon', 'Imagine Deluxe'),
            ('7', 'Bohemian Rhapsody', 'Queen', 'Greatest Hits'),
            ('8', 'Shape of You', 'Ed Sheeran', '÷'),
        ),
    )
}
df = pd.DataFrame(data)
print(df)

  TID                title       artist                 album
0   1    Boehmian Rhapsody        Queen  A Night at the Opera
1   2     Hotel California       Eagles      Hotel California
2   3              Imagine  John Lennon               Imagine
3   4         Shape of You    Ed Sheran                     ÷
4   5  Rolling in the Deep        Adele                    21
5   6              Imagine  John Lennon        Imagine Deluxe
6   7    Bohemian Rhapsody        Queen         Greatest Hits
7   8         Shape of You   Ed Sheeran                     ÷


In [5]:

from modules.PhonexStaticBlocking import PhonexStaticBlocking

# Build the blocker over `df`, using the 'title' column.
# NOTE(review): the meaning of the third argument (10) is not visible in this
# notebook -- presumably a size/window setting; confirm in modules/PhonexStaticBlocking.
blocker = PhonexStaticBlocking(df, 'title', 10)
# `blocks` is iterated by the cells that follow, one block at a time.
blocks = blocker.get_blocks()


In [6]:
# Dump every block, leaving one blank line after each.
for block in blocks:
    print(block, end='\n\n')

  TID                title       artist                 album  block_key
3   4         Shape of You    Ed Sheran                     ÷   0.210204
7   8         Shape of You   Ed Sheeran                     ÷   0.210204
0   1    Boehmian Rhapsody        Queen  A Night at the Opera   0.284223
6   7    Bohemian Rhapsody        Queen         Greatest Hits   0.284223
1   2     Hotel California       Eagles      Hotel California   0.392022
2   3              Imagine  John Lennon               Imagine   0.435137
5   6              Imagine  John Lennon        Imagine Deluxe   0.435137
4   5  Rolling in the Deep        Adele                    21   0.664270



In [7]:
from modules.CustomKmeans import CustomKmeans 

# Cluster each block with the weighted record distance defined above.
# NOTE(review): threshold=0.35 is presumably the largest distance at which a
# record still joins an existing cluster -- confirm in modules/CustomKmeans.
customKmeans = CustomKmeans(distanceFn=distance, uID='TID', threshold=0.35)

for block_index, block in enumerate(blocks):
  print('CLUSTERIZAÇÃO DO BLOCK ', block_index)

  clusters, blocks_keys = customKmeans.run(block)
  print(blocks_keys)
  # Iterate the members directly: the previous inner loop reused `i` as an
  # (unused) enumerate index and silently overwrote the block counter.
  for cluster_key, members in clusters.items():
    print(' CLUSTER: ' , cluster_key)
    for item in members:
      print(' ',item['TID'], item['title'], item['artist'],  item['album'], )
    print()
  print('-----------------------------')


CLUSTERIZAÇÃO DO BLOCK  0
(0.21020423877959135, 0.6642703960935376)
 CLUSTER:  4
  4 Shape of You Ed Sheran ÷
  8 Shape of You Ed Sheeran ÷

 CLUSTER:  1
  1 Boehmian Rhapsody Queen A Night at the Opera
  7 Bohemian Rhapsody Queen Greatest Hits

 CLUSTER:  2
  2 Hotel California Eagles Hotel California

 CLUSTER:  3
  3 Imagine John Lennon Imagine
  6 Imagine John Lennon Imagine Deluxe

 CLUSTER:  5
  5 Rolling in the Deep Adele 21

-----------------------------


In [8]:
# Two extra records used to exercise clustering on newly arriving data.
# Written one record per row; the inner zip() transposes the rows into the
# column -> values mapping that pd.DataFrame receives.
data = {
    column: list(values)
    for column, values in zip(
        ('TID', 'title', 'artist', 'album'),
        zip(
            ('x1', 'Boehmian Rhapsodyyy', 'Queen', 'A Night at the Opera'),
            ('x2', 'Sabreu Pega no Breu', 'Mandioca', 'Duo 2010'),
        ),
    )
}
df_incremental = pd.DataFrame(data)

In [9]:
# Run the already-used CustomKmeans instance on the new records; the printed
# result lists the earlier clusters together with the new records.
clusters, blocks_keys = customKmeans.run(df_incremental)

print(blocks_keys)
# Iterate the members directly; the enumerate index was never used.
for cluster_key, members in clusters.items():
  print(' CLUSTER: ' , cluster_key)
  for item in members:
    print(' ',item['TID'], item['title'], item['artist'],  item['album'], )
  print()


(0.21020423877959135, 0.6642703960935376)
 CLUSTER:  4
  4 Shape of You Ed Sheran ÷
  8 Shape of You Ed Sheeran ÷

 CLUSTER:  1
  1 Boehmian Rhapsody Queen A Night at the Opera
  7 Bohemian Rhapsody Queen Greatest Hits
  x1 Boehmian Rhapsodyyy Queen A Night at the Opera

 CLUSTER:  2
  2 Hotel California Eagles Hotel California

 CLUSTER:  3
  3 Imagine John Lennon Imagine
  6 Imagine John Lennon Imagine Deluxe

 CLUSTER:  5
  5 Rolling in the Deep Adele 21

 CLUSTER:  x2
  x2 Sabreu Pega no Breu Mandioca Duo 2010

