In [1]:
import pandas as pd

filename = 'cd.csv'

def get_database(filename=filename):
  df = pd.read_csv(filename, delimiter=';', doublequote=False)
  # df = df.loc[0:1000]  # Slices rows from start_row_label to end_row_label (inclusive)
  return df

In [2]:
df = get_database()

# df.info

In [3]:
import re 
import numpy as np

STRING_COLUMNS = ['title','artist','track01']

# replace NaN with '' on STRING_COLUMNS
def clean_strings(df):
  df[STRING_COLUMNS] = df[STRING_COLUMNS].replace(np.nan, '')
  return df

# We don't remove non alphanumerics now, because we wouldn't be able to 
# undestand anything when we analyze the data. So we call this function 
# when we calculate the strings_distance
def remove_non_alphanum(string: str):
  return re.sub(r'\W+', '', string)

In [4]:
UNUSED_COLUMNS = ['id',	"category","genre","cdextra","year","track02","track03","track04","track05","track06","track07","track08","track09","track10","track11","track12","track13","track14","track15","track16","track17","track18","track19","track20","track21","track22","track23","track24","track25","track26","track27","track28","track29","track30","track31","track32","track33","track34","track35","track36","track37","track38","track39","track40","track41","track42","track43","track44","track45","track46","track47","track48","track49","track50","track51","track52","track53","track54","track55","track56","track57","track58","track59","track60","track61","track62","track63","track64","track65","track66","track67","track68","track69","track70","track71","track72","track73","track74","track75","track76","track77","track78","track79","track80","track81","track82","track83","track84","track85","track86","track87","track88","track89","track90","track91","track92","track93","track94","track95","track96","track97","track98","track99"]
def remove_unused_columns(df):
  return df.drop(columns = UNUSED_COLUMNS)

In [5]:
# a single function that calls all the above clean functions
def clean_db(df):
  df = remove_unused_columns(df) # TODO : try not cleaning (much data in memory) to see how to improve operations
  df = clean_strings(df)
  return df

In [6]:
# STRING distance (title, album and artist...)
import Levenshtein as lev

def string_distance(string1: str, string2: str):
  x = remove_non_alphanum(string1).lower()
  y = remove_non_alphanum(string2).lower()
  
  dist = lev.distance(x, y) 

  max_len = max(len(x), len(y))
  if (max_len == 0):
    return 0
  normalized = (max_len-dist) / max_len 
  normalized_dist = 1-normalized
  return normalized_dist

In [7]:
title_w = 1.2
artist_w = 1
track1_w = .8

def distance(item: pd.Series, item2: pd.Series): 
  w_sum = 0

  title_dist = 0
  artist_dist = 0
  track1_dist = 0
  
  if (item['title'] and item2['title']):
    title_dist = string_distance(item['title'], item2['title'])
    w_sum += title_w

  if (item['artist'] and item2['artist']):
    artist_dist = string_distance(item['artist'], item2['artist'])
    w_sum += artist_w

  if (item['track01'] and item2['track01']):
    track1_dist = string_distance(item['track01'], item2['track01'])
    w_sum += track1_w

  dist = ( (title_dist*title_w) + (artist_dist*artist_w) ) + (track1_dist*track1_w) / w_sum
  return dist

## TEST

In [8]:
import sys
sys.path.append("..")
from modules.PhonexStaticBlocking import PhonexStaticBlocking
from modules.SoundexBlocking import SoundexBlocking


df = clean_db(df)

print(len(df))

# blocker = PhonexStaticBlocking(df, 'title', 5) 
blocker = SoundexBlocking('title') 
# blocks = blocker.get_blocks()
blocks = blocker.generate_blocks(df)   

print(len(blocks))


9763
2628


In [21]:
from modules.CustomKmeans import CustomKmeans 

duplicates = 0
clusterized_blocks = []
all_clusters = {}

customKmeans = CustomKmeans(distanceFn=distance, uID='pk', threshold=0.25)
for i, block in enumerate(blocks):
  print('BLOCK ', i)

  clusters = customKmeans.run(block)
  clusterized_blocks.append(clusters)
  all_clusters.update(clusters)
  # for cluster_key in clusters.keys():
  #   print('CLUSTER: ' , cluster_key)

  #   if (len(clusters[cluster_key]) > 1):
  #     duplicates += len(clusters[cluster_key])-1
  #   for i, item in enumerate(clusters[cluster_key]):
  #     print(item['pk'], ' - ', item['title'], ' - ',item['artist'], ' - ', item['track01'], )
  # print('\n~~~~\n')

  # print(duplicates)

BLOCK  0
BLOCK  1
BLOCK  2
BLOCK  3
BLOCK  4
BLOCK  5
BLOCK  6
BLOCK  7
BLOCK  8
BLOCK  9
BLOCK  10
BLOCK  11
BLOCK  12
BLOCK  13
BLOCK  14
BLOCK  15
BLOCK  16
BLOCK  17
BLOCK  18
BLOCK  19
BLOCK  20
BLOCK  21
BLOCK  22
BLOCK  23
BLOCK  24
BLOCK  25
BLOCK  26
BLOCK  27
BLOCK  28
BLOCK  29
BLOCK  30
BLOCK  31
BLOCK  32
BLOCK  33
BLOCK  34
BLOCK  35
BLOCK  36
BLOCK  37
BLOCK  38
BLOCK  39
BLOCK  40
BLOCK  41
BLOCK  42
BLOCK  43
BLOCK  44
BLOCK  45
BLOCK  46
BLOCK  47
BLOCK  48
BLOCK  49
BLOCK  50
BLOCK  51
BLOCK  52
BLOCK  53
BLOCK  54
BLOCK  55
BLOCK  56
BLOCK  57
BLOCK  58
BLOCK  59
BLOCK  60
BLOCK  61
BLOCK  62
BLOCK  63
BLOCK  64
BLOCK  65
BLOCK  66
BLOCK  67
BLOCK  68
BLOCK  69
BLOCK  70
BLOCK  71
BLOCK  72
BLOCK  73
BLOCK  74
BLOCK  75
BLOCK  76
BLOCK  77
BLOCK  78
BLOCK  79
BLOCK  80
BLOCK  81
BLOCK  82
BLOCK  83
BLOCK  84
BLOCK  85
BLOCK  86
BLOCK  87
BLOCK  88
BLOCK  89
BLOCK  90
BLOCK  91
BLOCK  92
BLOCK  93
BLOCK  94
BLOCK  95
BLOCK  96
BLOCK  97
BLOCK  98
BLOCK  99
BLOCK  100

In [22]:
cluster_example = clusterized_blocks[1]
for cluster_key in cluster_example.keys():
    print('>>>> CLUSTER: ' , cluster_key)
    for i, item in enumerate(cluster_example[cluster_key]):
      print('>>>> ',item['pk'], item['title'], item['artist'],  item['track01'], )
    print()

>>>> CLUSTER:  2004
>>>>  2004 Áí ¹ìïõíá Ðáëéüðáéäï Ãéþñãïò Ôóáëßêçò Ìáæß óïõ êáé óôçí êüëáóç
>>>>  3710 Áí Þìïõíá ðáëéüðáéäï Ãéþñãïò Ôóáëßêçò Ìáæß óïõ êáé óôçí êüëáóç

>>>> CLUSTER:  100003
>>>>  100003 æƒ³ã?„å‡ºã?®ã?‹ã?‘ã‚‰ ä¸­æ?‘é›…ä¿Š ç›†å¸°ã‚Š

>>>> CLUSTER:  100525
>>>>  100525 æœˆã?®è?• é»’ç™¾å?ˆå§‰å¦¹ Num Kom, der Heiden Heiland

>>>> CLUSTER:  100726
>>>>  100726 æ?±äº¬ãƒ•ãƒ«ãƒ¼ãƒ„ ã?Ÿã?¾ å®‰å¿ƒ

>>>> CLUSTER:  101222
>>>>  101222 ãƒ•ãƒ«ãƒ ãƒ¼ãƒ³ãƒ»ã‚¢ãƒ³ãƒ‰ãƒ»ã‚¶ãƒ»ã‚·ãƒ¥ãƒ©ã‚¤ãƒ³ æ?¾å±…æ…¶å­? Night Hawkâ€™s Dream

>>>> CLUSTER:  101549
>>>>  101549 ãƒ™ã‚¹ãƒˆï¼†ãƒ™ã‚¹ãƒˆ ã?•ã? ã?¾ã?•ã?— ç²¾éœŠæµ?ã?—

>>>> CLUSTER:  101618
>>>>  101618 æ­Œã?§ã?—ã?‹è¨€ã?ˆã?ªã?„ ä¸­å³¶ã?¿ã‚†ã?? C.Q.

>>>> CLUSTER:  102325
>>>>  102325 àÊé¹¢Íº¿éÒ ¡Ñ¹µÐ ¡ÑÅÂì¨ÒÄ¡ 01 - ãËé¡Ñ¹ä´éäËÁà¸Í

>>>> CLUSTER:  102499
>>>>  102499 Â÷§O¤§«e ·¨¨¼©g Â÷§O¤§«e

>>>> CLUSTER:  102523
>>>>  102523 å½¼æ–¹ã?‹ã‚‰ã?®é¢¨ æ?‰å±±æ¸…è²´ å½¼æ–¹ã?‹ã‚‰ã?®é¢¨

>>>> CLUSTER:  102537
>>>>  102537 å¤?ã?®ã?¬ã?‘ã?Œã‚‰ çœŸå³¶æ˜Œåˆ© å

In [23]:
gold_standard_df = pd.read_csv('cd_gold.csv', delimiter=';')

gold_standard_pairs = gold_standard_df.values.tolist()
gold_standard_pairs

[[10230, 10383],
 [10253, 10368],
 [10578, 10628],
 [10714, 10715],
 [10785, 10833],
 [10938, 11125],
 [10983, 11174],
 [11006, 11008],
 [11088, 11190],
 [11423, 11538],
 [1818, 7075],
 [1823, 9257],
 [1828, 2110],
 [1838, 7934],
 [1858, 9637],
 [1866, 2530],
 [1884, 9542],
 [2004, 3710],
 [2026, 4601],
 [2139, 3296],
 [2139, 4271],
 [2187, 2418],
 [2188, 8063],
 [2252, 4733],
 [2317, 4988],
 [2366, 2367],
 [2406, 7291],
 [2427, 10314],
 [2427, 8640],
 [2440, 5443],
 [2571, 9019],
 [2577, 2581],
 [2619, 4520],
 [2619, 6174],
 [2707, 10414],
 [2771, 5458],
 [2781, 9669],
 [2807, 7064],
 [2813, 3155],
 [2819, 10064],
 [2823, 4684],
 [2890, 2891],
 [2950, 8135],
 [2957, 3106],
 [3086, 3215],
 [3096, 3123],
 [3125, 4140],
 [3134, 11531],
 [3134, 6210],
 [3159, 9969],
 [3185, 4631],
 [3192, 7923],
 [3222, 4451],
 [3296, 4271],
 [3356, 6955],
 [3383, 6969],
 [3395, 4563],
 [3395, 5122],
 [3475, 8592],
 [3517, 10061],
 [3529, 4114],
 [3539, 4190],
 [3670, 4441],
 [3670, 7515],
 [3670, 7769],


In [25]:
from modules.Evaluator import Evaluator

evaluator = Evaluator()
evaluator.calculate_metrics(all_clusters, gold_standard_pairs, 'pk')

print(evaluator.get_report())

[[2004, 3710]]
[[3529, 4114]]
[[3539, 4190]]
[[4613, 6116]]
[[6339, 7517]]
[[7061, 7469]]
[[2406, 7291]]
[[2440, 5443]]
[[4292, 8795]]
[[5426, 6761], [5426, 7869], [5426, 10361], [6761, 7869], [6761, 10361], [7869, 10361]]
[[8668, 8843]]
[[7918, 9172]]
[[4184, 6950]]
[[7361, 7750]]
[[8311, 8312]]
[[5577, 8990]]
[[4395, 8725]]
[[4843, 8794]]
[[10983, 11174]]
[[3356, 6955]]
[[7008, 8831]]
[[6382, 7920], [6382, 8494], [7920, 8494]]
[[4291, 4905]]
[[7346, 7719]]
[[5731, 8779]]
[[2781, 9669]]
[[9333, 9450]]
[[2577, 2581]]
[[2188, 8063]]
[[8224, 8754]]
[[9640, 10652]]
[[5477, 8368], [5477, 9076], [8368, 9076]]
[[4588, 4774]]
[[5360, 8430]]
[[4384, 5657]]
[[4282, 7549]]
[[7123, 9038]]
[[6568, 9557]]
[[5912, 8949]]
[[7898, 8422]]
[[100001, 100010]]
[[4601, 8410]]
[[4709, 4710]]
[[6858, 7852], [6858, 8029], [6858, 10230], [6858, 10383], [7852, 8029], [7852, 10230], [7852, 10383], [8029, 10230], [8029, 10383], [10230, 10383]]
[[102361, 103634]]
[[6925, 6926]]
[[5835, 9278]]
[[3475, 8592]]
[[6162