In [1]:
import pandas as pd

filename = 'cd.csv'


def get_database(filename=filename):
  """Read the semicolon-delimited CSV file at ``filename`` into a DataFrame.

  Args:
    filename: path of the CSV file; defaults to the module-level ``filename``
      as it was when this function was defined.

  Returns:
    pandas.DataFrame with the parsed file contents.
  """
  # doublequote=False: two consecutive quote characters inside a field are
  # not collapsed into a single one.
  read_options = {'delimiter': ';', 'doublequote': False}
  # To experiment on a subset, slice the result, e.g. df.loc[0:1000]
  # (label-based slicing, end label inclusive).
  return pd.read_csv(filename, **read_options)

In [2]:
# Load the raw data using the default file name ('cd.csv').
df = get_database()

# df.info

In [3]:
import re 
import numpy as np

# Text columns that are compared when looking for duplicates.
STRING_COLUMNS = ['title','artist','track01']

def clean_strings(df):
  """Return a copy of ``df`` with missing values in STRING_COLUMNS set to ''.

  The input frame is left untouched, so the function can be called again on
  the same raw data and always yields the same result.

  Args:
    df: DataFrame containing every column listed in STRING_COLUMNS.

  Returns:
    A new DataFrame; all other columns are carried over unchanged.

  Raises:
    KeyError: if any of STRING_COLUMNS is missing from ``df``.
  """
  # Work on a copy: assigning into the caller's frame would silently change
  # data that other notebook cells may still be looking at.
  cleaned = df.copy()
  cleaned[STRING_COLUMNS] = cleaned[STRING_COLUMNS].fillna('')
  return cleaned

# We don't strip non-alphanumeric characters from the stored data, because
# the values would become unreadable when we inspect the results. Instead,
# this function is called only when the string distance is computed.
def remove_non_alphanum(string: str):
  """Strip every character that is not a letter or a digit from ``string``.

  Whitespace, punctuation and underscores are all removed; Unicode letters
  and digits are kept.

  Args:
    string: text to clean.

  Returns:
    The text with only alphanumeric characters left (possibly '').
  """
  # \W on its own keeps '_' because the underscore counts as a word
  # character in Python's re module, so it is listed explicitly.
  return re.sub(r'[\W_]+', '', string)

In [4]:
# Columns that play no part in the duplicate search: the metadata fields plus
# every track column except track01 (track02 .. track99).
UNUSED_COLUMNS = (
  ['id', 'category', 'genre', 'cdextra', 'year']
  + [f'track{number:02d}' for number in range(2, 100)]
)

def remove_unused_columns(df):
  """Return ``df`` without the columns listed in UNUSED_COLUMNS.

  Args:
    df: DataFrame that contains every column in UNUSED_COLUMNS.

  Returns:
    A new DataFrame holding only the remaining columns.

  Raises:
    KeyError: if one of the listed columns is not present in ``df``.
  """
  trimmed = df.drop(columns=UNUSED_COLUMNS)
  return trimmed

In [5]:
# Single entry point that chains all of the cleaning helpers defined above.
def clean_db(df):
  """Apply the full cleaning pipeline to ``df`` and return the result.

  Steps, in order: drop the columns listed in UNUSED_COLUMNS, then replace
  missing values in the string columns with ''.
  """
  # TODO : try not cleaning (much data in memory) to see how to improve operations
  return clean_strings(remove_unused_columns(df))

In [6]:
# STRING distance (title, album and artist...)
import Levenshtein as lev

def string_distance(string1: str, string2: str):
  """Normalized Levenshtein distance between two strings.

  Both inputs are first passed through remove_non_alphanum and lower-cased,
  so the comparison ignores case and the characters that helper strips.

  Args:
    string1: first text.
    string2: second text.

  Returns:
    0 when both cleaned strings are empty; otherwise the edit distance
    divided by the length of the longer cleaned string (0 = identical,
    1 = nothing in common).
  """
  cleaned = [remove_non_alphanum(text).lower() for text in (string1, string2)]

  longest = max(map(len, cleaned))
  if longest == 0:
    # Nothing left to compare after cleaning.
    return 0

  edits = lev.distance(*cleaned)
  # Similarity is the share of characters that did not need editing;
  # the distance is its complement.
  similarity = (longest - edits) / longest
  return 1 - similarity

In [7]:
# Relative importance of each field in the combined distance.
title_w = 1.2
artist_w = 1
track1_w = .8

def distance(item: pd.Series, item2: pd.Series):
  """Weighted average of the per-field string distances between two records.

  A field only takes part when it is non-empty in BOTH records; the weights
  of the participating fields are used for the average, so the result stays
  in [0, 1] no matter how many fields could be compared.

  Args:
    item: record exposing 'title', 'artist' and 'track01'.
    item2: record to compare against, same fields.

  Returns:
    Float in [0, 1]. 1.0 is returned when no field can be compared, so that
    records with no information in common are never treated as duplicates.
  """
  weighted_fields = (
    ('title', title_w),
    ('artist', artist_w),
    ('track01', track1_w),
  )

  weighted_total = 0.0
  w_sum = 0.0
  for field, weight in weighted_fields:
    # Skip fields that are empty on either side: they carry no evidence.
    if item[field] and item2[field]:
      weighted_total += weight * string_distance(item[field], item2[field])
      w_sum += weight

  if w_sum == 0:
    # Nothing comparable; avoid dividing by zero and report "far apart".
    return 1.0

  # The whole weighted sum is divided by the total weight (previously only
  # the track term was, because of operator precedence).
  return weighted_total / w_sum

## TEST

In [8]:
import sys
sys.path.append("..")
from modules.PhonexStaticBlocking import PhonexStaticBlocking

# Clean the raw frame. NOTE: this rebinds `df`, so re-running the cell without
# reloading the data raises a KeyError (the unused columns are already gone).
df = clean_db(df)

# Number of rows after cleaning.
print(len(df))

# Split the records into blocks based on the 'title' column.
# NOTE(review): the third argument (5) looks like a block size — confirm
# against modules/PhonexStaticBlocking.
blocker = PhonexStaticBlocking(df, 'title', 5) 
blocks = blocker.get_blocks()

# Number of blocks produced.
print(len(blocks))


9763
1953


In [9]:
from modules.CustomKmeans import CustomKmeans 

# Cluster every block separately and count how many records look like
# duplicates of another record in the same cluster.
duplicates = 0
for i, block in enumerate(blocks):
  print('BLOCK ', i)

  # A new clusterer is created for each block, using the weighted string
  # distance defined above and the 'pk' column as record identifier.
  customKmeans = CustomKmeans(distanceFn=distance, uID='pk', threshold=0.35)
  clusters = customKmeans.run(block)

  for cluster_key in clusters.keys():
    print('CLUSTER: ' , cluster_key)
    members = clusters[cluster_key]

    # Every member beyond the first one counts as a duplicate.
    duplicates += max(len(members) - 1, 0)

    # Plain iteration: the index was never used and it used to overwrite the
    # outer block counter `i`.
    for item in members:
      print(item['pk'], ' - ', item['title'], ' - ',item['artist'], ' - ', item['track01'], )
  print('\n~~~~\n')

  # Running total of duplicates found so far (printed after each block).
  print(duplicates)

BLOCK  0
CLUSTER:  106087
106087  -    -  Squab Teen  -  Entrada A La Salida
CLUSTER:  104653
104653  -  ][  -  no more lies  -  stone, noise, broken glasses
CLUSTER:  100270
100270  -  2001  -  Dara Bubamara  -  01dvojnica
CLUSTER:  106359
106359  -  5  -  Lenny Kravitz  -  Live
CLUSTER:  105735
105735  -  X  -  Robert Gawlinski  -  dlugi spacer

~~~~

0
BLOCK  1
CLUSTER:  100067
100067  -  8th  -  ¿©Çà_½ºÄÉÄ¡  -  01_¿ØÁö_´À³¦ÀÌ_ÁÁ¾Æ
CLUSTER:  106790
106790  -  97-99  -  Division of Laura Lee  -  Guess My Name
CLUSTER:  101287
101287  -    -  Spiel Acid Jazz Band  -  Beep Beep Be Bop!
CLUSTER:  106846
106846  -  3000  -  Ñìûñëîâûå Ãàëëþöèíàöèè  -  Âå÷íî ìîëîäîé
CLUSTER:  108531
108531  -  '97  -  Sailor  -  Girls Girls Girls

~~~~

0
BLOCK  2
CLUSTER:  103320
103320  -  3.4.1  -  Wiches  -  Horror Museum
CLUSTER:  107195
107195  -  89-94  -  Sing-Sing  -  Életfogytig Rock&Roll
CLUSTER:  102997
102997  -    -  Sexteto Tango  -  Sexteto Tango - Instrumental / A Media Luz
CLUSTER:  10332