<a href="https://colab.research.google.com/github/Gust4voSales/duplicates-identification/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LOAD

In [1]:
import numpy as np
import pandas as pd

filename = 'Music_Brainz_20K.csv'

def get_database(filename=filename):
  """Load the CSV file at `filename` into a pandas DataFrame.

  The default is whatever the module-level `filename` held when this
  function was defined.
  """
  # To experiment on a subset, slice the frame before returning it,
  # e.g. df.loc[0:1000] (label-based, both ends inclusive).
  return pd.read_csv(filename)

# CLEAN DATABASE

## Clean functions

In [2]:
# clean strings
import re 

STRING_COLUMNS = ['title', 'artist', 'album']

def clean_strings(df):
  """Replace NaN with '' in every STRING_COLUMNS column.

  The frame is modified in place and also returned, so the call can be
  used in a pipeline.
  """
  text_columns = df[STRING_COLUMNS]
  df[STRING_COLUMNS] = text_columns.replace(np.nan, '')
  return df

# Non-alphanumeric characters are NOT stripped during cleaning: doing so
# would make the data unreadable when we inspect it. Instead, this helper
# is applied on the fly when string distances are computed.
def remove_non_alphanum(string: str):
  """Return `string` with every run of non-word characters (regex \\W) removed."""
  word_pieces = re.split(r'\W+', string)
  return ''.join(word_pieces)

In [3]:
# clean years

def convert_year(year):
    """Convert a digit string into an integer year.

    Four-character strings are parsed as-is. Two-character strings are
    expanded with a pivot at 50: values below 50 become 20xx, the rest
    become 19xx. Any other length yields 0.
    """
    size = len(year)
    if size == 4:
        return int(year)
    if size != 2:
        return 0

    century = '20' if int(year) < 50 else '19'
    return int(century + year)

def clean_years(df):
  """Normalize the 'year' column to integers, in place, and return the frame.

  Steps, in order: NaN becomes the string '0'; every non-digit character is
  stripped (e.g. "'02" -> "02"); the remaining digits go through
  convert_year.
  """
  df['year'] = (
    df['year']
    .replace(np.nan, '0')
    .str.replace(r'\D+', '', regex=True)
    .apply(convert_year)
  )
  return df

In [4]:
# remove unnecessary columns

UNUSED_COLUMNS = [
  'CTID',
  'id',
  'SourceID',
  'length',
  'language',
]

def remove_unused_columns(df):
  """Return a copy of `df` without the UNUSED_COLUMNS; `df` itself is untouched."""
  return df.drop(UNUSED_COLUMNS, axis=1)

## Clean everything

In [5]:
# a single function that calls all the above clean functions
def clean_db(df):
  """Run the full cleaning pipeline and return the cleaned frame.

  Order: drop unused columns, blank out NaN strings, normalize years.
  """
  cleaned = df
  for step in (remove_unused_columns, clean_strings, clean_years):
    cleaned = step(cleaned)
  return cleaned

# DISTANCE FUNCTIONS

## Individual attribute distance functions

In [6]:
# STRING distance (title, album and artist...)
import Levenshtein as lev

def string_distance(string1: str, string2: str):
  """Normalized Levenshtein distance between two strings.

  Both inputs are stripped of non-alphanumeric characters and lower-cased
  first. The result is 0 for identical strings (or when both are empty
  after stripping) and 1 when the edit distance equals the length of the
  longer string.
  """
  left, right = (remove_non_alphanum(text).lower() for text in (string1, string2))

  longest = max(len(left), len(right))
  if longest == 0:
    return 0

  edits = lev.distance(left, right)
  similarity = (longest - edits) / longest
  return 1 - similarity

In [7]:
# year distance
def year_distance(year1: int, year2: int):
    """Binary distance between two years: 0 when equal, 1 otherwise."""
    return 0 if year1 == year2 else 1

In [8]:
# number_track distance
def number_track_distance(number_t1, number_t2):
  """Binary distance between two track numbers: 0 when equal, 1 otherwise.

  When both values are written purely with decimal digits they are compared
  as integers, so '07' and 7 are considered equal. Anything else (e.g. 'A1')
  is compared as-is.

  str.isdecimal() is used rather than str.isnumeric(): isnumeric() also
  accepts characters such as '²' or '½' that int() cannot parse, which
  would raise ValueError.
  """
  text1, text2 = str(number_t1), str(number_t2)
  if text1.isdecimal() and text2.isdecimal():
    return 0 if int(text1) == int(text2) else 1

  return 0 if number_t1 == number_t2 else 1

## Custom Distance function 

In [9]:
# custom distance function

# Relative weight of each attribute in the combined distance.
title_w = 1.2
album_w = 1
artist_w = 1
number_track_w = .8
year_w = .5

def distance(item, item2):
  """Weighted average of the per-attribute distances between two records.

  An attribute only takes part (and only adds its weight to the
  denominator) when it is usable on BOTH records:
    * title / album / artist: both values are truthy (non-empty);
    * year: both values are greater than 0;
    * number: both values are decimal-digit strings greater than 0.

  Parameters:
    item, item2: mappings (e.g. DataFrame rows) exposing the keys
      'title', 'album', 'artist', 'year' and 'number'.

  Returns:
    The weighted mean of the participating distances, or 1.0 when no
    attribute could be compared.
  """
  w_sum = 0

  title_dist = 0
  album_dist = 0
  artist_dist = 0
  year_dist = 0
  number_track_dist = 0

  if item['title'] and item2['title']:
    title_dist = string_distance(item['title'], item2['title'])
    w_sum += title_w

  if item['album'] and item2['album']:
    album_dist = string_distance(item['album'], item2['album'])
    w_sum += album_w

  if item['artist'] and item2['artist']:
    artist_dist = string_distance(item['artist'], item2['artist'])
    w_sum += artist_w

  if item['year'] > 0 and item2['year'] > 0:
    year_dist = year_distance(item['year'], item2['year'])
    w_sum += year_w

  # isdecimal() instead of isnumeric(): isnumeric() also accepts characters
  # such as '²' or '½', for which the int() conversion below raises
  # ValueError.
  number1, number2 = str(item['number']), str(item2['number'])
  if number1.isdecimal() and number2.isdecimal():
    if int(number1) > 0 and int(number2) > 0:
      number_track_dist = number_track_distance(item['number'], item2['number'])
      w_sum += number_track_w

  if w_sum == 0:
    # Nothing was comparable; avoid dividing by zero and report the
    # maximum distance.
    return 1.0

  dist = ( (title_dist*title_w) + (album_dist*album_w) + (artist_dist*artist_w)
  + (number_track_dist*number_track_w) + (year_dist*year_w) ) / w_sum
  return dist

# BLOCKING

# CLUSTERING

# Evaluating


In [10]:

def calculate_precision(df, clusters):
  """Average per-cluster score, as a percentage.

  The first item of every cluster is treated as its centroid. A cluster's
  score is the number of its items sharing the centroid's 'CID' divided by
  the number of rows in `df` with that same 'CID'.

  Parameters:
    df: DataFrame with a 'CID' column, used as the reference population.
    clusters: dict mapping a centroid identifier to a non-empty sequence of
      items, each exposing a 'CID' key.

  Returns:
    The mean score over all clusters multiplied by 100, or 0.0 when
    `clusters` is empty (previously a ZeroDivisionError).

  Raises:
    ZeroDivisionError: if no row of `df` carries a centroid's 'CID'.
  """
  if not clusters:
    return 0.0

  total_precision = 0
  for cluster_items in clusters.values():
    centroid_cid = cluster_items[0]['CID']

    # How many duplicates of the centroid exist in the reference frame...
    duplicates_in_df = len(df[df['CID'] == centroid_cid])
    # ...and how many of them ended up in this cluster.
    duplicates_found = sum(1 for item in cluster_items if item['CID'] == centroid_cid)

    total_precision += duplicates_found / duplicates_in_df

  return (total_precision / len(clusters)) * 100

# Testando

In [11]:
import time 
import sys
sys.path.append("..")
from modules.CustomKmeans import CustomKmeans 
from modules.PhonexStaticBlocking import PhonexStaticBlocking

def avaliate(filename, block_size):
  """Run the whole pipeline on one CSV file and report per-block scores.

  Steps: load the file, clean it, split it into blocks on 'title' with
  PhonexStaticBlocking, cluster every block with CustomKmeans, then score
  each block's clusters with calculate_precision against both the block
  itself and the whole cleaned base. Progress and results are printed.

  Returns:
    (precisions_df, clusters): a DataFrame with one row per block and the
    columns 'Relacao Bloco' / 'Relacao Base Toda', and the list holding
    each block's clustering result.
  """
  print(f"--LENDO A BASE: {filename}--")
  df = get_database(filename)

  print(f"  Limpando a base...")
  df = clean_db(df)
  print(f"  Tamanho da base: {len(df)}")

  print(f"--Blocando em blocks de {block_size} elementos--")
  blocks = PhonexStaticBlocking(df, 'title', block_size).get_blocks()
  print(f"  Quantidade de blocos: {len(blocks)}")

  print("--CLUSTERIZAÇÃO--")
  customKmeans = CustomKmeans(distanceFn=distance, uID='TID', threshold=0.4)
  clusters = []
  start_time = time.time()
  for block_index, block in enumerate(blocks):
    print(' Clusterizando bloco ', block_index)
    clusters.append(customKmeans.run(block))
  elapsed = time.time() - start_time
  print(' >> Tempo de execução da clusterização na base: ', elapsed, 's')

  print("--AVALIAÇÃO--")
  # One [score vs. own block, score vs. whole base] pair per block.
  precisions = [
    [calculate_precision(blocks[i], block_clusters), calculate_precision(df, block_clusters)]
    for i, block_clusters in enumerate(clusters)
  ]

  precisions_df = pd.DataFrame(precisions, columns=['Relacao Bloco', 'Relacao Base Toda'])
  for i, precision in enumerate(precisions):
    print(f"  BLOCO {i} (teve {len(clusters[i])} clusters): Precisão em relação ao bloco: {precision[0]} %. -- Precisão em relação a base: {precision[1]} %")
    print()

  # Averages over all blocks.
  block_count = len(blocks)
  print("Relacao Bloco: ", (precisions_df['Relacao Bloco'].sum()) / block_count, ' %')
  print("Relacao Base Toda: ", (precisions_df['Relacao Base Toda'].sum()) / block_count, ' %')

  print()
  print('------------------------------------------------------------------')
  print()
  return precisions_df, clusters




In [12]:
# Evaluate the 20K base in blocks of 10 records, show the per-block scores
# and persist them to 'precisao.csv'.
precisions_df, clusters = avaliate(filename='Music_Brainz_20K.csv', block_size=10)
# Larger base, kept for reference:
# avaliate(filename='Music_Brainz_200K.csv', block_size=1000)
print('Precisão:', precisions_df, sep='\n')
precisions_df.to_csv('precisao.csv')


--LENDO A BASE: Music_Brainz_20K.csv--
  Limpando a base...
  Tamanho da base: 19375
--Blocando em blocks de 10 elementos--
  Quantidade de blocos: 1938
--CLUSTERIZAÇÃO--
 Clusterizando bloco  0
RangeIndex(start=0, stop=1, step=1)
Index([0, 1], dtype='int64')
Index([0, 1, 2], dtype='int64')
Index([0, 1, 2, 3], dtype='int64')
Index([0, 1, 2, 3, 4], dtype='int64')
Index([0, 1, 2, 3, 4, 5], dtype='int64')
Index([0, 1, 2, 3, 4, 5, 6], dtype='int64')
Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')
Index([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int64')
Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')
 Clusterizando bloco  1
RangeIndex(start=0, stop=1, step=1)
Index([0, 1], dtype='int64')
Index([0, 1, 2], dtype='int64')
Index([0, 1, 2, 3], dtype='int64')
Index([0, 1, 2, 3, 4], dtype='int64')
Index([0, 1, 2, 3, 4, 5], dtype='int64')
Index([0, 1, 2, 3, 4, 5, 6], dtype='int64')
Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')
Index([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int64')
Index([0, 1, 2, 3, 4, 5,