<a href="https://colab.research.google.com/github/Gust4voSales/duplicates-identification/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LOAD

In [None]:
!pip install Levenshtein phonex

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Levenshtein
  Downloading Levenshtein-0.20.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.5/175.5 KB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting phonex
  Downloading phonex-0.0.2-py3-none-any.whl (3.9 kB)
Collecting rapidfuzz<3.0.0,>=2.3.0
  Downloading rapidfuzz-2.13.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: phonex, rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.20.9 phonex-0.0.2 rapidfuzz-2.13.7


In [None]:
import numpy as np
import pandas as pd

filename = 'Music_Brainz_20K.csv'

def get_database(filename=filename):
  """Load the CSV file at `filename` into a pandas DataFrame.

  The default path is the module-level `filename` value as it was when this
  function was defined.
  """
  return pd.read_csv(filename)

# CLEAN DATABASE

## Clean functions

In [None]:
# clean strings
import re 

STRING_COLUMNS = ['title','artist','album']

def clean_strings(df):
  """Replace NaN with '' in every STRING_COLUMNS column of `df`.

  The frame is modified in place and the same object is returned.
  """
  without_nan = df[STRING_COLUMNS].replace(np.nan, '')
  df[STRING_COLUMNS] = without_nan
  return df

# Non-alphanumeric characters are not stripped during cleaning, because the
# data would become hard to understand when we analyze it. Instead, this
# function is called when string_distance is computed.
def remove_non_alphanum(string: str):
  """Return `string` with every run of regex non-word characters removed.

  Letters, digits and the underscore are kept (they are regex word characters).
  """
  non_word_runs = re.compile(r'\W+')
  return non_word_runs.sub('', string)

In [None]:
# clean years

def convert_year(year, pivot=50):
    """Convert a digits-only year string into a four-digit integer year.

    Args:
        year: The year as a string of decimal digits, e.g. '02' or '1999'.
        pivot: Two-digit years below this value are read as 20xx, the rest
            as 19xx. Defaults to 50.

    Returns:
        The integer year for two- or four-digit input. 0 for anything else,
        including non-string values (NaN, None) and strings that are not made
        only of decimal digits, which previously raised TypeError/ValueError.
    """
    # isdecimal() accepts exactly the characters int() can parse, so the
    # int() calls below cannot fail.
    if not isinstance(year, str) or not year.isdecimal():
        return 0

    if len(year) == 4:
        return int(year)

    if len(year) == 2:
        century = '20' if int(year) < pivot else '19'
        return int(century + year)

    return 0

def clean_years(df):
  """Normalise the 'year' column of `df` in place and return `df`.

  NaN becomes '0', every non-digit character is stripped (for example
  "'02" becomes "02"), and each resulting string goes through convert_year.
  """
  years = df['year'].replace(np.nan, '0')
  digits_only = years.str.replace(r'\D+', '', regex=True)
  df['year'] = digits_only.apply(convert_year)
  return df

In [None]:
# remove unnecessary columns

UNUSED_COLUMNS = ['CTID', 'id', 'SourceID', 'length', 'language']

def remove_unused_columns(df):
  """Return a new frame equal to `df` without the UNUSED_COLUMNS columns."""
  return df.drop(UNUSED_COLUMNS, axis='columns')

## Clean everything

In [None]:
# a single function that calls all the above clean functions
def clean_db(df):
  """Apply every cleaning step to `df` and return the cleaned frame.

  Order: drop the unused columns, blank out missing strings, normalise years.
  """
  trimmed = remove_unused_columns(df)
  with_strings = clean_strings(trimmed)
  return clean_years(with_strings)

# DISTANCE FUNCTIONS

## Individual attribute distance functions

In [None]:
# STRING distance (title, album and artist...)
import Levenshtein as lev

def string_distance(string1: str, string2: str):
  """Return the normalised Levenshtein distance between two strings.

  Both strings are stripped of non-alphanumerics and lowercased first. The
  edit distance is divided by the longer cleaned length, so the result lies
  between 0 (identical) and 1. Two empty cleaned strings give 0.
  """
  left = remove_non_alphanum(string1).lower()
  right = remove_non_alphanum(string2).lower()

  longest = max(len(left), len(right))
  if longest == 0:
    return 0

  edits = lev.distance(left, right)
  similarity = (longest - edits) / longest
  return 1 - similarity

In [None]:
# year distance
def year_distance(year1: int, year2: int):
    """Return 0 when both years are equal, otherwise 1."""
    return 0 if year1 == year2 else 1

In [None]:
# number_track distance
def number_track_distance(number_t1, number_t2):
  """Return 0 when two track numbers match, otherwise 1.

  When both values read as whole numbers they are compared as integers, so
  '07' and 7 match. Otherwise the raw values are compared with ==.
  """
  text1, text2 = str(number_t1), str(number_t2)

  # str.isdecimal() accepts only characters that int() can parse.
  # str.isnumeric() also accepts characters such as '²' or '½', for which
  # int() raises ValueError.
  if text1.isdecimal() and text2.isdecimal():
    return 0 if int(text1) == int(text2) else 1

  return 0 if number_t1 == number_t2 else 1

## Custom Distance function 

In [None]:
# custom distance function

title_w = 1.2
album_w = 1
artist_w = 1
number_track_w = .8
year_w = .5

def _is_positive_track_number(value):
  """Return True when `value` reads as a whole number greater than zero."""
  text = str(value)
  # isdecimal() (unlike isnumeric()) only accepts characters int() can parse,
  # so values such as '²' or '½' are rejected instead of raising ValueError.
  return text.isdecimal() and int(text) > 0

def distance(item, rows):
  """Compute the weighted distance from `item` to every row of `rows`.

  An attribute takes part in the comparison only when it is present on both
  sides: non-empty title/album/artist, year > 0, and a positive whole track
  number. The result is the weighted mean of the participating attribute
  distances, using the module-level *_w weights.

  Args:
    item: A record indexable by 'title', 'album', 'artist', 'year', 'number'.
    rows: A DataFrame with the same columns; one distance is produced per row.

  Returns:
    A numpy array with one distance per row of `rows`, in row order. A row
    with no comparable attribute gets distance 1.
  """
  distances = []

  for _, row in rows.iterrows():
    w_sum = 0
    title_dist = album_dist = artist_dist = 0
    year_dist = number_track_dist = 0

    if item['title'] and row['title']:
      title_dist = string_distance(item['title'], row['title'])
      w_sum += title_w

    if item['album'] and row['album']:
      album_dist = string_distance(item['album'], row['album'])
      w_sum += album_w

    if item['artist'] and row['artist']:
      artist_dist = string_distance(item['artist'], row['artist'])
      w_sum += artist_w

    if item['year'] > 0 and row['year'] > 0:
      year_dist = year_distance(item['year'], row['year'])
      w_sum += year_w

    if _is_positive_track_number(item['number']) and _is_positive_track_number(row['number']):
      number_track_dist = number_track_distance(item['number'], row['number'])
      w_sum += number_track_w

    if w_sum == 0:
      # Nothing was comparable; avoid dividing by zero and treat the pair as
      # maximally distant.
      distances.append(1)
      continue

    weighted_total = ( (title_dist*title_w) + (album_dist*album_w) + (artist_dist*artist_w)
      + (number_track_dist*number_track_w) + (year_dist*year_w) )
    distances.append(weighted_total / w_sum)

  return np.array(distances)

# BLOCKING

In [None]:
# Function that adds a title_phonex column to the database
from phonex import phonex

TITLE_PHONEX_COLUMN_NAME = 'title_phonex'

def calculate_phonex(string):
  """Return the phonex code of `string` after dropping non-alphabetic characters."""
  letters_only = ''.join(ch for ch in string if ch.isalpha())
  return phonex(letters_only)

def add_title_phonex_column(df, column_name=TITLE_PHONEX_COLUMN_NAME):
  """Add a column holding the phonex code of each title and return `df`.

  Args:
    df: Frame with a 'title' column; it is modified in place.
    column_name: Name of the column to write. Previously this argument was
      ignored and TITLE_PHONEX_COLUMN_NAME was always used.

  Returns:
    The same `df` object, with the new column added.
  """
  df[column_name] = df['title'].apply(calculate_phonex)
  return df

In [None]:
class Blocker:
  """Sorts a frame by its title phonex code and cuts it into fixed-size blocks."""

  def __init__(self, df, MAX_BLOCK_SIZE):
    """Store the frame and the maximum number of rows allowed per block."""
    self.df = df
    self.blocks = []
    self.MAX_BLOCK_SIZE = MAX_BLOCK_SIZE

  def add_title_phonex(self):
    """Add the title phonex column to the frame and sort the frame by it."""
    with_phonex = add_title_phonex_column(self.df, TITLE_PHONEX_COLUMN_NAME)
    self.df = with_phonex.sort_values(TITLE_PHONEX_COLUMN_NAME, ascending=True)

  def divide_base_into_blocks(self):
    """Split self.df into consecutive slices of at most MAX_BLOCK_SIZE rows."""
    size = self.MAX_BLOCK_SIZE
    full_blocks, leftover = divmod(len(self.df), size)
    # A partial block is needed when the rows do not divide evenly.
    block_count = full_blocks + 1 if leftover else full_blocks

    starts = range(0, block_count * size, size)
    self.blocks = [self.df[start:start + size] for start in starts]

  def get_blocks(self):
    """Return the list of blocks built by divide_base_into_blocks."""
    return self.blocks


def generate_blocks(df, MAX_BLOCK_SIZE=1000):
  """Sort `df` by title phonex code and return it cut into blocks.

  Each block has at most MAX_BLOCK_SIZE rows.
  """
  blocker = Blocker(df, MAX_BLOCK_SIZE)
  blocker.add_title_phonex()
  blocker.divide_base_into_blocks()
  return blocker.get_blocks()


# CLUSTERING

In [None]:
# helper function that takes an array of TIDs and returns the matching item rows
def get_items_by_TID(df, TIDs):
  """Return the rows of `df` whose 'TID' equals each value in `TIDs`.

  The matches are stacked in the order the TIDs are given.
  """
  matches = [df.loc[df['TID'] == TID] for TID in TIDs]
  return pd.concat(matches, axis=0)

In [None]:
# CLUSTER with custom KMEANS

def custom_KMEANS(df, threshold=0.4):
  """Cluster the rows of `df` in a single pass around threshold-based centroids.

  The first row is the first centroid. Every row is compared with the current
  centroids: it joins the nearest one when that distance is below `threshold`,
  otherwise it becomes a new centroid.

  Args:
    df: Frame with a 'TID' column plus the columns `distance` reads.
    threshold: Maximum distance for a row to join an existing centroid.

  Returns:
    A dict keyed by centroid TID; each value is the list of rows in that
    cluster. An empty `df` gives an empty dict (it used to raise IndexError).
  """
  if len(df) == 0:
    # There is no first row to seed the centroids with.
    return {}

  first_el = df.iloc[0]
  centroids_TIDs = [ first_el['TID'] ] # use first item as the first centroid

  # clusters is a dictionary: the key is the centroid's TID and the value is
  # the list of items assigned to it
  clusters = {key: [] for key in centroids_TIDs}

  centroids = get_items_by_TID(df, centroids_TIDs) # getting the centroids rows by their TIDs

  for index, el in df.iterrows():
    # one distance per centroid, in the same order as centroids_TIDs
    dists = distance(el, centroids)
    centroid_index_with_min_dist = np.argmin(dists) # position of the nearest centroid

    if (dists[centroid_index_with_min_dist] < threshold):
      min_centroid_TID = centroids_TIDs[centroid_index_with_min_dist] # get the centroid TID from the index
      clusters[min_centroid_TID].append(el) # append the current element to that centroid
    else:
      new_centroid_TID = el['TID']
      centroids_TIDs.append(new_centroid_TID)

      centroids.loc[index] = el # add current element as centroid
      clusters[new_centroid_TID] = [el] # the new centroid is the first item of its own cluster

  return clusters

# Evaluating


In [None]:
def calculate_precision(df, clusters):
  """Return the mean per-cluster score, as a percentage.

  For each cluster the first item is taken as the centroid. The cluster score
  is the number of items in the cluster sharing the centroid's 'CID', divided
  by the number of rows in `df` with that same 'CID'.

  Args:
    df: Frame with a 'CID' column, used as the reference set.
    clusters: Dict mapping a centroid TID to the list of items in its cluster;
      each item is indexable by 'CID'.

  Returns:
    The average cluster score times 100. Returns 0.0 when `clusters` is empty.
    An empty cluster, or one whose centroid 'CID' does not occur in `df`,
    contributes 0 instead of raising.
  """
  if not clusters:
    return 0.0

  total_precision = 0

  for cluster_items in clusters.values():
    if not cluster_items:
      continue

    centroid_cid = cluster_items[0]['CID']
    doubles_in_df = int((df['CID'] == centroid_cid).sum())
    if doubles_in_df == 0:
      # Nothing in df to compare against (e.g. a NaN CID); skip the division.
      continue

    doubles_found = sum(1 for item in cluster_items if item['CID'] == centroid_cid)
    total_precision += doubles_found / doubles_in_df

  return (total_precision / len(clusters)) * 100

# Testando

In [None]:
import time 

def avaliate(filename, block_size):
  """Run the whole pipeline on one CSV file and report per-block precision.

  Steps: load the file, clean it, split it into blocks of `block_size` rows,
  cluster every block with custom_KMEANS, then score each block's clusters
  against the block itself and against the whole cleaned base. Progress and
  results are printed along the way.

  Returns:
    A DataFrame with one row per block and the columns 'Relacao Bloco'
    (score against the block) and 'Relacao Base Toda' (score against the
    whole base).
  """
  print(f"--LENDO A BASE: {filename}--")
  df = get_database(filename)

  print(f"  Limpando a base...")
  df = clean_db(df)

  print(f"  Tamanho da base: {len(df)}")
  print(f"--Blocando em blocks de {block_size} elementos--")
  blocks = generate_blocks(df, block_size)
  print(f"  Quantidade de blocos: {len(blocks)}")

  print("--CLUSTERIZAÇÃO--")
  clusters = []
  started_at = time.time()
  for block_number, block in enumerate(blocks):
    print(' Clusterizando bloco ', block_number)
    clusters.append(custom_KMEANS(block))
  finished_at = time.time()
  print(' >> Tempo de execução da clusterização na base: ',finished_at - started_at,'s')

  print("--AVALIAÇÃO--")
  # One [block score, whole-base score] pair per block, in block order.
  precisions = [
    [calculate_precision(block, block_clusters), calculate_precision(df, block_clusters)]
    for block, block_clusters in zip(blocks, clusters)
  ]

  precisions_df = pd.DataFrame(precisions, columns=['Relacao Bloco', 'Relacao Base Toda'])
  for block_number, (block_precision, base_precision) in enumerate(precisions):
    print(f"  BLOCO {block_number} (teve {len(clusters[block_number])} clusters): Precisão em relação ao bloco: {block_precision} %. -- Precisão em relação a base: {base_precision} %")
    print()

  block_count = len(blocks)
  print("Relacao Bloco: ",(precisions_df['Relacao Bloco'].sum())/block_count,' %')
  print("Relacao Base Toda: ",(precisions_df['Relacao Base Toda'].sum())/block_count,' %')

  print()
  print('------------------------------------------------------------------')
  print()
  return precisions_df




In [None]:
# Run the whole pipeline on the 200K file using blocks of 250 rows.
precisions_df = avaliate(filename='Music_Brainz_200K.csv', block_size=250)
# Alternative run with blocks of 1000 rows (disabled):
# avaliate(filename='Music_Brainz_200K.csv', block_size=1000)
# Print the per-block precision table and save it to precisao.csv.
print('Precisão:')
print(precisions_df)
precisions_df.to_csv('precisao.csv')


--LENDO A BASE: Music_Brainz_200K.csv--
  Limpando a base...
  Tamanho da base: 193750
--Blocando em blocks de 250 elementos--
  Quantidade de blocos: 775
--CLUSTERIZAÇÃO--
 Clusterizando bloco  0
250
 Clusterizando bloco  1
250
 Clusterizando bloco  2
250
 Clusterizando bloco  3
250
 Clusterizando bloco  4
250
 Clusterizando bloco  5
250
 Clusterizando bloco  6
250
 Clusterizando bloco  7
250
 Clusterizando bloco  8
250
 Clusterizando bloco  9
250
 Clusterizando bloco  10
250
 Clusterizando bloco  11
250
 Clusterizando bloco  12
250
 Clusterizando bloco  13
250
 Clusterizando bloco  14
250
 Clusterizando bloco  15
250
 Clusterizando bloco  16
250
 Clusterizando bloco  17
250
 Clusterizando bloco  18
250
 Clusterizando bloco  19
250
 Clusterizando bloco  20
250
 Clusterizando bloco  21
250
 Clusterizando bloco  22
250
 Clusterizando bloco  23
250
 Clusterizando bloco  24
250
 Clusterizando bloco  25
250
 Clusterizando bloco  26
250
 Clusterizando bloco  27
250
 Clusterizando bloco  28
