# LOAD

In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.append("..")
from modules.Deduplicator import Deduplicator
from modules.utils.NormalizedLevenshtein import get_normalized_levenshtein_dist

MUSIC_DATASET_PATH = '../datasets'

# CLEAN DATABASE

## Clean functions

In [2]:
# clean strings
STRING_COLUMNS = ['title','artist','album']

def clean_strings(df):
  """Replace missing values in the STRING_COLUMNS with empty strings.

  Args:
    df: DataFrame containing every column listed in STRING_COLUMNS.

  Returns:
    A new DataFrame in which NaN in those columns has been replaced by ''.
    The frame passed in is left untouched, so re-running a cell that calls
    this function always starts from the same input.

  Raises:
    KeyError: if any of the STRING_COLUMNS is missing from df.
  """
  # work on a copy: assigning into the caller's frame would mutate it silently
  # (and raises SettingWithCopyWarning when df is itself a slice)
  cleaned = df.copy()
  cleaned[STRING_COLUMNS] = cleaned[STRING_COLUMNS].replace(np.nan, '')
  return cleaned

In [3]:
# clean years

def convert_year(year, pivot=50):
    """Normalise a year written with two or four digits to a four-digit int.

    Args:
        year: the year, normally a digits-only string such as '02' or '1998'.
            Values that are not strings are converted with str() first.
        pivot: two-digit years below this value are read as 20xx, all other
            two-digit years as 19xx.

    Returns:
        The year as an int, or 0 when the value does not consist of exactly
        two or four decimal digits.
    """
    text = year if isinstance(year, str) else str(year)

    # isdecimal() admits exactly the characters int() can parse; everything
    # else (empty string, 'nan', signs, separators) is reported as unknown
    # instead of raising
    if not text.isdecimal():
        return 0

    if len(text) == 2:
        century = '20' if int(text) < pivot else '19'
        return int(century + text)
    if len(text) == 4:
        return int(text)

    return 0

def clean_years(df):
  """Normalise the 'year' column to four-digit integers.

  Missing years are treated as '0', every non-digit character is stripped
  (example: '02 ==> 02) and the remaining text is passed to convert_year,
  which yields 0 for anything that is not a two- or four-digit year.

  Args:
    df: DataFrame with a 'year' column.

  Returns:
    A new DataFrame whose 'year' column holds the converted values. The
    frame passed in is not modified.

  Raises:
    KeyError: if df has no 'year' column.
  """
  cleaned = df.copy()
  years = cleaned['year']

  if pd.api.types.is_numeric_dtype(years):
    # the column can be numeric when every value in the file parses as a
    # number; .str is not available then, and 2005.0 must become '2005',
    # not '2005.0'
    years = years.fillna(0).astype(int).astype(str)
  else:
    # parse years to the same format
    years = years.fillna('0').astype(str)

  # remove all non-numerics from year column (example: '02 ==> 02)
  years = years.str.replace(r'\D+', '', regex=True)

  cleaned['year'] = years.apply(convert_year)

  return cleaned

In [4]:
# columns that are dropped before any comparison is made

UNUSED_COLUMNS = ['CTID', 'id', 'SourceID', 'length', 'language']
def remove_unused_columns(df):
  """Return a new DataFrame without the columns listed in UNUSED_COLUMNS.

  Raises:
    KeyError: if any of those columns is absent from df.
  """
  trimmed = df.drop(UNUSED_COLUMNS, axis=1)
  return trimmed

## Clean everything

In [5]:
# a single function that calls all the above clean functions
def clean_db(df):
  """Apply every cleaning step to the raw dataset and return the result.

  Steps, in order: drop the unused columns, blank out missing text fields,
  normalise the years, and replace missing track numbers with ''.
  """
  without_unused = remove_unused_columns(df)
  with_clean_text = clean_strings(without_unused)
  cleaned = clean_years(with_clean_text)

  # track numbers: a missing value becomes an empty string
  track_numbers = cleaned['number'].replace(np.nan, '')
  cleaned['number'] = track_numbers
  return cleaned

# DISTANCE FUNCTIONS

In [6]:
# year distance
def year_distance(year1: int, year2: int):
    """Binary distance between two years: 0 when they are equal, 1 otherwise."""
    return 0 if year1 == year2 else 1

In [7]:
# number_track distance
def number_track_distance(number_t1, number_t2):
  """Binary distance between two track numbers.

  When both values are written with decimal digits only, they are compared
  as integers, so '01' and '1' (or '1' and 1) count as the same track.
  Otherwise the raw values are compared with ==.

  Args:
    number_t1: track number of the first record (string or number).
    number_t2: track number of the second record (string or number).

  Returns:
    0 if the track numbers match, 1 otherwise.
  """
  text1, text2 = str(number_t1), str(number_t2)

  # isdecimal() rather than isnumeric(): the latter also accepts characters
  # such as '½' or '²', which int() cannot parse and would raise ValueError
  if text1.isdecimal() and text2.isdecimal():
    return 0 if int(text1) == int(text2) else 1

  return 0 if number_t1 == number_t2 else 1

## Custom distance function

In [8]:
# custom distance function
# weight of each attribute in the combined distance
title_w = 1.5
album_w = 1.2
artist_w = 1
number_track_w = .8
year_w = .5

def distance(item, item2):
  """Weighted average of the per-attribute distances between two records.

  An attribute takes part in the comparison only when it is present in both
  records: non-empty title/album/artist, year > 0, and a track number made
  of decimal digits with a value > 0. The weighted sum of the attribute
  distances is divided by the sum of the weights that took part, so missing
  attributes do not count against a pair.

  Args:
    item: first record; must support item['title'], item['album'],
      item['artist'], item['year'] and item['number'].
    item2: second record, same layout.

  Returns:
    The weighted distance, or 1.0 when the two records have no attribute in
    common to compare.
  """
  w_sum = 0

  title_dist = 0
  album_dist = 0
  artist_dist = 0
  year_dist = 0
  number_track_dist = 0

  if item['title'] and item2['title']:
    title_dist = get_normalized_levenshtein_dist(item['title'], item2['title'])
    w_sum += title_w

  if item['album'] and item2['album']:
    album_dist = get_normalized_levenshtein_dist(item['album'], item2['album'])
    w_sum += album_w

  if item['artist'] and item2['artist']:
    artist_dist = get_normalized_levenshtein_dist(item['artist'], item2['artist'])
    w_sum += artist_w

  if item['year'] > 0 and item2['year'] > 0:
    year_dist = year_distance(item['year'], item2['year'])
    w_sum += year_w

  number1, number2 = str(item['number']), str(item2['number'])
  # isdecimal() rather than isnumeric(): the latter also accepts characters
  # such as '½' or '²', which the int() calls below cannot parse
  if number1.isdecimal() and number2.isdecimal():
    if int(number1) > 0 and int(number2) > 0:
      number_track_dist = number_track_distance(item['number'], item2['number'])
      w_sum += number_track_w

  if w_sum == 0: # don't divide by 0, it could happen
    return 1.0

  weighted_total = ( (title_dist*title_w) + (album_dist*album_w) + (artist_dist*artist_w)
  + (number_track_dist*number_track_w) + (year_dist*year_w) )
  return weighted_total / w_sum

# Experiment

In [9]:
from itertools import combinations

def generate_golden_standard_array(df):
  """Build the list of true duplicate pairs from the cluster labels.

  Records sharing a 'CID' belong to the same cluster; every unordered pair
  of 'TID' values inside a cluster of two or more records is a true match.

  Args:
    df: DataFrame with 'CID' and 'TID' columns.

  Returns:
    A list of (TID, TID) tuples, clusters in ascending CID order and pairs
    in the order produced by itertools.combinations.
  """
  true_pairs = []
  for _, cluster_tids in df.groupby('CID')['TID']:
    members = cluster_tids.tolist()
    if len(members) < 2:
      # a cluster with a single record has no duplicate pair
      continue
    true_pairs.extend(combinations(members, 2))

  return true_pairs

In [10]:
# Load the raw CSV from MUSIC_DATASET_PATH
df = pd.read_csv(f'{MUSIC_DATASET_PATH}/Music_Brainz_20K.csv')

# DATASET: clean it and derive the list of true duplicate pairs from it
df = clean_db(df)
golden_standard_array = generate_golden_standard_array(df)

# 'Base toda' = whole dataset (row count); 'Gabarito' = answer key (true pairs)
print('Base toda', len(df))
# NOTE(review): this prints every pair of the answer key, which floods the
# cell output; consider printing only its length and a short sample
print("Gabarito: ", golden_standard_array)

Base toda 19375
Gabarito:  [(1, 15184), (3, 14722), (8, 2379), (8, 3827), (2379, 3827), (9, 7799), (9, 15341), (9, 18582), (7799, 15341), (7799, 18582), (15341, 18582), (13, 162), (13, 4710), (13, 12661), (162, 4710), (162, 12661), (4710, 12661), (19, 5544), (20, 5161), (20, 9278), (20, 9781), (5161, 9278), (5161, 9781), (9278, 9781), (23, 3895), (23, 15578), (3895, 15578), (31, 15612), (41, 10409), (44, 8008), (44, 9973), (8008, 9973), (46, 18414), (51, 5706), (51, 11687), (51, 14874), (51, 17137), (5706, 11687), (5706, 14874), (5706, 17137), (11687, 14874), (11687, 17137), (14874, 17137), (64, 18246), (65, 2116), (65, 6739), (2116, 6739), (72, 8361), (72, 13214), (72, 15417), (72, 17391), (8361, 13214), (8361, 15417), (8361, 17391), (13214, 15417), (13214, 17391), (15417, 17391), (75, 16071), (76, 17991), (80, 15382), (81, 6360), (81, 9489), (81, 14067), (81, 16469), (6360, 9489), (6360, 14067), (6360, 16469), (9489, 14067), (9489, 16469), (14067, 16469), (83, 2189), (95, 1767), (95,

In [11]:
# partitions: shuffle the rows and split them 50% / 35% / 15%
total_rows = len(df)
first_partition_rows = int(total_rows * 0.50) # 50%
second_partition_rows = int(total_rows * 0.35) # 35%
third_partition_rows = total_rows - first_partition_rows - second_partition_rows # 15%

# Generate random indices for shuffling (fixed seed so the split is reproducible)
np.random.seed(42)
indices = np.random.permutation(total_rows)

# Split the indices into three partitions
first_partition_indices = indices[:first_partition_rows]
second_partition_indices = indices[first_partition_rows:first_partition_rows+second_partition_rows]
third_partition_indices = indices[first_partition_rows+second_partition_rows:]

# Create the three partitions.
# The permutation holds row POSITIONS (0..total_rows-1), so .iloc is required:
# .loc would look them up as index LABELS and only works by accident while df
# still has its default RangeIndex.
df1 = df.iloc[first_partition_indices]
df2 = df.iloc[second_partition_indices]
df3 = df.iloc[third_partition_indices]


# 'Base particao N' = size of partition N
print('Base particao 1', len(df1))
print('Base particao 2', len(df2))
print('Base particao 3', len(df3))

Base particao 1 9687
Base particao 2 6781
Base particao 3 2907


### Batch

In [12]:
# Batch run: one Deduplicator processes the whole dataset in a single call.
# NOTE(review): the arguments look like (column name, distance function,
# id column, distance threshold) - confirm against modules.Deduplicator
deduplicator = Deduplicator('title', distance, 'TID', 0.45)

clusters = deduplicator.run(df)

In [13]:
# Score the batch run against the list of true duplicate pairs.
# NOTE(review): the recorded output reports a negative TN (-1111), which
# suggests evaluate() miscounts true negatives - check Deduplicator.evaluate
deduplicator.evaluate(golden_standard_array)

~~ EVALUATION ~~
  Precision: 0.9224016853932584
  Recall: 0.4849846153846154
  F-measure: 0.6357183189481327

  TP: 7881
  FP: 663
  TN: -1111
  FN: 8369

Comparações:  100603


### Incremental

In [14]:
# Incremental run: a fresh Deduplicator built with the same arguments as the
# batch one, fed with the three partitions one after the other
deduplicator_inc = Deduplicator('title', distance, 'TID', 0.45)

In [15]:
# Step 1: first partition (50% of the rows)
clusters_inc = deduplicator_inc.run(df1)

In [16]:
# Step 2: second partition (35% of the rows), run on the same instance.
# presumably the instance keeps the state of the previous run - TODO confirm
clusters_inc = deduplicator_inc.run(df2)

In [17]:
# Step 3: third partition (remaining 15% of the rows), run on the same instance
clusters_inc = deduplicator_inc.run(df3)

In [18]:
# Score the incremental run against the same list of true duplicate pairs
# used for the batch run
deduplicator_inc.evaluate(golden_standard_array)

~~ EVALUATION ~~
  Precision: 0.9427581261950286
  Recall: 0.4854769230769231
  F-measure: 0.6409131529774962

  TP: 7889
  FP: 479
  TN: -1111
  FN: 8361

Comparações:  102393
