In [1]:
import pandas as pd
import polars as pl
import re 
import numpy as np
import Levenshtein as lev
import sys
sys.path.append("..")
from modules.Deduplicator import Deduplicator
from modules.DeduplicatorPolars import DeduplicatorPolars

# Cleaning

In [2]:
STRING_COLUMNS = ['title','artist','track01']

# replace NaN with '' on STRING_COLUMNS
def clean_strings(df):
  df[STRING_COLUMNS] = df[STRING_COLUMNS].replace(np.nan, '')
  return df

# We don't remove non alphanumerics now, because we wouldn't be able to 
# undestand anything when we analyze the data. So we call this function 
# when we calculate the strings_distance
def remove_non_alphanum(string: str):
  return re.sub(r'\W+', '', string)

UNUSED_COLUMNS = ['id',	"category","genre","cdextra","year","track02","track03","track04","track05","track06","track07","track08","track09","track10","track11","track12","track13","track14","track15","track16","track17","track18","track19","track20","track21","track22","track23","track24","track25","track26","track27","track28","track29","track30","track31","track32","track33","track34","track35","track36","track37","track38","track39","track40","track41","track42","track43","track44","track45","track46","track47","track48","track49","track50","track51","track52","track53","track54","track55","track56","track57","track58","track59","track60","track61","track62","track63","track64","track65","track66","track67","track68","track69","track70","track71","track72","track73","track74","track75","track76","track77","track78","track79","track80","track81","track82","track83","track84","track85","track86","track87","track88","track89","track90","track91","track92","track93","track94","track95","track96","track97","track98","track99"]
def remove_unused_columns(df):
  return df.drop(columns = UNUSED_COLUMNS)

# a single function that calls all the above clean functions
def clean_db(df):
  df = remove_unused_columns(df) # TODO : try not cleaning (much data in memory) to see how to improve operations
  df = clean_strings(df)
  return df

# Distance function

In [3]:
def string_distance(string1: str, string2: str):
  x = remove_non_alphanum(string1).lower()
  y = remove_non_alphanum(string2).lower()
  
  dist = lev.distance(x, y) 

  max_len = max(len(x), len(y))
  if (max_len == 0):
    return 0
  normalized = (max_len-dist) / max_len 
  normalized_dist = 1-normalized
  return normalized_dist

In [4]:
title_w = 1.2
artist_w = 1
track1_w = .8

def distance(item: dict, item2: dict): 
  w_sum = 0

  title_dist = 0
  artist_dist = 0
  track1_dist = 0
  
  if (item['title'] and item2['title']):
    title_dist = string_distance(item['title'], item2['title'])
    w_sum += title_w

  if (item['artist'] and item2['artist']):
    artist_dist = string_distance(item['artist'], item2['artist'])
    w_sum += artist_w

  if (item['track01'] and item2['track01']):
    track1_dist = string_distance(item['track01'], item2['track01'])
    w_sum += track1_w

  dist = ( (title_dist*title_w) + (artist_dist*artist_w) ) + (track1_dist*track1_w) / w_sum
  return dist

# Deduplication

In [5]:
df = pd.read_csv('cd.csv', delimiter=';', doublequote=False)

# evaluation
gold_standard_df = pd.read_csv('cd_gold.csv', delimiter=';')
gold_standard_pairs = gold_standard_df.values.tolist()

# DATASET 
df = clean_db(df)

second_partition_indices = df.sample(frac=0.35, random_state=42).index

# Split the dataset into 2 incrementals
df2 = df.loc[second_partition_indices]
df1 = df.drop(second_partition_indices)

print('Base toda', len(df))
print('Base particao 1', len(df1))
print('Base particao 2', len(df2))

Base toda 9763
Base particao 1 6346
Base particao 2 3417


### BATCH

In [6]:
deduplicator = Deduplicator('title', distance, 'pk', 0.25)

clusters = deduplicator.run(df)
deduplicator.evaluate(gold_standard_pairs)

~~ EVALUATION ~~
  Precision: 0.9459459459459459
  Recall: 0.7023411371237458
  F-measure: 0.8061420345489442



In [7]:
deduplicator_pl = DeduplicatorPolars('title', distance, 'pk', 0.25)

clusters_pl = deduplicator_pl.run(df)
deduplicator_pl.evaluate(gold_standard_pairs)

~~ EVALUATION ~~
  Precision: 0.9459459459459459
  Recall: 0.7023411371237458
  F-measure: 0.8061420345489442



### Incremental

In [8]:
deduplicator_inc = Deduplicator('title', distance, 'pk', 0.25)

clusters_inc = deduplicator_inc.run(df1)
deduplicator_inc.evaluate(gold_standard_pairs)

~~ EVALUATION ~~
  Precision: 0.9591836734693877
  Recall: 0.31438127090301005
  F-measure: 0.473551637279597



In [9]:
clusters_inc = deduplicator_inc.run(df2)

In [10]:
deduplicator_inc.evaluate(gold_standard_pairs)

~~ EVALUATION ~~
  Precision: 0.9459459459459459
  Recall: 0.7023411371237458
  F-measure: 0.8061420345489442

