###  __Python Data Wrangling Duplicate__

##### __Explicit duplicates__

These are rows or entries in a dataset that are exactly identical across all columns.

They can be identified using methods like DataFrame.duplicated() in Pandas, which checks for exact matches.

In [3]:
import pandas as pd

df_music = pd.read_csv('DataSets/music_log.csv')

print(df_music.duplicated())
print()

print(f"Duplicates removed: {df_music.duplicated().sum()}")

0        False
1        False
2         True
3        False
4        False
         ...  
67958    False
67959    False
67960    False
67961    False
67962     True
Length: 67963, dtype: bool

Duplicates removed: 5313


##### _.drop_duplicates()_

Indexes will change because of the duplicate removal

In [4]:
df_music = df_music.drop_duplicates() # df_music.drop_duplicates(inplace=True)
print(f"Duplicates removed: {df_music.duplicated().sum()}, After using .drop_duplicates.")
print()

print(df_music) 

Duplicates removed: 0, After using .drop_duplicates.

        user_id  total play                                  Artist    genre  \
0      BF6EA5AF   92.851388                              Marina Rei      pop   
1      FB1E568E  282.981000                            Stive Morgan  ambient   
3      EF15C7BA    8.966000                                     NaN    dance   
4      82F52E69  193.776327                                  Rixton      pop   
5      4166D680    3.007000  Henry Hall & His Gleneagles Hotel Band     jazz   
...         ...         ...                                     ...      ...   
67957  18510741  109.000000                             Steel Pulse   reggae   
67958  2E27DF51  220.551837                            Nadine Coyle      pop   
67959  4F29D4D5   26.127000                            Digital Hero    dance   
67960  26B7058C  292.455000                                 Red God    metal   
67961  DB0038A8   11.529112                            Less Chapel

To rearrange the indexes in order to follow sequence it is used _.reset_index()_

In [15]:
df_music = df_music.drop_duplicates().reset_index()
print(df_music)


       level_0  index   user_id  total play  \
0            0      0  BF6EA5AF   92.851388   
1            1      1  FB1E568E  282.981000   
2            2      3  EF15C7BA    8.966000   
3            3      4  82F52E69  193.776327   
4            4      5  4166D680    3.007000   
...        ...    ...       ...         ...   
62645    62645  67957  18510741  109.000000   
62646    62646  67958  2E27DF51  220.551837   
62647    62647  67959  4F29D4D5   26.127000   
62648    62648  67960  26B7058C  292.455000   
62649    62649  67961  DB0038A8   11.529112   

                                       Artist    genre                   track  
0                                  Marina Rei      pop                  Musica  
1                                Stive Morgan  ambient             Love Planet  
2                                         NaN    dance     Loving Every Minute  
3                                      Rixton      pop  Me And My Broken Heart  
4      Henry Hall & His Glenea

In order to delete the column index that was added due to the use of _.drop_duplicate()_, it is used _.reset_index(drop=True)_

In [16]:
import pandas as pd

df_music = pd.read_csv('DataSets/music_log.csv')

df_music = df_music.drop_duplicates().reset_index(drop=True) 
print(df_music)

        user_id  total play                                  Artist    genre  \
0      BF6EA5AF   92.851388                              Marina Rei      pop   
1      FB1E568E  282.981000                            Stive Morgan  ambient   
2      EF15C7BA    8.966000                                     NaN    dance   
3      82F52E69  193.776327                                  Rixton      pop   
4      4166D680    3.007000  Henry Hall & His Gleneagles Hotel Band     jazz   
...         ...         ...                                     ...      ...   
62645  18510741  109.000000                             Steel Pulse   reggae   
62646  2E27DF51  220.551837                            Nadine Coyle      pop   
62647  4F29D4D5   26.127000                            Digital Hero    dance   
62648  26B7058C  292.455000                                 Red God    metal   
62649  DB0038A8   11.529112                            Less Chapell      pop   

                        track  
0      

##### __Implicit duplicates__

These are rows or entries that are not exactly identical but represent the same information due to typos, case differences, or variations in formatting.

They require additional processing to identify, such as:

Normalizing data (e.g., converting to lowercase).

Using string similarity measures (e.g., Levenshtein distance).

Domain-specific rules to determine equivalence.

##### _.unique() & .nunique()_

In [12]:
import pandas as pd

rating = ['date', 'name', 'points']
players = [
        ['2018.01.01',  'Rafael Nadal', 10645],
        ['2018.01.08',  'Rafael Nadal', 10600],
        ['2018.01.29',  'Rafael Nadal', 9760],
        ['2018.02.19',  'Roger Federer', 10105], 
        ['2018.03.05',  'Roger Federer', 10060],
        ['2018.03.19',  'Roger Federerr', 9660],
        ['2018.04.02',  'Rafael Nadal Parera', 8770],
        ['2018.06.18',  'Roger Fedrer', 8920],
        ['2018.06.25',  'Rafael Nadal Parera', 8770],
        ['2018.07.16',  'Rafael Nadal Parera', 9310],
        ['2018.08.13',  'Rafael Nadal Parera', 10220],
        ['2018.08.20',  'Rafael Nadal Parera', 10040],
        ['2018.09.10',  'Rafael Nadal Parera', 8760],
        ['2018.10.08',  'Rafael Nadal Parera', 8260],
        ['2018.10.15',  'Rafael Nadal Parera', 7660],
        ['2018.11.05',  'Novak Djokovic', 8045],
        ['2018.11.19',  'Novak Djokovic', 9045]
]

df_tennis = pd.DataFrame(data=players, columns=rating)
print(df_tennis)
print()

print(f"implicit duplicates: \n{df_tennis['name'].unique()}")
print()
print(f"Number of unique values: {df_tennis['name'].nunique()}")


          date                 name  points
0   2018.01.01         Rafael Nadal   10645
1   2018.01.08         Rafael Nadal   10600
2   2018.01.29         Rafael Nadal    9760
3   2018.02.19        Roger Federer   10105
4   2018.03.05        Roger Federer   10060
5   2018.03.19       Roger Federerr    9660
6   2018.04.02  Rafael Nadal Parera    8770
7   2018.06.18         Roger Fedrer    8920
8   2018.06.25  Rafael Nadal Parera    8770
9   2018.07.16  Rafael Nadal Parera    9310
10  2018.08.13  Rafael Nadal Parera   10220
11  2018.08.20  Rafael Nadal Parera   10040
12  2018.09.10  Rafael Nadal Parera    8760
13  2018.10.08  Rafael Nadal Parera    8260
14  2018.10.15  Rafael Nadal Parera    7660
15  2018.11.05       Novak Djokovic    8045
16  2018.11.19       Novak Djokovic    9045

implicit duplicates: 
['Rafael Nadal' 'Roger Federer' 'Roger Federerr' 'Rafael Nadal Parera'
 'Roger Fedrer' 'Novak Djokovic']

Number of unique values: 6


##### _.replace()_

In [14]:
df_tennis["name"] = df_tennis["name"].replace("Roger Federerr", "Roger Federer")
df_tennis["name"] = df_tennis["name"].replace("Roger Fedrer", "Roger Federer")
df_tennis["name"] = df_tennis["name"].replace("Rafael Nadal", "Rafael Nadal Parera")
# df_tennis["name"] = df_tennis["name"].replace("Roger Federerr", "Roger Federer", inplace = True)
# df_tennis["name"] = df_tennis["name"].replace("Roger Fedrer", "Roger Federer", inplace = True)
# df_tennis["name"] = df_tennis["name"].replace("Rafael Nadal", "Rafael Nadal Parera", inplace = True)

In [None]:
import pandas as pd

def replace_wrong_values(dt_frm, clmn_nm, wrng_vls, crct_vl):
        
    for wrong_value in wrng_vls:
        
        dt_frm[clmn_nm] = dt_frm[clmn_nm].replace(wrong_value, crct_vl)
    
    return dt_frm

rating = ['date', 'name', 'points']

players = [
        ['2018.01.01',  'Rafael Nadal', 10645],
        ['2018.01.08',  'Rafael Nadal', 10600],
        ['2018.01.29',  'Rafael Nadal', 9760],
        ['2018.02.19',  'Roger Federer', 10105], 
        ['2018.03.05',  'Roger Federer', 10060],
        ['2018.03.19',  'Roger Federerr', 9660],
        ['2018.04.02',  'Rafael Nadal Parera', 8770],
        ['2018.06.18',  'Roger Fedrer', 8920],
        ['2018.06.25',  'Rafael Nadal Parera', 8770],
        ['2018.07.16',  'Rafael Nadal Parera', 9310],
        ['2018.08.13',  'Rafael Nadal Parera', 10220],
        ['2018.08.20',  'Rafael Nadal Parera', 10040],
        ['2018.09.10',  'Rafael Nadal Parera', 8760],
        ['2018.10.08',  'Rafael Nadal Parera', 8260],
        ['2018.10.15',  'Rafael Nadal Parera', 7660],
        ['2018.11.05',  'Novak Djokovic', 8045],
        ['2018.11.19',  'Novak Djokovic', 9045]
]
df_tennis = pd.DataFrame(data=players, columns=rating)

duplicates = ["Roger Federerr", "Roger Fedrer"]
name = "Roger Federer"
df_tennis = replace_wrong_values(df_tennis, "name", duplicates, name)
print(df_tennis)

          date                 name  points
0   2018.01.01         Rafael Nadal   10645
1   2018.01.08         Rafael Nadal   10600
2   2018.01.29         Rafael Nadal    9760
3   2018.02.19        Roger Federer   10105
4   2018.03.05        Roger Federer   10060
5   2018.03.19        Roger Federer    9660
6   2018.04.02  Rafael Nadal Parera    8770
7   2018.06.18        Roger Federer    8920
8   2018.06.25  Rafael Nadal Parera    8770
9   2018.07.16  Rafael Nadal Parera    9310
10  2018.08.13  Rafael Nadal Parera   10220
11  2018.08.20  Rafael Nadal Parera   10040
12  2018.09.10  Rafael Nadal Parera    8760
13  2018.10.08  Rafael Nadal Parera    8260
14  2018.10.15  Rafael Nadal Parera    7660
15  2018.11.05       Novak Djokovic    8045
16  2018.11.19       Novak Djokovic    9045


##### __Implicit Duplicated Cleanse__

In [14]:
import pandas as pd

df_music = pd.read_csv('DataSets/music_log_raw.csv')


List the unique genre

In [15]:
df_music = df_music.dropna(subset=["genre"])
print(df_music["genre"].unique())
print()

['pop' 'ambient' 'dance' 'jazz' 'classicmetal' 'electronic' 'indie'
 'hiphop' 'spoken' 'new' 'latin' 'extrememetal' 'instrumental' 'classical'
 'alternative' 'rock' 'german' 'french' 'metal' 'dubstep' 'house'
 'miscellaneous' 'rap' 'world' 'country' 'punk' 'rusrap' 'rnb' 'beats'
 'ukrrock' 'inspiritual' 'ruspop' 'caucasian' 'rusrock' 'dub' 'soundtrack'
 'folk' 'shanson' 'fairytail' 'hard-n-heavy' 'romance' 'religious'
 'hardcore' 'orchestral' 'minimal' 'film' 'spiritual' 'melodic' 'trance'
 'comedy' 'reggae' 'deep' 'mpb' 'techno' 'reggaeton' 'singer' 'karaoke'
 'children' 'adult' 'western' 'psychedelic' 'grime' 'christian' 'holiday'
 'argentinetango' 'disco' 'lounge' 'urban' 'local' 'progressive' 'other'
 'funk' 'blues' 'easy' 'dancehall' 'tatar' 'conjazz' 'drum' 'chill' 'jpop'
 'fitness' 'gospel' 'brazilian' 'vocal' 'chanson' 'gothic' 'irish' 'k-pop'
 'acoustic' 'industrial' 'numetal' 'soul' 'experimental' 'relax' 'mexican'
 'videogame' 'glitch' 'worldmusic' 'postrock' 'folkmetal' 'bo

List and preview the possible implicit duplicated

In [16]:
identical = []
gnr_cmprd = []

for gnr in df_music["genre"].unique():
      
    if gnr in gnr_cmprd:
        
        continue
    
    else:
        
        if gnr not in identical:
        
            identical.append(gnr)
    
        for cmpr in df_music["genre"].unique():
            
            if gnr in cmpr and cmpr not in identical and cmpr not in gnr_cmprd:
            
                identical.append(cmpr)
    
        identical = list(set(identical))
        gnr_cmprd.extend(identical)
            
    print(identical)
    identical = []
        
        

['jpop', 'k-pop', 'cantopop', 'electropop', 'ruspop', 'pop', 'synthpop', 'indipop', 'dancepop', 'asiapop', 'mandopop']
['ambient']
['dancehall', 'dance']
['tradjazz', 'jazz', 'conjazz', 'nujazz']
['classicmetal']
['electronic']
['indie']
['hiphop']
['spoken']
['new', 'newage', 'newwave']
['latin', 'latino']
['extrememetal']
['instrumental']
['classical']
['alternative', 'alternativepunk']
['rock', 'deutscherock', 'rockabilly', 'ukrrock', 'skarock', 'synthrock', 'rusrock', 'folkrock', 'stonerrock', 'deutschrock', 'postrock']
['german']
['french']
['folkmetal', 'epicmetal', 'numetal', 'metal', 'metalcore', 'progmetal']
['dubstep']
['house', 'chillhouse']
['miscellaneous']
['rap', 'rusrap']
['world', 'worldmusic']
['country']
['punk']
['rnb']
['beats']
['inspiritual']
['caucasian']
['dub']
['soundtrack']
['folk', 'eurofolk', 'folklore']
['shanson']
['fairytail']
['hard-n-heavy']
['romance']
['religious']
['hardcore', 'posthardcore']
['orchestral']
['minimal']
['film']
['spiritual']
['melo

Function to separate implicit duplicated

In [17]:
def genre_correction(gnr_c, implct_dplcts_c):
    
    position = gnr_c.find(implct_dplcts_c)
    
    gnr_c = gnr_c[:position] + "-" + gnr_c[position:]
    
    return gnr_c

Function to correct implicit duplicated

In [18]:
def replace_implicit_duplicated(dt_frm, clmn_nm):
    
    implct_dplcts = ["pop", "jazz", "rock", "metal", "house", "rap", "folk", "hardcore", "trance"]
    
    for gnr in dt_frm[clmn_nm]:
        
        if "'" in gnr:
            
            dt_frm[clmn_nm] = dt_frm[clmn_nm].replace(gnr, gnr.replace("'", "-"))
        
        else:
            
            for id_gnr in implct_dplcts:
            
                if gnr == id_gnr:
    
                    continue
                
                else:
                    
                    if gnr.endswith(id_gnr):
                        
                        dt_frm[clmn_nm] = dt_frm[clmn_nm].replace(gnr, genre_correction(gnr, id_gnr))
                
    return dt_frm

In [19]:
df_music = replace_implicit_duplicated(df_music, "genre")
print(df_music)

        user_id  total play        Artist    genre                   track
0      BF6EA5AF   92.851388    Marina Rei      pop                  Musica
1      FB1E568E  282.981000  Stive Morgan  ambient             Love Planet
2      FB1E568E  282.981000  Stive Morgan  ambient             Love Planet
3      EF15C7BA    8.966000           NaN    dance     Loving Every Minute
4      82F52E69  193.776327        Rixton      pop  Me And My Broken Heart
...         ...         ...           ...      ...                     ...
67957  18510741  109.000000   Steel Pulse   reggae           Chant A Psalm
67958  2E27DF51  220.551837  Nadine Coyle      pop           Girls On Fire
67959  4F29D4D5   26.127000  Digital Hero    dance               The Model
67960  26B7058C  292.455000       Red God    metal               Действуй!
67961  DB0038A8   11.529112  Less Chapell      pop                    Home

[64661 rows x 5 columns]
