# Data Cleaning

Une fois qu'on a identifié les problèmes dans nos données, on procède à leur nettoyage.

Cela consiste en :
- la suppression des données dupliquées
- le traitement des valeurs manquantes
- la suppression des valeurs aberrantes

Cette étape est cruciale pour garantir la qualité et la fiabilité des données avant de les utiliser pour l'analyse et la modélisation.

## Imports

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
import scipy

import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed

# change style
plt.style.use('ggplot')
import librosa
import librosa.display
from tqdm import tqdm
from tools import *

tqdm.pandas()
import os

import random
from datetime import datetime

from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
from params import SOUNDS_DATASET_PATH, SAMPLE_RATE, CLASS_COLORS
from tools import play_audio, load_audio_file, pad_signal


## Dataset

### Load

In [None]:
# Date stamp embedded in the name of the features CSV to load.
now_day_str = "20230425"
dataset_csv_path = os.path.join(SOUNDS_DATASET_PATH, f'dataset_features_extracted_{now_day_str}.csv')
print("Dataset path: ", dataset_csv_path)
df_drums = pd.read_csv(dataset_csv_path)

# Index the rows by audio file path (the 'file_path' column becomes the index).
df_drums.set_index('file_path', inplace=True)
print("Dataset shape: ", df_drums.shape)
df_drums

Dataset path:  G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\dataset_features_extracted_20230425.csv
Dataset shape:  (10305, 100)


Unnamed: 0_level_0,file_name,file_extension,class,split,duration,rms_log_sum,rms_log_mean,rms_log_max,rms_log_std,rms_log_diff_abs_mean,...,spec_cent_mean,spec_cent_std,spec_bw_mean,spec_bw_std,spec_flatness_mean,spec_flatness_std,spec_rolloff_mean,spec_rolloff_std,spec_contrast_mean,spec_contrast_std
file_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\KAMAL - Conga (1).wav,KAMAL - Conga (1),.wav,Conga,train,0.221905,-36.586320,-1.829316,-0.739824,1.237576,0.269076,...,2771.188810,2460.565926,3513.523659,1573.305322,0.011382,0.027191,322.998047,135.162666,16.910134,6.121380
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\KAMAL - Conga (2).wav,KAMAL - Conga (2),.wav,Conga,test,0.075170,-5.535370,-0.790767,-0.269122,0.479259,0.253621,...,5363.585376,1400.401081,6281.286529,869.637190,0.006250,0.002458,461.425781,25.366763,13.454006,5.019219
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\KAMAL - Conga (3).wav,KAMAL - Conga (3),.wav,Conga,train,0.086417,-6.860503,-0.857563,-0.265424,0.504127,0.226923,...,1688.612761,562.744633,2684.366488,708.933286,0.000881,0.001732,406.439209,12.908702,15.434837,6.394566
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\KAMAL - Conga (4).wav,KAMAL - Conga (4),.wav,Conga,train,0.063810,-9.422018,-1.570336,-0.670527,0.721422,0.379499,...,3074.094357,675.055216,3106.637318,440.107261,0.005416,0.005467,509.619141,152.600726,15.496504,6.475287
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\KAMAL - Conga (5).wav,KAMAL - Conga (5),.wav,Conga,train,0.258231,-23.961166,-1.041790,-0.465791,0.610694,0.136201,...,3994.841115,1705.392297,4126.019895,665.609861,0.001800,0.003827,197.543733,17.565161,20.689341,9.458610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\Mid Seed Shaker 9.aif,Mid Seed Shaker 9,.aif,Shaker,train,0.672880,-172.912320,-2.981247,-0.822484,1.529523,0.173720,...,8552.850170,1995.923836,4688.603735,1248.938149,0.154277,0.100315,3548.523370,1844.713284,13.433769,5.270034
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\Mid Seed Shaker 10.aif,Mid Seed Shaker 10,.aif,Shaker,train,0.929456,-273.343200,-3.374607,-0.728820,1.568116,0.125416,...,9341.562555,2284.763279,5029.911848,1418.301615,0.202218,0.129527,3778.412543,2690.802937,13.046287,5.105602
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\Mid Seed Shaker 11.aif,Mid Seed Shaker 11,.aif,Shaker,train,0.875011,-244.603880,-3.218472,-0.734002,1.524488,0.125789,...,9270.489384,1901.198482,5027.825233,1365.491043,0.198355,0.105156,3781.343801,2479.147790,12.827034,5.013560
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\Mid Seed Shaker 12.aif,Mid Seed Shaker 12,.aif,Shaker,train,0.875011,-246.982820,-3.249774,-0.720161,1.605657,0.150438,...,9135.135614,2430.710085,4792.150083,1420.777824,0.175519,0.128083,3836.593467,2557.461554,13.166101,5.536690


### Columns

In [None]:
print(f'Nombre de colonnes: {len(df_drums.columns)}')
df_drums.columns

Nombre de colonnes: 100


Index(['file_name', 'file_extension', 'class', 'split', 'duration',
       'rms_log_sum', 'rms_log_mean', 'rms_log_max', 'rms_log_std',
       'rms_log_diff_abs_mean', 'rms_log_crest_factor', 'zcr_sum', 'zcr_mean',
       'zcr_std', 'zcr_loudest_frame', 'temp_cent', 'attack_time', 'pitch',
       'mfcc_mean_1', 'mfcc_mean_2', 'mfcc_mean_3', 'mfcc_mean_4',
       'mfcc_mean_5', 'mfcc_mean_6', 'mfcc_mean_7', 'mfcc_mean_8',
       'mfcc_mean_9', 'mfcc_mean_10', 'mfcc_mean_11', 'mfcc_mean_12',
       'mfcc_std_1', 'mfcc_std_2', 'mfcc_std_3', 'mfcc_std_4', 'mfcc_std_5',
       'mfcc_std_6', 'mfcc_std_7', 'mfcc_std_8', 'mfcc_std_9', 'mfcc_std_10',
       'mfcc_std_11', 'mfcc_std_12', 'mfcc_max_1', 'mfcc_max_2', 'mfcc_max_3',
       'mfcc_max_4', 'mfcc_max_5', 'mfcc_max_6', 'mfcc_max_7', 'mfcc_max_8',
       'mfcc_max_9', 'mfcc_max_10', 'mfcc_max_11', 'mfcc_max_12', 'mfcc_min_1',
       'mfcc_min_2', 'mfcc_min_3', 'mfcc_min_4', 'mfcc_min_5', 'mfcc_min_6',
       'mfcc_min_7', 'mfcc_min_8', 'm

In [None]:
columns_by_prefix = get_columns_by_prefix_(df_drums.columns)
columns_by_prefix

{'split': ['split'],
 'duration': ['duration'],
 'temp': ['temp_cent'],
 'pitch': ['pitch'],
 'class': ['class'],
 'file': ['file_name', 'file_extension'],
 'rms': ['rms_log_sum',
  'rms_log_mean',
  'rms_log_max',
  'rms_log_std',
  'rms_log_diff_abs_mean',
  'rms_log_crest_factor'],
 'spec': ['spec_cent_mean',
  'spec_cent_std',
  'spec_bw_mean',
  'spec_bw_std',
  'spec_flatness_mean',
  'spec_flatness_std',
  'spec_rolloff_mean',
  'spec_rolloff_std',
  'spec_contrast_mean',
  'spec_contrast_std'],
 'zcr': ['zcr_sum', 'zcr_mean', 'zcr_std', 'zcr_loudest_frame'],
 'mfcc': ['mfcc_mean_1',
  'mfcc_mean_2',
  'mfcc_mean_3',
  'mfcc_mean_4',
  'mfcc_mean_5',
  'mfcc_mean_6',
  'mfcc_mean_7',
  'mfcc_mean_8',
  'mfcc_mean_9',
  'mfcc_mean_10',
  'mfcc_mean_11',
  'mfcc_mean_12',
  'mfcc_std_1',
  'mfcc_std_2',
  'mfcc_std_3',
  'mfcc_std_4',
  'mfcc_std_5',
  'mfcc_std_6',
  'mfcc_std_7',
  'mfcc_std_8',
  'mfcc_std_9',
  'mfcc_std_10',
  'mfcc_std_11',
  'mfcc_std_12',
  'mfcc_max_1',
 

#### Features columns

In [None]:
# Feature columns = every column whose dtype is exactly float64 or int64,
# in the same order as they appear in the dataframe.
features_columns = [
    column_name
    for column_name, column_dtype in df_drums.dtypes.items()
    if column_dtype == 'float64' or column_dtype == 'int64'
]
print(f"Nombre de features: {len(features_columns)}")

Nombre de features: 96


In [None]:
df_drums[features_columns].head()

Unnamed: 0_level_0,duration,rms_log_sum,rms_log_mean,rms_log_max,rms_log_std,rms_log_diff_abs_mean,rms_log_crest_factor,zcr_sum,zcr_mean,zcr_std,...,spec_cent_mean,spec_cent_std,spec_bw_mean,spec_bw_std,spec_flatness_mean,spec_flatness_std,spec_rolloff_mean,spec_rolloff_std,spec_contrast_mean,spec_contrast_std
file_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\KAMAL - Conga (1).wav,0.221905,-36.58632,-1.829316,-0.739824,1.237576,0.269076,1.089492,0.790039,0.039502,0.034451,...,2771.18881,2460.565926,3513.523659,1573.305322,0.011382,0.027191,322.998047,135.162666,16.910134,6.12138
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\KAMAL - Conga (2).wav,0.07517,-5.53537,-0.790767,-0.269122,0.479259,0.253621,0.521645,0.193359,0.027623,0.007228,...,5363.585376,1400.401081,6281.286529,869.63719,0.00625,0.002458,461.425781,25.366763,13.454006,5.019219
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\KAMAL - Conga (3).wav,0.086417,-6.860503,-0.857563,-0.265424,0.504127,0.226923,0.592139,0.15625,0.019531,0.003239,...,1688.612761,562.744633,2684.366488,708.933286,0.000881,0.001732,406.439209,12.908702,15.434837,6.394566
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\KAMAL - Conga (4).wav,0.06381,-9.422018,-1.570336,-0.670527,0.721422,0.379499,0.899809,0.317383,0.052897,0.01718,...,3074.094357,675.055216,3106.637318,440.107261,0.005416,0.005467,509.619141,152.600726,15.496504,6.475287
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\KAMAL - Conga (5).wav,0.258231,-23.961166,-1.04179,-0.465791,0.610694,0.136201,0.575999,0.967773,0.042077,0.028791,...,3994.841115,1705.392297,4126.019895,665.609861,0.0018,0.003827,197.543733,17.565161,20.689341,9.45861


### Overview

In [None]:
df_drums.info(verbose=2)

<class 'pandas.core.frame.DataFrame'>
Index: 10305 entries, G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\KAMAL - Conga (1).wav to G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\Mid Seed Shaker 13.aif
Data columns (total 100 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   file_name              10305 non-null  object 
 1   file_extension         10305 non-null  object 
 2   class                  10305 non-null  object 
 3   split                  10305 non-null  object 
 4   duration               10300 non-null  float64
 5   rms_log_sum            10300 non-null  float64
 6   rms_log_mean           10300 non-null  float64
 7   rms_log_max            10300 non-null  float64
 8   rms_log_std            10300 non-null  float64
 9   rms_log_diff_abs_mean  10300 non-null  float64
 10  rms_log_crest_factor   10300 non-null  float64
 11  zcr_sum                10300

In [None]:
df_drums.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,10300.0,1.505467,2.334635,4.643991e-02,0.295142,0.600147,1.699575,45.354331
rms_log_sum,10300.0,-335.957882,703.701301,-1.970231e+04,-315.016370,-102.537987,-38.732532,19.588580
rms_log_mean,10300.0,-2.080884,1.048473,-1.622318e+01,-2.701744,-1.946362,-1.309877,0.230454
rms_log_max,10300.0,-0.462193,0.253207,-2.434703e+00,-0.610743,-0.461931,-0.254773,0.436995
rms_log_std,10300.0,1.030672,0.560163,2.827795e-02,0.659087,0.929251,1.304707,9.146350
...,...,...,...,...,...,...,...,...
spec_flatness_std,10300.0,0.087199,0.123421,5.068475e-11,0.004044,0.026232,0.128516,0.500000
spec_rolloff_mean,10300.0,1348.715422,1647.901117,6.258669e-01,101.793324,742.100830,1975.289346,13330.109579
spec_rolloff_std,10300.0,555.361343,813.222898,0.000000e+00,49.785377,271.681785,797.926032,8177.052036
spec_contrast_mean,10300.0,15.725966,2.575864,7.243022e+00,14.115786,15.240733,16.761845,31.094175


## Missing Values

In [None]:
# get rows with all NaN values
df_drums[df_drums.isna().all(axis=1)]

Unnamed: 0_level_0,file_name,file_extension,class,split,duration,rms_log_sum,rms_log_mean,rms_log_max,rms_log_std,rms_log_diff_abs_mean,...,spec_cent_mean,spec_cent_std,spec_bw_mean,spec_bw_std,spec_flatness_mean,spec_flatness_std,spec_rolloff_mean,spec_rolloff_std,spec_contrast_mean,spec_contrast_std
file_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [None]:
df_drums.isna().sum()

file_name             0
file_extension        0
class                 0
split                 0
duration              5
                     ..
spec_flatness_std     5
spec_rolloff_mean     5
spec_rolloff_std      5
spec_contrast_mean    5
spec_contrast_std     5
Length: 100, dtype: int64

On affiche les lignes qui contiennent des valeurs manquantes :
    - dans au moins une colonne features.
    - dans toutes les colonnes features.

In [None]:
df_drums[df_drums[features_columns].isnull().any(axis=1)].loc[:, ["file_name", "file_extension", "class"]]

Unnamed: 0_level_0,file_name,file_extension,class
file_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\808 Conga.wav,808 Conga,.wav,Conga
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\Afro 808 Conga.wav,Afro 808 Conga,.wav,Conga
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Hat Open (Cymbal)\[OPEN-HATS] Nxsty UK Drill Kit V2 (1).wav,[OPEN-HATS] Nxsty UK Drill Kit V2 (1),.wav,Hat Open (Cymbal)
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Ride (Cymbal)\VIP PRODUCERS - FPC_Ride_FDrk_004.wav,VIP PRODUCERS - FPC_Ride_FDrk_004,.wav,Ride (Cymbal)
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Ride (Cymbal)\VIP PRODUCERS - FPC_Ride_GLite_004.wav,VIP PRODUCERS - FPC_Ride_GLite_004,.wav,Ride (Cymbal)


In [None]:
df_drums[df_drums[features_columns].isnull().all(axis=1)].loc[:, ["file_name", "file_extension", "class"]]

Unnamed: 0_level_0,file_name,file_extension,class
file_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\808 Conga.wav,808 Conga,.wav,Conga
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\Afro 808 Conga.wav,Afro 808 Conga,.wav,Conga
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Hat Open (Cymbal)\[OPEN-HATS] Nxsty UK Drill Kit V2 (1).wav,[OPEN-HATS] Nxsty UK Drill Kit V2 (1),.wav,Hat Open (Cymbal)
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Ride (Cymbal)\VIP PRODUCERS - FPC_Ride_FDrk_004.wav,VIP PRODUCERS - FPC_Ride_FDrk_004,.wav,Ride (Cymbal)
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Ride (Cymbal)\VIP PRODUCERS - FPC_Ride_GLite_004.wav,VIP PRODUCERS - FPC_Ride_GLite_004,.wav,Ride (Cymbal)


### Process missing values

On va supprimer les lignes qui contiennent des valeurs manquantes dans toutes les colonnes features.

In [None]:
print(f"Dataset shape : {df_drums.shape}")
df_drums.dropna(axis=0, how='all', subset=features_columns, inplace=True)
print(f"Dataset shape: {df_drums.shape}")

Dataset shape : (10305, 100)
Dataset shape: (10300, 100)


## Duplicates
Suppression des doublons

### 1. Duplicates rows

In [None]:
print(f"Nombre de lignes totalement dupliquées : {df_drums.duplicated().sum()}")

Nombre de lignes totalement dupliquées : 0


In [None]:
duplicated_focus_on_features = df_drums.duplicated(subset=features_columns)
print(
    f"Nombre de lignes dupliquées (focus on features) : {duplicated_focus_on_features.sum()} lignes (qu'on peut potentiellement supprimer)")

# print per class
df_drums[duplicated_focus_on_features].groupby("class").count()["file_name"].sort_values(ascending=False)

Nombre de lignes dupliquées (focus on features) : 246 lignes (qu'on peut potentiellement supprimer)


class
Bell                   103
808                     44
Clap                    25
Triangle                18
Crash (Cymbal)          15
Shaker                  11
Snap                    10
Kick                     9
Hat Open (Cymbal)        8
Hat Closed (Cymbal)      2
Conga                    1
Name: file_name, dtype: int64

In [None]:
duplicated_focus_on_features = df_drums.duplicated(subset=features_columns, keep=False)
# Afficher les lignes dupliquées (toutes les copies)
duplicates_df = df_drums[duplicated_focus_on_features].sort_values(by=features_columns)
duplicates_df

Unnamed: 0_level_0,file_name,file_extension,class,split,duration,rms_log_sum,rms_log_mean,rms_log_max,rms_log_std,rms_log_diff_abs_mean,...,spec_cent_mean,spec_cent_std,spec_bw_mean,spec_bw_std,spec_flatness_mean,spec_flatness_std,spec_rolloff_mean,spec_rolloff_std,spec_contrast_mean,spec_contrast_std
file_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Clap\Clap 0872.flac,Clap 0872,.flac,Clap,train,0.046440,-4.019799,-0.803960,-0.643738,0.162203,0.158863,...,4626.470339,426.147392,4128.750167,47.113411,0.045543,0.013888,1016.367188,125.854158,11.954943,4.722839
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Clap\Clap 0875.flac,Clap 0875,.flac,Clap,train,0.046440,-4.019799,-0.803960,-0.643738,0.162203,0.158863,...,4626.470339,426.147392,4128.750167,47.113411,0.045543,0.013888,1016.367188,125.854158,11.954943,4.722839
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\Mun Roe ( DB ) Shaker 1.wav,Mun Roe ( DB ) Shaker 1,.wav,Shaker,train,0.080000,-5.809393,-0.829913,-0.584962,0.225188,0.180874,...,5933.449028,45.388188,3120.448791,69.453728,0.018186,0.002066,3294.580078,82.999655,16.146953,8.335257
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\Shaker 4 (3).wav,Shaker 4 (3),.wav,Shaker,train,0.080000,-5.809393,-0.829913,-0.584962,0.225188,0.180874,...,5933.449028,45.388188,3120.448791,69.453728,0.018186,0.002066,3294.580078,82.999655,16.146953,8.335257
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\Shaker (84).wav,Shaker (84),.wav,Shaker,train,0.089456,-13.053484,-1.631685,-1.033871,0.477110,0.322956,...,11033.629294,1655.514185,4920.181711,451.140152,0.150005,0.045579,5151.818848,1894.669516,15.115996,7.899844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Triangle\STEEZE TRIANGLE 1.wav,STEEZE TRIANGLE 1,.wav,Triangle,train,8.432018,-3041.128200,-4.183120,-1.158522,0.526799,0.018816,...,15631.950113,1692.451759,4727.804466,537.949495,0.062326,0.020171,10972.455430,2412.069283,17.030874,13.472127
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Bell\Bell 02.wav,Bell 02,.wav,Bell,train,10.102041,-2368.555400,-2.719352,-0.553327,1.333677,0.041081,...,698.902792,108.795627,609.081676,160.199324,0.000689,0.004254,144.527102,76.609446,28.551427,9.587233
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Bell\[BELLS] Nxsty UK Drill Kit V2 (1).wav,[BELLS] Nxsty UK Drill Kit V2 (1),.wav,Bell,train,10.102041,-2368.555400,-2.719352,-0.553327,1.333677,0.041081,...,698.902792,108.795627,609.081676,160.199324,0.000689,0.004254,144.527102,76.609446,28.551427,9.587233
G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Bell\Bell 04.wav,Bell 04,.wav,Bell,train,10.105261,-3014.280000,-3.460712,-0.661780,1.353540,0.069367,...,1668.926664,171.815897,1068.841092,79.961527,0.023244,0.120804,812.377790,326.453852,24.974493,9.142261


In [None]:
# Group the duplicated rows by their complete feature vector: rows that share
# the same value in every feature column end up in the same group.
grouped_duplicates = duplicates_df.groupby(features_columns)

# For each group, collect the index labels (file paths) of its members.
duplicate_groups = [list(members.index) for _, members in grouped_duplicates]

# Print the groups, numbered from 1, one file path per line.
for group_number, group_paths in enumerate(duplicate_groups, start=1):
    print(f"# Duplicate Group {group_number}:")
    for path in group_paths:
        print(f"  - {path}")
    print()

# Duplicate Group 1:
  - G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Clap\Clap 0872.flac
  - G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Clap\Clap 0875.flac

# Duplicate Group 2:
  - G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\Mun Roe ( DB ) Shaker 1.wav
  - G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\Shaker 4 (3).wav

# Duplicate Group 3:
  - G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\Shaker (84).wav
  - G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\Shaker (85).wav

# Duplicate Group 4:
  - G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\SHAKER (2).wav
  - G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\CNN - Shaker.wav
  - G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Shaker\Shaker (2).wav
  - G:\Shared drives\PFE - ING3 Mlamali\DrumClassifi

#### Delete duplicate rows values

Maintenant qu'on a identifié les lignes dupliquées, on va les supprimer en ne gardant qu'une seule copie par groupe de doublons.

In [None]:
# File paths (index labels) of the duplicated rows that will be removed.
duplicates_idx_to_delete = []
# Maps each file path marked for deletion to the 1-based number of its group.
file_to_delete_num_group_map = {}


def file_name_len(file_path):
    """Return the number of characters in the base name of *file_path*."""
    return len(os.path.basename(file_path))


# Within each group of duplicates, keep one file and mark the others for deletion.
for num_group, group in enumerate(duplicate_groups, start=1):
    # Shortest file name first; sorted() is stable, so names of equal length
    # keep the order they have in the group.
    sorted_group = sorted(group, key=file_name_len)

    # sorted_group[0] is the copy that is kept; everything after it is deleted.
    files_to_delete = sorted_group[1:]
    duplicates_idx_to_delete.extend(files_to_delete)

    # Remember which duplicate group each deleted file came from.
    for file_path in files_to_delete:
        file_to_delete_num_group_map[file_path] = num_group

print(f"{len(set(duplicates_idx_to_delete))} lignes dupliquées à supprimer")

246 lignes dupliquées à supprimer


In [None]:
# Remove the rows marked for deletion; the labels are file paths, which match
# the dataframe index. The list is passed directly (drop accepts list-like labels).
df_drums = df_drums.drop(duplicates_idx_to_delete)

# Sanity check: count the feature-level duplicates that remain after the drop.
print(
    f"Après cleaning, nombre de lignes dupliquées (focus on features) : {df_drums.duplicated(subset=features_columns).sum()} lignes (qu'on peut potentiellement supprimer)")

print("Dataset shape:", df_drums.shape)

Après cleaning, nombre de lignes dupliquées (focus on features) : 0 lignes (qu'on peut potentiellement supprimer)
Dataset shape: (10054, 100)


Sauvegarder dans un fichier csv les données dupliquées (pour les supprimer manuellement)

In [None]:
if not duplicates_idx_to_delete:
    duplicates_idx_to_delete.append("No duplicates")
backup_output_path = os.path.join(SOUNDS_DATASET_PATH, f"__duplicates_rows_{now_day_str}.csv")
backup_series = pd.Series(duplicates_idx_to_delete, name="file_path")
backup_series.to_csv(backup_output_path, index=False, header=True)
print(f"Backup saved to {backup_output_path}")

Backup saved to G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\__duplicates_rows_20230425.csv


### 2. Duplicates file_name

In [None]:
print(
    f"Nombre de lignes dupliquées sur la colonne 'file_name' : {df_drums.duplicated(subset=['file_name', 'file_extension']).sum()} lignes (qu'on peut potentiellement supprimer)")

df_drums[df_drums.duplicated(subset=['file_name', 'file_extension'], keep=False)].sort_values(
    by=['file_name', 'file_extension'])

Nombre de lignes dupliquées sur la colonne 'file_name' : 0 lignes (qu'on peut potentiellement supprimer)


Unnamed: 0_level_0,file_name,file_extension,class,split,duration,rms_log_sum,rms_log_mean,rms_log_max,rms_log_std,rms_log_diff_abs_mean,...,spec_cent_mean,spec_cent_std,spec_bw_mean,spec_bw_std,spec_flatness_mean,spec_flatness_std,spec_rolloff_mean,spec_rolloff_std,spec_contrast_mean,spec_contrast_std
file_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


### 3. Duplicates (too similar) audio

Pour détecter les fichiers audio quasi dupliqués, on compare les caractéristiques audio de chaque paire de fichiers à l'aide d'une mesure de similarité. Si la similarité entre deux fichiers dépasse un seuil donné, on considère qu'ils sont trop similaires, c'est-à-dire dupliqués.

In [None]:
# Euclidean distance, computed with np.linalg.norm
def euclidean_distance(vector1: np.ndarray, vector2: np.ndarray):
    """
    Return the norm of the element-wise difference between two vectors.

    The result is whatever ``np.linalg.norm`` returns for ``vector1 - vector2``.
    """
    difference = vector1 - vector2
    return np.linalg.norm(difference)


# Cosine similarity, computed with np.dot and np.linalg.norm
def cosine_similarity(vector1: np.ndarray, vector2: np.ndarray):
    """
    Return the dot product of the two vectors divided by the product of their norms.

    There is no guard for a zero-norm vector: the denominator is then 0.
    """
    dot_product = np.dot(vector1, vector2)
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return dot_product / norm_product


def similarity(vector1: np.ndarray, vector2: np.ndarray, metric: str = "euclidean"):
    """
    Return a similarity score between two vectors for the chosen metric.

    - ``"cosine"``: the raw cosine similarity, returned unchanged.
    - ``"euclidean"``: ``1 - distance / sqrt(len(vector1))``. Note that
      ``sqrt(n)`` is the largest possible distance only when every component
      lies in [0, 1]; for other inputs the score can be negative.

    Raises ``ValueError`` for any other metric name.
    """
    if metric == "cosine":
        return cosine_similarity(vector1, vector2)

    if metric == "euclidean":
        distance = euclidean_distance(vector1, vector2)
        # Scale the distance by sqrt(number of components) before inverting it.
        return 1 - (distance / np.sqrt(len(vector1)))

    raise ValueError(f"Unknown metric: {metric}")


def compute_similarity_pair(file_i, row_i, df_X, metric="euclidean", threshold: float = None):
    """
    Compare one row against every row of ``df_X`` and keep the close matches.

    Parameters:
        file_i: index label of the reference row.
        row_i: the reference row; must provide ``to_numpy()``.
        df_X: dataframe whose rows are compared to ``row_i`` via ``iterrows()``.
        metric: metric name forwarded to ``similarity``.
        threshold: minimum score to keep a pair. When None it defaults to 0
            for the "cosine" metric and 0.5 for any other metric.

    Returns:
        dict mapping ``(file_i, file_j)`` to the similarity score, for every
        ``file_j != file_i`` whose score is ``>= threshold``.
    """
    if threshold is None:
        threshold = 0 if metric == "cosine" else 0.5

    # The reference vector is the same for every comparison: convert it once.
    vector_i = row_i.to_numpy()

    similarities = {}
    for file_j, row_j in df_X.iterrows():
        if file_i == file_j:
            continue
        # Every key stored here starts with file_i, so only the forward key can
        # already exist (when df_X has repeated index labels); the first score wins.
        if (file_i, file_j) in similarities:
            continue
        sim = similarity(vector_i, row_j.to_numpy(), metric=metric)
        if sim >= threshold:
            similarities[(file_i, file_j)] = sim

    return similarities


def compute_similarities_parallel(df_X: pd.DataFrame, metric: str = "euclidean", threshold: float = None,
                                  n_jobs: int = -1):
    """
    Compute the similarity of every pair of rows of ``df_X`` with joblib.

    One ``compute_similarity_pair`` task is dispatched per row, and each task
    scans the whole dataframe. A pair of rows is therefore produced twice, once
    as (i, j) and once as (j, i); only the first occurrence is kept in the
    merged result.

    Parameters:
        df_X: dataframe of feature vectors, one row per file.
        metric: metric name forwarded to ``compute_similarity_pair``.
        threshold: minimum score forwarded to ``compute_similarity_pair``.
        n_jobs: forwarded to ``joblib.Parallel``.

    Returns:
        dict mapping ``(file_a, file_b)`` to the similarity score, with at most
        one entry per unordered pair.
    """
    results = Parallel(n_jobs=n_jobs)(
        delayed(compute_similarity_pair)(file_i, row_i, df_X, metric=metric, threshold=threshold)
        for file_i, row_i in tqdm(df_X.iterrows(), total=len(df_X))
    )

    similarities = {}
    for partial_result in results:
        for (file_a, file_b), sim in partial_result.items():
            # Skip the mirrored copy of a pair that has already been recorded.
            if (file_b, file_a) in similarities:
                continue
            similarities[(file_a, file_b)] = sim
    return similarities


def compute_df_similarities(df_X: pd.DataFrame, metric: str = "euclidean", threshold: float = None, n_jobs: int = -1):
    """
    Build a dataframe of similar file pairs, sorted by decreasing similarity.

    Calls ``compute_similarities_parallel`` with the given arguments, prints the
    number of pairs found, and returns a dataframe with the columns
    ``similarity``, ``file_i`` and ``file_j``.
    """
    pair_scores = compute_similarities_parallel(df_X, metric=metric, threshold=threshold, n_jobs=n_jobs)

    df_similarities = pd.DataFrame.from_dict(pair_scores, orient='index', columns=['similarity'])
    df_similarities = df_similarities.sort_values(by=['similarity'], ascending=False)

    print(f"Nombre de paires de fichiers audio similaires (similarity > {threshold}) : {len(df_similarities)}")

    # Move the (file_i, file_j) keys from the index into an "index" column,
    # then split them into two separate columns.
    df_similarities = df_similarities.reset_index()
    df_similarities["file_i"] = df_similarities["index"].progress_apply(lambda pair: pair[0])
    df_similarities["file_j"] = df_similarities["index"].progress_apply(lambda pair: pair[1])
    return df_similarities.drop(columns=["index"])


df_similarities = compute_df_similarities(df_drums[features_columns], metric="cosine", threshold=0.85, n_jobs=4)
df_similarities

 69%|██████▉   | 6960/10054 [23:52<11:45,  4.39it/s]

#### Similarity > 0.999

In [None]:
threshold = 0.999725
df_similarities_0_999 = df_similarities.query(f"similarity > {threshold}")

df_similarities_0_999

In [None]:
df_similarities_0_999.describe().T

In [None]:
# Listen to some of the pairs that passed the threshold: first a random sample,
# then the last rows of the table.
# sample() raises ValueError when asked for more rows than exist, so cap it.
n_preview = min(10, len(df_similarities_0_999))
for i, row in df_similarities_0_999.sample(n_preview).iterrows():
    print("#" * 30, f"similarity: {row['similarity']}")
    play_audio(row["file_i"])
    play_audio(row["file_j"])
print("#" * 30, "...")
for i, row in df_similarities_0_999.tail(10).iterrows():
    print("#" * 30, f"similarity: {row['similarity']}")
    play_audio(row["file_i"])
    play_audio(row["file_j"])

#### Delete similar files

In [None]:
# pandas display full table
#pd.set_option('display.max_rows', None)
table_loser_0_999 = pd.Series(
    df_similarities_0_999["file_i"].to_list() + df_similarities_0_999["file_j"].to_list()).value_counts()
table_loser_0_999

In [None]:
# Raw strings: the Windows paths contain backslashes that must not be read as
# escape sequences (\S, \P, \D and \C are invalid escapes in a normal literal).
play_audio(r"G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\Conga (119).wav")
play_audio(r"G:\Shared drives\PFE - ING3 Mlamali\DrumClassifier - Sounds Dataset\Conga\Conga (118).wav")

In [None]:
similar_file_to_delete = []  # files to delete, in the order they were chosen
already_marked = set()  # same files as the list, kept for O(1) membership tests
for i, row in df_similarities_0_999.iterrows():
    file_i, file_j = row["file_i"], row["file_j"]
    # A pair is already resolved once either of its files is marked for deletion.
    if file_i in already_marked or file_j in already_marked:
        continue
    # Delete the file that appears in more similar pairs; on a tie, delete file_i.
    if table_loser_0_999[file_j] > table_loser_0_999[file_i]:
        loser = file_j
    else:
        loser = file_i
    similar_file_to_delete.append(loser)
    already_marked.add(loser)

len(similar_file_to_delete)

In [None]:
# get class value counts in similar_file_to_delete list
similar_class_value_counts = df_drums.loc[df_drums.index.isin(similar_file_to_delete), "class"].value_counts()
similar_class_value_counts

In [None]:
# delete similar files from cleaned_df_drums
if similar_file_to_delete:
    df_drums = df_drums.drop(similar_file_to_delete)
df_drums

On sauvegarde dans un fichier csv la liste des fichiers à supprimer manuellement.

In [None]:
if not similar_file_to_delete:
    similar_file_to_delete.append("x")
backup_output_path = os.path.join(SOUNDS_DATASET_PATH, f"__duplicates_too_similar_{threshold}_{now_day_str}.csv")
backup_series = pd.Series(similar_file_to_delete, name="file_path")
backup_series.to_csv(backup_output_path, index=False, header=True)
print(f"Backup saved to {backup_output_path}")

## Outliers

Un outlier (ou valeur aberrante) est une observation qui se situe à une distance anormalement grande des autres observations dans un ensemble de données. Les outliers peuvent être causés par des erreurs de mesure, des erreurs d'enregistrement, ou par des variations naturelles dans les données. Ils peuvent avoir un impact significatif sur l'analyse et la modélisation des données, en introduisant des biais et en réduisant la performance des modèles prédictifs.

Dans le contexte de notre projet, les outliers peuvent correspondre à des sons de batterie ayant des caractéristiques très différentes des autres sons, qui pourraient rendre difficile la classification ou l'analyse ultérieure.

### Visualisation

In [None]:
df_drums.describe().T

In [None]:
# Number of feature columns drawn in each sub-plot.
chunk_size = 15

# Split the feature columns into chunks.
# NOTE(review): generate_chunks comes from the project's tools module; it
# presumably yields consecutive groups of at most chunk_size columns - confirm.
chunks = list(generate_chunks(features_columns, chunk_size))

# One sub-plot row per chunk.
n_rows = len(chunks)

# squeeze=False always returns a 2-D array of axes, so indexing also works
# when there is a single chunk (otherwise a bare Axes object is returned).
fig, axs = plt.subplots(n_rows, 1, figsize=(15, 8 * n_rows), squeeze=False)

# Draw the boxplots of each chunk on its own axis.
for i, features_columns_chunk in enumerate(chunks):
    df_drums[features_columns_chunk].boxplot(ax=axs[i, 0])

# Show the figure.
plt.show()

### Outliers detection (IQR)

In [None]:
from collections import Counter


def get_outliers_iqr_per_class(df, column, class_column, multiplier=1.75):
    """Return the index labels of the rows that are IQR outliers within their class.

    For every distinct value of ``class_column`` the quartiles of ``column``
    are computed on the rows of that class only. A row is an outlier when its
    value lies strictly below ``Q1 - multiplier * IQR`` or strictly above
    ``Q3 + multiplier * IQR``.

    Args:
        df: Dataframe holding both ``column`` and ``class_column``.
        column: Name of the column whose values are tested.
        class_column: Name of the column used to split the rows into classes.
        multiplier: Factor applied to the IQR to build the bounds.

    Returns:
        A list of index labels, classes taken in order of first appearance.
    """
    flagged = []
    labels = df[class_column]

    for label in labels.unique():
        # Values of the tested column restricted to the current class.
        values = df.loc[labels == label, column]

        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        margin = multiplier * (third_quartile - first_quartile)

        # Strict comparisons: values sitting exactly on a bound are kept.
        too_low = values < first_quartile - margin
        too_high = values > third_quartile + margin
        flagged += values[too_low | too_high].index.tolist()

    return flagged


# Name of the dataframe column that holds the class labels.
class_column = "class"

# For every row label, count in how many feature columns the row was flagged
# as a per-class IQR outlier.
outliers_counter = Counter(
    row_label
    for col in tqdm(features_columns)
    for row_label in get_outliers_iqr_per_class(df_drums, col, class_column)
)

outliers_counter

In [None]:
# Turn the counter into a dataframe indexed by row label (column 0), with the
# number of outlier detections in column 1, most frequently flagged rows first.
# The explicit column labels keep set_index(0) working when the counter is
# empty (an empty input would otherwise produce a dataframe without columns).
outliers_counter_df = (
    pd.DataFrame(list(outliers_counter.items()), columns=[0, 1])
    .set_index(0)
    .sort_values(by=1, ascending=False)
)
outliers_counter_df

La valeur optimale pour limit_outliers_count dépend de la nature de vos données et de la tolérance aux outliers de vos futurs modèles d'apprentissage automatique. En général, il est préférable de commencer avec une valeur conservatrice et d'ajuster en fonction des résultats.

Étant donné que vous avez 10 000 lignes et 95 colonnes dans votre dataframe df_drums, une valeur initiale de limit_outliers_count pourrait être de 10. Cette valeur signifie que vous ne supprimerez que les lignes qui sont considérées comme des outliers dans au moins 10 des 95 colonnes. Cela vous permettra de conserver une grande partie de vos données tout en éliminant les outliers les plus extrêmes.

Cependant, il est important de garder à l'esprit que cette valeur doit être ajustée en fonction de vos observations et de l'impact des outliers sur vos modèles. Vous pouvez expérimenter différentes valeurs de limit_outliers_count et observer comment elles affectent les performances de vos modèles d'apprentissage automatique. Une ligne est supprimée dès qu'elle est détectée comme outlier dans au moins limit_outliers_count colonnes : si vous constatez que les modèles sont sensibles aux outliers, vous pouvez donc diminuer la valeur de limit_outliers_count pour supprimer davantage d'outliers. À l'inverse, si les modèles ne sont pas sensibles aux outliers, vous pouvez utiliser une valeur plus élevée pour conserver davantage de données.

In [None]:
# A row is removed when it is flagged as an outlier in at least half of the feature columns.
limit_outliers_count = int(len(features_columns) * 0.5)
print(f"limit_outliers_count: {limit_outliers_count}")
# Index labels of the rows detected as outliers at least limit_outliers_count times.
outliers_to_remove = outliers_counter_df[outliers_counter_df[1] >= limit_outliers_count].index.tolist()
print(f"Nombre d'outliers à supprimer: {len(outliers_to_remove)}")

# Play a few examples from both ends of the list (outliers_counter_df is
# sorted by decreasing detection count, so these are the most and the least
# frequently flagged rows that will be removed).
n_examples = 5
n_outliers = len(outliers_to_remove)
head_positions = range(min(n_examples, n_outliers))
# Start the tail after the head so that no example is played twice when the
# list holds fewer than 2 * n_examples entries.
tail_positions = range(max(n_examples, n_outliers - n_examples), n_outliers)
for index_outlier in [*head_positions, *tail_positions]:
    print(f"Outlier {index_outlier}/{n_outliers}", end=" ")
    play_audio(outliers_to_remove[index_outlier])

### Delete outliers

In [None]:
# Drop the rows selected as outliers, printing the dataframe shape before and after.
print(f"Dataframe avant nettoyage des outliers: {df_drums.shape}")
df_drums = df_drums.drop(index=outliers_to_remove) if outliers_to_remove else df_drums
print(f"Dataframe nettoyé des outliers: {df_drums.shape}")
df_drums

On sauvegarde la liste des fichiers à supprimer manuellement dans un fichier CSV.

In [None]:
# Save the list of removed outliers to a CSV backup.
# The original cell wrote duplicates_idx_to_delete here (copy-paste from the
# duplicates section), so the "__outliers_" file never contained the outliers.
# The backup is built from a copy so that outliers_to_remove is not polluted
# with the placeholder, which would break a re-run of the drop cell.
backup_entries = list(outliers_to_remove) or ["x"]  # "x" is written when there is nothing to delete
backup_output_path = os.path.join(SOUNDS_DATASET_PATH, f"__outliers_{limit_outliers_count}_{now_day_str}.csv")
backup_series = pd.Series(backup_entries, name="file_path")
backup_series.to_csv(backup_output_path, index=False, header=True)
print(f"Backup saved to {backup_output_path}")

## Save final cleaned dataset

In [None]:
# Write the cleaned feature table, including its file_path index, to the dataset folder.
cleaned_dataset_csv_path = os.path.join(
    SOUNDS_DATASET_PATH,
    f"dataset_cleaned_features_extracted_{now_day_str}.csv",
)
df_drums.to_csv(cleaned_dataset_csv_path, index=True)