## $\textit{Comparisons}$

In [1]:
import librosa
import scipy.signal as ss
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
sns.set_theme()

def extract_mfcc(file_path, n_mfcc=12, win_length=320, hop_length=160):
    wave, sr = librosa.load(file_path, mono=True, sr=None)
    if sr != 16000:
        wave = librosa.resample(wave, sr, 16000)
    mfcc = librosa.feature.mfcc(y=wave, sr=16000, n_mfcc=n_mfcc, window=ss.windows.hamming, win_length=win_length, hop_length=hop_length)
    return mfcc

In [10]:
import glob
import pickle
import pandas as pd
import os

pattern = 'gmm_*_mfcc_*_*_*.pkl'
files = glob.glob(pattern)
data_rows = []

for file in files:
    
    base_name = os.path.basename(file)
    parts = base_name.split('_')
    gender = parts[1]         
    mfcc = parts[3]           
    window_size = parts[4]  
    components = parts[5].split('.')[0] 

    with open(file, 'rb') as f:
        data = pickle.load(f)
    
    # Append the information as a row in the list
    data_rows.append({'gender': gender, 'mfcc': mfcc, 'window_size': window_size, 'components': components, 'model': data})

# Create a DataFrame from the list of rows
models = pd.DataFrame(data_rows)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## $\textit{Acc vs MFFC}$

In [36]:
mfcc_coeff_amount = 24
n_components = 24
window_length = int(16000*0.02)
use_delta = False
use_delta_delta = False

In [11]:

def getDatasets(mfcc_coeff_amount, window_length):

    arr_mfcc = []
    arr_delta = []
    arr_delta_delta = []
    
    if not os.path.exists(f"mfcc_{mfcc_coeff_amount}_{window_length}.pkl"):

        aux = []

        for (dirpath, dirnames, filenames) in os.walk("VoxCeleb_gender"):
            for filename in filenames:
                if filename.endswith(".wav"):
                    filepath = dirpath + "/" + filename
                    aux.append({"file": filepath, "gender": "females" if "females" in dirpath else "males"})

        df = pd.DataFrame(aux)

        df["mfcc"] = pd.Series(name="mfcc", dtype=object)
        df["delta"] = pd.Series(name="delta", dtype=object)
        df["delta_delta"] = pd.Series(name="delta_delta", dtype=object)

        for i in range(len(df)):
            mfcc_coeffs = extract_mfcc(df.loc[i, "file"], n_mfcc=mfcc_coeff_amount, win_length=window_length)
            for i, coeffs in enumerate(mfcc_coeffs):
                if np.linalg.norm(coeffs) < 1:
                    mfcc_coeffs.pop(i)

            arr_mfcc.append(mfcc_coeffs.T)
            arr_delta.append(librosa.feature.delta(mfcc_coeffs.T))
            arr_delta_delta.append(librosa.feature.delta(mfcc_coeffs.T, order=2))
        df["mfcc"] = arr_mfcc
        df["delta"] = arr_delta
        df["delta_delta"] = arr_delta_delta
        df.to_pickle(f"mfcc_{mfcc_coeff_amount}_{window_length}.pkl")
    else:
        print("File Found")
        df = pd.read_pickle(f"mfcc_{mfcc_coeff_amount}_{window_length}.pkl")

    return df

In [38]:
dataset = getDatasets(mfcc_coeff_amount, window_length)

In [None]:
dataset

In [26]:
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV
import pickle


def test(model_male, model_female, X_test, y_test):

    predictions = []
    
    scores_gmm = pd.Series(index=y_test.index)

    for index, row in X_test.iterrows():
        score = model_male.score_samples(row["mfcc"]) - model_female.score_samples(row["mfcc"])
        scores_gmm[index] = score.mean()

    for index in y_test.index:
        if scores_gmm[index] > 0:
            predictions.append('males')
        else:
            predictions.append('females')

    # Convert predictions to a pandas Series
    predictions = pd.Series(predictions, index=y_test.index)

    # Calculate the accuracy
    accuracy = (predictions == y_test).mean()

    # Print the accuracy
    print(f'Accuracy: {accuracy * 100:.2f}%')
    
    return accuracy * 100

In [35]:
f = models[models.mfcc=="24"][models.components=="24"][models.gender=="female"]["model"]
m = models[models.mfcc=="24"][models.components=="24"][models.gender=="male"]["model"]

  f = models[models.mfcc=="24"][models.components=="24"][models.gender=="female"]["model"]
  m = models[models.mfcc=="24"][models.components=="24"][models.gender=="male"]["model"]


In [None]:
test(m, f, )

In [14]:
models

Unnamed: 0,gender,mfcc,window_size,components,model
0,female,12,320,12,"GaussianMixture(max_iter=1000, n_components=12)"
1,female,12,320,16,"GaussianMixture(max_iter=1000, n_components=16)"
2,female,12,320,8,"GaussianMixture(max_iter=1000, n_components=8)"
3,female,12,320,8,"GaussianMixture(max_iter=1000, n_components=8)"
4,female,16,320,16,"GaussianMixture(max_iter=1000, n_components=16)"
5,female,18,320,18,"GaussianMixture(max_iter=1000, n_components=18)"
6,female,24,320,10,"GaussianMixture(max_iter=1000, n_components=10)"
7,female,24,320,16,"GaussianMixture(max_iter=1000, n_components=16)"
8,female,24,320,24,GaussianMixture(n_components=24)
9,female,24,320,6,"GaussianMixture(max_iter=1000, n_components=6)"


In [21]:
ventas = pd.DataFrame(
    np.random.randint(1, 100, 120).reshape(12, 10),
    columns = np.arange(2009, 2019),
    index = ["ene", "feb", "mar", "abr", "may", "jun", "jul", "ago", "sep", "oct", "nov", "dic"]
)

## $\textit{Acc vs Components}$

## $\textit{With delta vs Without delta}$