In [1]:
import numpy as np
import pandas as pd
import torchaudio
import random
import torch
import tensorflow as tf
from torchaudio import transforms
import pickle
import os
from sklearn.model_selection import train_test_split
from IPython.display import clear_output

# Carga de los datos

In [2]:
path = "./data/kaggle/"
np.load(path + "dict.npy", allow_pickle = True)

array({'0': 'Femenino', '1': 'Masculino', '2': 'Argentina', '3': 'Chile', '4': 'Colombia', '5': 'Perú', '6': 'Venezuela'},
      dtype=object)

In [3]:
df = pd.read_csv(path + "Train.csv")
df["Id"] = path + "Train/"+df["Id"]

In [4]:
df["label_1"] = df["Expected"].apply(lambda x : int(x.split(" ")[0]))
df["label_2"] = df["Expected"].apply(lambda x : int(x.split(" ")[1]))

In [5]:
df.head()

Unnamed: 0,Id,Expected,label_1,label_2
0,./data/kaggle/Train/01216683570.wav,0 2,0,2
1,./data/kaggle/Train/00433588573.wav,0 2,0,2
2,./data/kaggle/Train/00381534896.wav,0 2,0,2
3,./data/kaggle/Train/01635825413.wav,0 2,0,2
4,./data/kaggle/Train/00325117692.wav,0 2,0,2


In [6]:
data = df["Id"].apply(lambda x : torchaudio.load(x))

In [7]:
(data.apply(lambda x : x[1]) == 48000).all()

True

Todas los datos tienen el mismo Sample rate.

In [8]:
(data.apply(lambda x : x[0].shape[0]) == 1).all()

True

Todos los datos tienen la misma cantidad de canales

In [9]:
data_audio = data.apply(lambda x : x[0])

## Transformación e imputación de datos

In [46]:
def twochannel(audio):
    
    if audio.shape[0] == 2:
        return audio
    else:
        return torch.cat([audio, audio])
    
def resize_audio(audio, max_ms = 12000):
    
    max_len = 48000 * max_ms // 1000
    rows, length = audio.shape
    
    if length > max_len:
        audio = audio[ : , : max_len]
    
    elif length < max_len:
        start = random.randint(0, max_len - length)
        end = max_len - length - start
        
        pad_start = torch.zeros((rows, start))
        pad_end = torch.zeros((rows, end))
        
        audio = torch.cat((pad_start, audio, pad_end), 1)
    
    return audio

def apply_transform(data):
    
    data_audio_tc = data.apply(lambda x : twochannel(x))
    return data_audio_tc.apply(lambda x : resize_audio(x))

In [11]:
data = apply_transform(data_audio)

In [47]:
def shift_audio(audio):
    
    shift = int((2 * random.random() - 1) * 0.5 * audio.shape[1])
    return audio.roll(shift)

In [48]:
def spectro_audio(audio):
    
    spec = transforms.MelSpectrogram(48000, n_fft = 2**10, n_mels = 100)(audio)
    return transforms.AmplitudeToDB(top_db = 80, )(spec)

In [49]:
def frec_time_mask(spec, max_mask_pct = 0.1, n_freq_masks = 1, n_time_masks = 1):
    
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()
    aug_spec = spec
    freq_mask_param = max_mask_pct * n_mels
    for _ in range(n_freq_masks):
        aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = max_mask_pct * n_steps
    for _ in range(n_time_masks):
        aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

    return aug_spec

In [15]:
def add_noise(audio):
    noise = torch.rand(audio.shape) * 2 - 1
    return audio + noise

In [16]:
def making_data(data, labels, shifted_data = 10, funcs = [frec_time_mask, add_noise]):
    
    X = np.array([])
    y = np.array([])
    for pos, audio in enumerate(data):
        for i in range(shifted_data):
            aux = spectro_audio(shift_audio(audio))
            while np.random.randint(2) == 0:
                func = funcs[np.random.randint(2)]
                aux = func(aux)
            if len(X) == 0:
                X = np.array([aux.numpy()])
                y = np.array([[labels[pos]]])
            else:
                X = np.concatenate((X, np.array([aux.numpy(), aux.numpy()])), axis = 0)
                y = np.concatenate((y, np.array([[labels[pos]]])), axis = 0)
            del aux
    return (X, y)

In [17]:
data_disordered = [(i, df["Expected"][pos]) for pos, i in enumerate(data)]

In [18]:
random.shuffle(data_disordered)

In [19]:
X_disordered = [i[0] for i in data_disordered]
y_disordered = [i[1] for i in data_disordered]

In [20]:
del data, data_audio, data_disordered, df

In [21]:
X = np.zeros((27900, 2, 100, 1126), dtype = np.float32)

### Generamos la data nueva

In [22]:
y = []
for pos, audio in enumerate(X_disordered):
    for i in range(20):
        print(f"{pos} de {len(X_disordered)} exactly {20 * pos + i}")
        aux = spectro_audio(shift_audio(audio))
        aux = frec_time_mask(aux)
        aux = add_noise(aux)
        X[20 * pos + i] = aux.numpy()
        if len(X) == 0:
            # X = np.array([aux.numpy()])
            y = np.array([y_disordered[pos]])
        else:
            # X = np.concatenate([X, np.array([aux.numpy()])], axis = 0)
            y = np.concatenate([y, [y_disordered[pos]]], axis = 0)
        del aux
    clear_output(wait = True)

1394 de 1395 exactly 27880
1394 de 1395 exactly 27881
1394 de 1395 exactly 27882
1394 de 1395 exactly 27883
1394 de 1395 exactly 27884
1394 de 1395 exactly 27885
1394 de 1395 exactly 27886
1394 de 1395 exactly 27887
1394 de 1395 exactly 27888
1394 de 1395 exactly 27889
1394 de 1395 exactly 27890
1394 de 1395 exactly 27891
1394 de 1395 exactly 27892
1394 de 1395 exactly 27893
1394 de 1395 exactly 27894
1394 de 1395 exactly 27895
1394 de 1395 exactly 27896
1394 de 1395 exactly 27897
1394 de 1395 exactly 27898
1394 de 1395 exactly 27899


In [34]:
# file = open('C:\\Users\\gmendez\\Documents\\Redes neuronales\\inf395_hw1\\files\\x.obj', "wb")
# np.save(file, X)
# file.close()

In [35]:
# file = open('C:\\Users\\gmendez\\Documents\\Redes neuronales\\inf395_hw1\\files\\y.obj', "wb")
# np.save(file, y)
# file.close()

In [2]:
file = open('C:\\Users\\gmendez\\Documents\\Redes neuronales\\inf395_hw1\\files\\x.obj', "rb")
X = np.load(file)
file.close()

In [3]:
file = open('C:\\Users\\gmendez\\Documents\\Redes neuronales\\inf395_hw1\\files\\y.obj', "rb")
y = np.load(file)
file.close()

## Generamos modelos : Clasificar nacionalidad

In [4]:
def make_model(activation):
    input_layer = tf.keras.layers.Input((2, 100, 1126))
    conv_1 = tf.keras.layers.Conv2D(16, 3, strides = (2,2), padding = "same", activation = "relu")(input_layer)
    pooling_1 = tf.keras.layers.MaxPooling2D(padding = 'same')(conv_1)
    conv_2 = tf.keras.layers.Conv2D(32, 3, strides = (2,2), padding = "same", activation = "relu")(pooling_1)
    pooling_2 = tf.keras.layers.MaxPooling2D(padding = 'same')(conv_2)
    conv_3 = tf.keras.layers.Conv2D(64, 3, strides = (2,2), padding = "same", activation = "relu")(pooling_2)
    flatt = tf.keras.layers.Flatten()(conv_3)
    hidden_1 = tf.keras.layers.Dense(128, activation = "relu")(flatt)
    output_layer = tf.keras.layers.Dense(5, activation = activation)(hidden_1)
    
    model = tf.keras.Model(inputs = input_layer, outputs = output_layer)
    
    model.compile(
        loss = tf.keras.losses.CategoricalCrossentropy(),
        optimizer = tf.keras.optimizers.Adam(learning_rate = 2e-4),
        metrics = tf.keras.metrics.CategoricalCrossentropy()
    )
    
    return model

In [14]:
model = make_model("sigmoid")

In [6]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 2, 100, 1126)]    0         
                                                                 
 conv2d (Conv2D)             (None, 1, 50, 16)         162160    
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, 25, 16)        0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 1, 13, 32)         4640      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 1, 7, 32)         0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 1, 4, 64)          18496 

In [7]:
cols = ["argentina", "chile", "colombia", "peru", "venezuela"]

In [11]:
y_data = pd.DataFrame(y, columns = ["target"])
y_data["gender"] = y_data["target"].apply(lambda x : int(x.split(" ")[0]))
y_data["nacionality"] = y_data["target"].apply(lambda x : int(x.split(" ")[1]))
y_data["argentina"] = y_data["nacionality"].apply(lambda x : int(x == 2))
y_data["chile"] = y_data["nacionality"].apply(lambda x : int(x == 3))
y_data["colombia"] = y_data["nacionality"].apply(lambda x : int(x == 4))
y_data["peru"] = y_data["nacionality"].apply(lambda x : int(x == 5))
y_data["venezuela"] = y_data["nacionality"].apply(lambda x : int(x == 6))

# X_train, X_val, y_train, y_val = train_test_split(X, y_data[cols], test_size = 0.1)

In [15]:
with tf.device("/CPU:0"):
    model.fit(X, y_data[cols].values, epochs = 500, validation_split = 0.1, callbacks = [tf.keras.callbacks.EarlyStopping(patience = 10)])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500


In [16]:
model.save("C:\\Users\\gmendez\\Documents\\Redes neuronales\\inf395_hw1\\models\\second")
model.save_weights("C:\\Users\\gmendez\\Documents\\Redes neuronales\\inf395_hw1\\models\\seconds_w")



INFO:tensorflow:Assets written to: C:\Users\gmendez\Documents\Redes neuronales\inf395_hw1\models\second\assets


INFO:tensorflow:Assets written to: C:\Users\gmendez\Documents\Redes neuronales\inf395_hw1\models\second\assets


In [17]:
second_model = make_model("tanh")

In [22]:
second_model = make_model("softmax")

In [23]:
with tf.device("/CPU:0"):
    second_model.fit(X, y_data[cols].values, epochs = 500, validation_split = 0.1, callbacks = [tf.keras.callbacks.EarlyStopping(patience = 10)])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500


Este modelo parece ser el que mejor resultados obtiene en el Validation set

In [24]:
second_model.save("C:\\Users\\gmendez\\Documents\\Redes neuronales\\inf395_hw1\\models\\third")
second_model.save_weights("C:\\Users\\gmendez\\Documents\\Redes neuronales\\inf395_hw1\\models\\third_w")



INFO:tensorflow:Assets written to: C:\Users\gmendez\Documents\Redes neuronales\inf395_hw1\models\third\assets


INFO:tensorflow:Assets written to: C:\Users\gmendez\Documents\Redes neuronales\inf395_hw1\models\third\assets


In [26]:
def make_model(activation):
    input_layer = tf.keras.layers.Input((2, 100, 1126))
    conv_1 = tf.keras.layers.Conv2D(16, 3, strides = (2,2), padding = "same", activation = "relu")(input_layer)
    pooling_1 = tf.keras.layers.MaxPooling2D(padding = 'same')(conv_1)
    conv_2 = tf.keras.layers.Conv2D(32, 3, strides = (2,2), padding = "same", activation = "relu")(pooling_1)
    pooling_2 = tf.keras.layers.MaxPooling2D(padding = 'same')(conv_2)
    conv_3 = tf.keras.layers.Conv2D(64, 3, strides = (2,2), padding = "same", activation = "relu")(pooling_2)
    flatt = tf.keras.layers.Flatten()(conv_3)
    hidden_1 = tf.keras.layers.Dense(256, activation = "relu")(flatt)
    hidden_2 = tf.keras.layers.Dense(128, activation = "relu")(hidden_1)
    output_layer = tf.keras.layers.Dense(5, activation = activation)(hidden_2)
    
    model = tf.keras.Model(inputs = input_layer, outputs = output_layer)
    
    model.compile(
        loss = tf.keras.losses.CategoricalCrossentropy(),
        optimizer = tf.keras.optimizers.Adam(learning_rate = 2e-4),
        metrics = tf.keras.metrics.CategoricalCrossentropy()
    )
    
    return model

In [27]:
third_model = make_model("sigmoid")

In [28]:
with tf.device("/CPU:0"):
    third_model.fit(X, y_data[cols].values, epochs = 500, validation_split = 0.1, callbacks = [tf.keras.callbacks.EarlyStopping(patience = 10)])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500


In [29]:
fourth_model = make_model("softmax")

In [31]:
with tf.device("/CPU:0"):
    fourth_model.fit(X, y_data[cols].values, epochs = 500, validation_split = 0.1, callbacks = [tf.keras.callbacks.EarlyStopping(patience = 10)])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500


## Generamos modelos : Clasificar género

In [32]:
def make_gender_model(activation):
    
    input_layer = tf.keras.layers.Input((2, 100, 1126))
    conv_1 = tf.keras.layers.Conv2D(16, 3, strides = (2,2), padding = "same", activation = "relu")(input_layer)
    pooling_1 = tf.keras.layers.MaxPooling2D(padding = 'same')(conv_1)
    conv_2 = tf.keras.layers.Conv2D(32, 3, strides = (2,2), padding = "same", activation = "relu")(pooling_1)
    pooling_2 = tf.keras.layers.MaxPooling2D(padding = 'same')(conv_2)
    conv_3 = tf.keras.layers.Conv2D(64, 3, strides = (2,2), padding = "same", activation = "relu")(pooling_2)
    flatt = tf.keras.layers.Flatten()(conv_3)
    hidden_1 = tf.keras.layers.Dense(256, activation = "relu")(flatt)
    hidden_2 = tf.keras.layers.Dense(128, activation = "relu")(hidden_1)
    output_layer = tf.keras.layers.Dense(1, activation = activation)(hidden_2)
    
    model = tf.keras.models.Model(inputs = input_layer, outputs = output_layer)
    
    model.compile(
        loss = tf.keras.losses.BinaryCrossentropy(),
        optimizer = tf.keras.optimizers.Adam(learning_rate = 2e-4),
        metrics = tf.keras.metrics.BinaryAccuracy()
    )
    
    return model

In [33]:
gender_model_new = make_gender_model("sigmoid")

In [34]:
gender_model_new.summary()

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 2, 100, 1126)]    0         
                                                                 
 conv2d_24 (Conv2D)          (None, 1, 50, 16)         162160    
                                                                 
 max_pooling2d_16 (MaxPoolin  (None, 1, 25, 16)        0         
 g2D)                                                            
                                                                 
 conv2d_25 (Conv2D)          (None, 1, 13, 32)         4640      
                                                                 
 max_pooling2d_17 (MaxPoolin  (None, 1, 7, 32)         0         
 g2D)                                                            
                                                                 
 conv2d_26 (Conv2D)          (None, 1, 4, 64)          1849

In [36]:
with tf.device("/CPU:0"):
    gender_model_new.fit(X, y_data["gender"].values, epochs = 500, validation_split = 0.1, callbacks = [tf.keras.callbacks.EarlyStopping(patience = 10)])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500


In [37]:
gender_model_new.save("C:\\Users\\gmendez\\Documents\\Redes neuronales\\inf395_hw1\\models\\Gender_model_v2")



INFO:tensorflow:Assets written to: C:\Users\gmendez\Documents\Redes neuronales\inf395_hw1\models\Gender_model_v2\assets


INFO:tensorflow:Assets written to: C:\Users\gmendez\Documents\Redes neuronales\inf395_hw1\models\Gender_model_v2\assets


In [38]:
gender_model_new.save_weights("C:\\Users\\gmendez\\Documents\\Redes neuronales\\inf395_hw1\\models\\gender_v2_weights")

## Juntamos los modelos

In [40]:
nac_model = tf.keras.models.load_model("C:\\Users\\gmendez\\Documents\\Redes neuronales\\inf395_hw1\\models\\third")
gender_model = tf.keras.models.load_model("C:\\Users\\gmendez\\Documents\\Redes neuronales\\inf395_hw1\\models\\Gender_model_v2")

In [41]:
input_layer = tf.keras.layers.Input((2, 100, 1126))
output_layer = tf.concat([gender_model(input_layer), nac_model(input_layer)], 1)

model = tf.keras.models.Model(inputs = input_layer, outputs = output_layer)

In [42]:
model.trainable = False

In [43]:
path = "C:\\Users\\gmendez\\Documents\\Redes neuronales\\inf395_hw1\\data\\kaggle\\"
df = pd.read_csv(path + "Test.csv")

In [44]:
X_test = df["Id"].apply(lambda x : torchaudio.load(path+ "Test\\" + x)[0])

In [50]:
X_test = apply_transform(X_test)

In [51]:
X_test = np.array(X_test.apply(lambda x : spectro_audio(x).numpy()).values.tolist())

In [52]:
predictions = model.predict(X_test)



In [53]:
p = []
for gender, n1, n2, n3, n4, n5 in predictions:
    m = np.argmax([n1, n2, n3, n4, n5])
    p.append(f"{int(gender)} {m + 2}")

In [54]:
pd.DataFrame({"Id" : df["Id"].values, "Expected" : p}).to_csv("predictions_v2.csv", index = False)