In [70]:
import os
import subprocess
import wave

import numpy as np
import pandas as pd
import sklearn
import torch
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm

## Part 1

In [37]:
# Root of the corpus. Set the LAB3_CORPUS_DIR environment variable to run the
# notebook on another machine; when it is unset the original location is used.
corpus_dir = os.environ.get(
    "LAB3_CORPUS_DIR",
    "/home/carlosmarques/Projects/ist/PF/lab3/lab3_part1/corpus",
)
# Sub-folders holding the label CSVs and the per-set wav folders.
labels_dir = f"{corpus_dir}/labels"
wav_dir = f"{corpus_dir}/wav"

#### 1. Generate binary labels for train and dev sets

In [3]:
# Sets whose label CSVs receive a binary "label" column in the next cell.
set_names = ["train", "dev"]

In [4]:
# Add a binary "label" column to every label CSV and write the file back in
# place: 0 when kss <= 7.5, otherwise 1 (a missing kss also ends up as 1,
# because the comparison is false for NaN).
for name in set_names:
    label_path = f'{labels_dir}/{name}_labels.csv'
    df = pd.read_csv(label_path)
    df["label"] = np.where(df["kss"] <= 7.5, 0, 1)
    df.to_csv(label_path, index=False)

#### 2.1 Duration extraction

In [5]:
def get_durations(set_path):
    """Return the duration, in seconds, of every wav file in a directory.

    Args:
        set_path: Directory whose entries are all opened as wav files.

    Returns:
        A list of floats (frames / frame rate), one per directory entry, in
        the order reported by ``os.listdir``.

    Raises:
        wave.Error: If an entry is not a readable wav file.
    """
    durations = []
    for file_name in os.listdir(set_path):
        # The context manager closes each file as soon as it has been
        # measured, so file handles are not leaked across a large corpus.
        with wave.open(os.path.join(set_path, file_name), 'rb') as wav_file:
            frames = wav_file.getnframes()
            rate = wav_file.getframerate()
        durations.append(frames / float(rate))

    return durations
    

In [6]:
# Sets whose wav folders are measured in the duration summary below.
set_names = ["train", "dev", "test"]

In [7]:
# Empty summary table: one row per set; the cells are filled in by the loop below.
df = pd.DataFrame(index=set_names, columns=["average_duration", "total_duration"])

In [9]:
# Fill the summary table with the mean and the total clip duration of each set.
for name in set_names:
    set_path = f'{wav_dir}/{name}'
    durations = get_durations(set_path)
    for column, reducer in (("average_duration", np.average), ("total_duration", np.sum)):
        df.loc[name, column] = reducer(durations)

In [10]:
# Show the duration summary (values are frames / frame rate, i.e. seconds).
df

Unnamed: 0,average_duration,total_duration
train,9.07166,24466.3
dev,7.85962,10524.0
test,10.1975,12675.5


#### 2.2 Speaker gender (F/M) and label counts

In [11]:
# Sets whose label CSVs are read for the gender / label counts below.
set_names = ["train", "dev"]

In [12]:
# Empty count table, one row per set. M / F are per-gender row counts; SP_* are
# the label-1 counts and NSP_* the remaining (label-0) counts, filled in below.
df = pd.DataFrame(index=set_names, columns=["M", "F", "SP_M", "SP_F", "NSP_M", "NSP_F"])

In [13]:
# For each set, count rows per gender, how many of them carry label 1 (SP_*)
# and, by subtraction, how many carry label 0 (NSP_*).
for name in set_names:
    label_path = f'{labels_dir}/{name}_labels.csv'
    label_df = pd.read_csv(label_path)
    for gender in ("M", "F"):
        gender_mask = label_df["Gender"] == gender
        df.loc[name, gender] = np.sum(gender_mask)
        # Labels are 0/1, so their sum is the number of label-1 rows.
        df.loc[name, f"SP_{gender}"] = np.sum(label_df[gender_mask]["label"])
        df.loc[name, f"NSP_{gender}"] = df.loc[name, gender] - df.loc[name, f"SP_{gender}"]

In [14]:
# Show the per-set gender / label count table.
df

Unnamed: 0,M,F,SP_M,SP_F,NSP_M,NSP_F
train,706,1991,247,825,459,1166
dev,372,967,194,294,178,673


#### 3.4 Calculate weights of the different classes to aid in SVM training

In [15]:
# Number of training rows counted as M or F in the table above.
total = df.loc["train", "M"] +  df.loc["train", "F"]

In [16]:
# Inverse of the share of training rows that are female with label 0.
nsp_female_weight = 1/(df.loc["train", "NSP_F"] /total)
nsp_female_weight

2.3130360205831906

In [17]:
# Inverse of the share of training rows that are female with label 1.
sp_female_weight = 1/(df.loc["train", "SP_F"] /total)
sp_female_weight

3.2690909090909095

In [18]:
# Inverse of the share of training rows that are male with label 0.
nsp_male_weight = 1/(df.loc["train", "NSP_M"] /total)
nsp_male_weight

5.875816993464053

In [19]:
# Inverse of the share of training rows that are male with label 1.
sp_male_weight = 1/(df.loc["train", "SP_M"] /total)
sp_male_weight

10.919028340080972

##### Normalize weights

In [20]:
# Express the weight relative to the female / label-0 weight.
# NOTE: not idempotent - re-running this cell divides again, and it must run
# before nsp_female_weight is reset to 1.
sp_female_weight = sp_female_weight /nsp_female_weight
sp_female_weight

1.4133333333333333

In [21]:
# Express the weight relative to the female / label-0 weight.
# NOTE: not idempotent - re-running this cell divides again, and it must run
# before nsp_female_weight is reset to 1.
nsp_male_weight = nsp_male_weight / nsp_female_weight
nsp_male_weight

2.5403050108932463

In [22]:
# Express the weight relative to the female / label-0 weight.
# NOTE: not idempotent - re-running this cell divides again, and it must run
# before nsp_female_weight is reset to 1.
sp_male_weight = sp_male_weight/nsp_female_weight
sp_male_weight

4.720647773279352

In [23]:
# The reference group gets weight 1. Run this only after the other three
# weights have been divided by the previous value of nsp_female_weight.
nsp_female_weight = 1

##### Store weights in training labels

In [24]:
def weight_apply(file, nsp_female_weight, sp_female_weight, nsp_male_weight, sp_male_weight):
    """Pick the sample weight matching a row's gender and binary label.

    Args:
        file: Mapping-like row (e.g. a DataFrame row) with a "Gender" entry
            ('F' or 'M') and a "label" entry (0 or 1).
        nsp_female_weight: Weight for Gender 'F', label 0.
        sp_female_weight: Weight for Gender 'F', label 1.
        nsp_male_weight: Weight for Gender 'M', label 0.
        sp_male_weight: Weight for Gender 'M', label 1.

    Returns:
        The weight of the matching (Gender, label) group.

    Raises:
        ValueError: If the row's (Gender, label) pair is not one of the four
            known groups (previously this surfaced as an UnboundLocalError).
    """
    weights = {
        ("F", 0): nsp_female_weight,
        ("F", 1): sp_female_weight,
        ("M", 0): nsp_male_weight,
        ("M", 1): sp_male_weight,
    }
    key = (file["Gender"], file["label"])
    if key not in weights:
        raise ValueError(
            f"No weight defined for Gender={key[0]!r}, label={key[1]!r}"
        )
    return weights[key]

In [25]:
# Reload the training labels (this rebinds df, replacing the count table).
df = pd.read_csv(f'{labels_dir}/train_labels.csv')

In [26]:
# Attach to every training row the weight of its (Gender, label) group.
df["weight"] = df.apply(lambda x: weight_apply(x,  nsp_female_weight, sp_female_weight, nsp_male_weight, sp_male_weight), axis=1)

In [27]:
# Overwrite the training label file so it now includes the "weight" column.
df.to_csv(f'{labels_dir}/train_labels.csv', index=False)

#### 3. Feature extraction with openSMILE

In [49]:
def extract_features(set_name, wav_dir, corpus_dir=None):
    """Run openSMILE (eGeMAPS and IS11 configs) on every file of one set.

    For each entry of ``{wav_dir}/{set_name}`` two CSV files are written, to
    ``{corpus_dir}/eGeMAPS/{set_name}`` and ``{corpus_dir}/IS11/{set_name}``,
    named after the input file with its extension replaced by ``.csv``.

    Args:
        set_name: Name of the set sub-folder (e.g. "train").
        wav_dir: Directory containing one sub-folder of wav files per set.
        corpus_dir: Root under which the feature folders are created.
            Defaults to the parent directory of ``wav_dir``.

    Returns:
        A list of ``(feature_set, file_name)`` pairs for which SMILExtract
        exited with a non-zero status (empty when everything succeeded).

    Raises:
        FileNotFoundError: If the ``SMILExtract`` executable is not on PATH.
    """
    set_path = os.path.join(wav_dir, set_name)
    if corpus_dir is None:
        # normpath drops a trailing slash so dirname really yields the parent.
        corpus_dir = os.path.dirname(os.path.normpath(wav_dir))

    configs = {
        "eGeMAPS": "opensmile-2.3.0/config/gemaps/eGeMAPSv01a.conf",
        "IS11": "opensmile-2.3.0/config/IS11_speaker_state.conf",
    }
    output_dirs = {}
    for feature_name in configs:
        output_dirs[feature_name] = os.path.join(corpus_dir, feature_name, set_name)
        # Create the target folder up front instead of failing once per file.
        os.makedirs(output_dirs[feature_name], exist_ok=True)

    failures = []
    for wav_name in tqdm(os.listdir(set_path)):
        # Swap only the extension; str.replace('wav', 'csv') would also
        # rewrite a "wav" occurring inside the file stem.
        csv_name = os.path.splitext(wav_name)[0] + ".csv"
        for feature_name, config_path in configs.items():
            # Argument list (no shell) so paths with spaces or shell
            # metacharacters are passed through untouched.
            command = [
                "SMILExtract",
                "-C", config_path,
                "-I", os.path.join(set_path, wav_name),
                "-csvoutput", os.path.join(output_dirs[feature_name], csv_name),
            ]
            completed = subprocess.run(command, check=False)
            if completed.returncode != 0:
                # Keep going (best effort) but remember what failed.
                failures.append((feature_name, wav_name))

    if failures:
        print(f"SMILExtract failed for {len(failures)} (feature set, file) pairs, e.g. {failures[:5]}")
    return failures

In [50]:
# Sets to run openSMILE feature extraction on.
set_names = ["train", "dev", "test"]

In [51]:
# Extract both feature sets for every set (slow: see the progress bars below).
for name in set_names:
    extract_features(name, wav_dir)

100%|██████████| 2697/2697 [19:55<00:00,  2.26it/s] 
100%|██████████| 1339/1339 [08:33<00:00,  2.61it/s]
100%|██████████| 1243/1243 [09:36<00:00,  2.16it/s]


#### 4. Preprocessing

##### Concatenating feature files

In [52]:
def cat_features(feature_name, set_name, corpus_dir):
    """Concatenate the per-file feature CSVs of one set into a single CSV.

    Reads every ';'-separated file in ``{corpus_dir}/{feature_name}/{set_name}``,
    sets each row's "name" column to the file it came from, and writes the
    result, comma-separated, to ``{feature_name.lower()}_{set_name}.csv`` in
    the current working directory.

    Args:
        feature_name: Feature-set folder name (e.g. "eGeMAPS").
        set_name: Set folder name (e.g. "train").
        corpus_dir: Corpus root containing the feature-set folders.

    Raises:
        ValueError: If the feature folder contains no files.
    """
    feature_path = f"{corpus_dir}/{feature_name}/{set_name}"
    # List the folder once so rows and names cannot get out of step.
    paths = os.listdir(feature_path)
    if not paths:
        raise ValueError(f"No feature files found in {feature_path}")

    # Tag each frame with its file name before concatenating: this stays
    # correct even if a file holds more than one row, and one concat avoids
    # the quadratic (and, since pandas 2.0, removed) DataFrame.append loop.
    frames = [
        pd.read_csv(f"{feature_path}/{path}", sep=";").assign(name=path)
        for path in tqdm(paths)
    ]
    df = pd.concat(frames)
    df.to_csv(f"{feature_name.lower()}_{set_name}.csv",  index=False)

In [53]:
# Sets whose per-file feature CSVs are concatenated below.
set_names = ["train", "dev", "test"]

In [54]:
# Feature-set folder names under corpus_dir, as read by cat_features.
feature_sets = ['eGeMAPS', "IS11"]

In [55]:
# Build one concatenated CSV per (feature set, set) pair in the working directory.
for feature_set in feature_sets:
    for name in set_names:
        cat_features(feature_set, name, corpus_dir)

100%|██████████| 2696/2696 [00:14<00:00, 185.53it/s]
100%|██████████| 1338/1338 [00:07<00:00, 190.14it/s]
100%|██████████| 1242/1242 [00:06<00:00, 184.26it/s]
100%|██████████| 2696/2696 [10:23<00:00,  4.32it/s]
100%|██████████| 1338/1338 [04:38<00:00,  4.80it/s]
100%|██████████| 1242/1242 [04:13<00:00,  4.90it/s]


##### Scaling data

In [89]:
def scale_data_sets(feature_name):
    """Standardise the train, dev and test feature CSVs of one feature set.

    A single StandardScaler is fitted on the train file and then applied
    unchanged to dev and test. The first two columns of every file are left
    as they are; all remaining columns are scaled. Inputs are read as
    ``{feature_name}_{set}.csv`` (comma-separated) and results are written to
    ``{feature_name}_scaled_{set}.csv`` using ';' as separator.

    Args:
        feature_name: File-name prefix of the feature set (e.g. "egemaps").
    """
    scaler = StandardScaler()
    # Train must come first: it is the only split the scaler is fitted on.
    for split in ("train", "dev", "test"):
        table = pd.read_csv(f"{feature_name}_{split}.csv")
        feature_columns = table.columns[2:]
        if split == "train":
            scaled = scaler.fit_transform(table[feature_columns])
        else:
            scaled = scaler.transform(table[feature_columns])
        table[feature_columns] = scaled
        table.to_csv(f"{feature_name}_scaled_{split}.csv", index=False, sep=";")

In [90]:
# Lower-case prefixes: cat_features writes its output files with feature_name.lower().
feature_sets = ['egemaps', "is11"]

In [73]:
# Scale the train / dev / test files of each feature set.
for feature_set in tqdm(feature_sets):
    scale_data_sets(feature_set)

100%|██████████| 2/2 [01:07<00:00, 33.84s/it]


#### Remove headers and empty columns from datasets

In [80]:
def trim_df(feature_name, set_name):
    """Write a header-less copy of a scaled feature file without metadata columns.

    Reads ``{feature_name}_scaled_{set_name}.csv`` (';'-separated), removes
    the "name" and "frameTime" columns and saves the remaining columns to
    ``{feature_name}_scaled_{set_name}_trimmed.csv`` with no header row and
    no index.

    Args:
        feature_name: File-name prefix of the feature set (e.g. "egemaps").
        set_name: Set name (e.g. "train").
    """
    source_csv = f"{feature_name}_scaled_{set_name}.csv"
    target_csv = f"{feature_name}_scaled_{set_name}_trimmed.csv"
    features_only = pd.read_csv(source_csv, sep=";").drop(columns=["name", "frameTime"])
    features_only.to_csv(target_csv, sep=";", index=False, header=False)

In [81]:
# Every (feature set, set) pair built from these two lists is trimmed below.
set_names = ["train", "dev", "test"]
feature_names = ["egemaps", "is11"]

In [82]:
# Produce the header-less, metadata-free file for each scaled data set.
for feature_name in feature_names:
    for name in tqdm(set_names):
        trim_df(feature_name, name)

100%|██████████| 3/3 [00:01<00:00,  2.91it/s]
100%|██████████| 3/3 [00:45<00:00, 15.04s/it]
