# Notebook of COPERIA UVigo-GTM team
# @author: José M. Ramírez @email: jmramirez@gts.uvigo.es @date: 2023-02-27 @version: 0.1

In [71]:
import os
import random
import string
import torch
import torchaudio
import opensmile

import pandas as pd
import numpy as np
from tqdm import tqdm

In [72]:
# Define important paths.
# The two absolute locations can be overridden with environment variables so the notebook
# also runs on other machines; when the variables are unset the original values are used.
path_data = os.environ.get(
    'COPERIA_DATA_PATH',
    '/home/jsanhcez/Documentos/Proyectos/99_to_do_COPERIA/repos/coperia_api/dataset_dicoperia/')
path_feats = 'data/features/'
path_results = 'results/'
path_models = 'models/'
path_notebooks = os.environ.get(
    'COPERIA_NOTEBOOKS_PATH',
    '/home/jsanhcez/Documentos/Proyectos/99_to_do_COPERIA/repos/coperia_api/notebooks')

In [73]:
# Define the data to be used
# Class names; make_dicoperia_metadata replaces them by 0 and 1, in this order
CLASSES = ['covid-control', 'covid-persistente']

# Full metadata table (decimal numbers in the CSV use a comma)
dicoperia_metadata = pd.read_csv(
    os.path.join(path_data, 'metadata_dicoperia.csv'),
    decimal=',',
)

# Column name -> values to discard: rows holding any of these values are removed
dicoperia_filters = dict(
    audio_id=['c15e54fc-5290-4652-a3f7-ff3b779bd980', '244b61cc-4fd7-4073-b0d8-7bacd42f6202'],
    patient_type=['coperia-rehab'],
    audio_type=['/a/'],
    audio_moment=['after'],
)

In [74]:
def make_dicoperia_metadata(root_path: str, metadata: pd.DataFrame, filter: dict) -> pd.DataFrame:
    """
    Build the experiment metadata of the DICOPERIA dataset by discarding unwanted rows.

    Every row whose value in column ``key`` appears in ``filter[key]`` is removed, i.e. the
    filter is an exclusion list. The filtered table is written to ``metadata_dicoperia.csv``
    inside ``root_path`` (still holding the original class names) and the returned frame has
    the names listed in ``CLASSES`` replaced by the integers 0 and 1.

    :param root_path: directory where the filtered ``metadata_dicoperia.csv`` is written
    :param metadata: dataframe with the metadata to be filtered (it is not modified)
    :param filter: dictionary mapping a column name to the list of values to discard
    :return: a pandas dataframe with the filtered metadata and numeric class labels
    :raises KeyError: if a column named in ``filter`` is not present in ``metadata``
    """
    print('Filtering the metadata...')

    # Fail early with a readable message instead of a bare KeyError from inside the loop
    missing = [key for key in filter if key not in metadata.columns]
    if missing:
        raise KeyError(f'Filter columns not found in the metadata: {missing}')

    df = metadata.copy()
    for key, values in tqdm(filter.items()):
        df = df[~df[key].isin(values)]
    df.to_csv(os.path.join(root_path, 'metadata_dicoperia.csv'), index=False, decimal=',')
    print('Filtering DONE!!')
    # ``df`` is a boolean-mask slice; build a new frame rather than replacing in place,
    # which can trigger pandas' SettingWithCopyWarning on such slices.
    return df.replace(CLASSES, [0, 1])

# Apply the exclusion filters; the filtered CSV is written into the notebooks folder
exp_metadata = make_dicoperia_metadata(root_path=path_notebooks,
                                       metadata=dicoperia_metadata,
                                       filter=dicoperia_filters)

Filtering the metadata...


100%|██████████| 4/4 [00:00<00:00, 747.75it/s]

Filtering DONE!!





In [75]:
# Reproducibility: seed used by the train/test split, the models and the CV splitter
SEED = 42

# Evaluation protocol
K_NUM = 5                   # number of cross-validation folds
SAMPLE_GAIN_TESTING = 0.05  # fraction of patients held out as the test set

# Metadata columns of interest
PATIENT_ID_COLUMN = 'patient_id'
AUDIO_ID_COLUMN = 'audio_id'
CLASS_COLUMN = 'patient_type'

In [76]:
from sklearn.model_selection import train_test_split

# Split at patient level: the partition is drawn over the unique (patient, class) pairs
patient_data = exp_metadata[[PATIENT_ID_COLUMN, CLASS_COLUMN]].drop_duplicates()
patient_id, patient_class = patient_data[PATIENT_ID_COLUMN], patient_data[CLASS_COLUMN]

patients_train, patients_test, patient_labels_train, patient_labels_test = train_test_split(
    patient_id,
    patient_class,
    test_size=SAMPLE_GAIN_TESTING,
    random_state=SEED,
    stratify=patient_class,
)

# Pick the audio samples that belong to the patients of each subset
audio_data_train = exp_metadata.loc[exp_metadata[PATIENT_ID_COLUMN].isin(patients_train)]
audio_train, audio_label_train = audio_data_train[AUDIO_ID_COLUMN], audio_data_train[CLASS_COLUMN]

audio_data_test = exp_metadata.loc[exp_metadata[PATIENT_ID_COLUMN].isin(patients_test)]
audio_test, audio_label_test = audio_data_test[AUDIO_ID_COLUMN], audio_data_test[CLASS_COLUMN]

# Report the final size of each subset
print(f"Test-set: {len(patients_test)} patients & {len(audio_data_test)} samples")
print(f"Train-set: {len(patients_train):} patients & {len(audio_data_train)} samples")

Test-set: 7 patients & 21 samples
Train-set: 131 patients & 396 samples


In [77]:
# Feature extractor
class FeatureExtractor:
    """
    Class for feature extraction
    args: input arguments dictionary
    Mandatory arguments: resampling_rate, feature_type
    For MFCC: window_size, hop_length (both in milliseconds), f_max, n_mels, n_mfcc
    For MelSpec/logMelSpec: window_size, hop_length (both in milliseconds), f_max, n_mels
    For ComParE_2016_*: no extra arguments, the openSMILE ComParE 2016 low-level descriptors
    are computed and the columns of the requested subset are kept
    Optional arguments: compute_deltas, compute_delta_deltas
    """

    # Column names of the ComParE 2016 low-level descriptors, grouped by family
    _RASTA_COLUMNS = ['audSpec_Rfilt_sma[%d]' % band for band in range(26)]
    _MFCC_COLUMNS = ['mfcc_sma[%d]' % coef for coef in range(1, 15)]
    _BASIC_SPECTRAL_COLUMNS = ['pcm_fftMag_fband250-650_sma', 'pcm_fftMag_fband1000-4000_sma',
                               'pcm_fftMag_spectralRollOff25.0_sma',
                               'pcm_fftMag_spectralRollOff50.0_sma', 'pcm_fftMag_spectralRollOff75.0_sma',
                               'pcm_fftMag_spectralRollOff90.0_sma', 'pcm_fftMag_spectralFlux_sma',
                               'pcm_fftMag_spectralCentroid_sma', 'pcm_fftMag_spectralEntropy_sma',
                               'pcm_fftMag_spectralVariance_sma', 'pcm_fftMag_spectralSkewness_sma',
                               'pcm_fftMag_spectralKurtosis_sma', 'pcm_fftMag_spectralSlope_sma',
                               'pcm_fftMag_psySharpness_sma', 'pcm_fftMag_spectralHarmonicity_sma']
    _VOICING_COLUMNS = ['F0final_sma', 'voicingFinalUnclipped_sma', 'jitterLocal_sma',
                        'jitterDDP_sma', 'shimmerLocal_sma', 'logHNR_sma']
    _ENERGY_COLUMNS = ['audspec_lengthL1norm_sma', 'audspecRasta_lengthL1norm_sma',
                       'pcm_RMSenergy_sma', 'pcm_zcr_sma']

    # feature_type -> columns to keep (None means "keep every column")
    _COMPARE_SUBSETS = {
        'ComParE_2016_llds': None,
        'ComParE_2016_voicing': _VOICING_COLUMNS,
        'ComParE_2016_spectral': _RASTA_COLUMNS + _BASIC_SPECTRAL_COLUMNS + _MFCC_COLUMNS,
        'ComParE_2016_mfcc': _MFCC_COLUMNS,
        'ComParE_2016_rasta': _RASTA_COLUMNS,
        'ComParE_2016_basic_spectral': _BASIC_SPECTRAL_COLUMNS,
        'ComParE_2016_energy': _ENERGY_COLUMNS,
    }
    _TORCHAUDIO_FEATURES = ('MFCC', 'MelSpec', 'logMelSpec')

    def __init__(self, args: dict):
        """
        Build the transform that matches ``args['feature_type']``.
        :param args: configuration dictionary (see the class docstring)
        """
        self.args = args
        self.resampling_rate = self.args['resampling_rate']
        feature_type = args['feature_type']
        assert feature_type in self._TORCHAUDIO_FEATURES + tuple(self._COMPARE_SUBSETS), (
            'Expected the feature_type to be MFCC / MelSpec / logMelSpec / ComParE_2016')

        if feature_type == 'MFCC':
            self.feature_transform = torchaudio.transforms.MFCC(
                sample_rate=self.resampling_rate,
                n_mfcc=int(self.args['n_mfcc']),
                melkwargs={'n_fft': self._ms_to_samples('window_size'),
                           'n_mels': int(self.args['n_mels']),
                           'f_max': int(self.args['f_max']),
                           'hop_length': self._ms_to_samples('hop_length')})
        elif feature_type in ['MelSpec', 'logMelSpec']:
            self.feature_transform = torchaudio.transforms.MelSpectrogram(
                sample_rate=self.resampling_rate,
                n_fft=self._ms_to_samples('window_size'),
                n_mels=int(self.args['n_mels']),
                f_max=int(self.args['f_max']),
                hop_length=self._ms_to_samples('hop_length'))
        elif 'ComParE_2016' in feature_type:
            self.feature_transform = opensmile.Smile(feature_set=opensmile.FeatureSet.ComParE_2016,
                                                     feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
                                                     sampling_rate=self.resampling_rate)
        else:
            raise ValueError('Feature type not implemented')

    def _ms_to_samples(self, key):
        """ Convert the duration stored in ``args[key]`` (milliseconds) to a number of samples """
        return int(float(self.args[key]) * 1e-3 * self.resampling_rate)

    def _read_audio(self, filepath):
        """ This code does the following:
                1. Read audio,
                2. Resample the audio if required,
                3. Perform waveform normalization,
                4. Compute sound activity using threshold based method
                5. Discard the silence regions
        :param filepath: path to the audio file
        :return: a torch.Tensor with the retained audio samples (flattened by the
                 activity-mask indexing) and an int with the sample rate
        """

        s, fs = torchaudio.load(filepath)
        if fs != self.resampling_rate:
            s, fs = torchaudio.sox_effects.apply_effects_tensor(s, fs, [['rate', str(self.resampling_rate)]])
        if s.shape[0] > 1:
            # Down-mix multi-channel audio to a single channel
            s = s.mean(dim=0).unsqueeze(0)
        # Peak normalisation; an all-zero signal is left untouched to avoid 0/0 = NaN
        peak = torch.max(torch.abs(s))
        if peak > 0:
            s = s / peak
        sad = self.compute_sad(s.numpy(), self.resampling_rate)
        s = s[np.where(sad == 1)]
        return s, fs

    @staticmethod
    def compute_sad(sig, fs, threshold=0.0001, sad_start_end_sil_length=100, sad_margin_length=50):
        """ Compute threshold based sound activity
        :param sig: array of shape (1, n_samples); only the first row is analysed
        :param fs: sample rate in Hz
        :param threshold: a sample is active when its squared value exceeds this value
        :param sad_start_end_sil_length: milliseconds forced to inactive at both ends
        :param sad_margin_length: milliseconds marked as active around every active sample
        :return: array with the shape of ``sig`` holding 1 for active samples and 0 otherwise
        """
        # Leading/Trailing margin
        sad_start_end_sil_length = int(sad_start_end_sil_length * 1e-3 * fs)
        # Margin around active samples
        sad_margin_length = int(sad_margin_length * 1e-3 * fs)

        sad = np.zeros(sig.shape)
        # Visit only the active samples instead of testing every sample in a Python loop
        for i in np.flatnonzero(np.power(sig[0], 2) > threshold):
            # Clamp at 0: a negative start would be read as "counted from the end" and
            # silently drop the margin of activity close to the beginning
            start = max(0, i - sad_margin_length)
            sad[0, start:i + sad_margin_length] = 1
        # Guard the zero-length case: ``sad[0, -0:]`` would clear the whole mask
        if sad_start_end_sil_length > 0:
            sad[0, 0:sad_start_end_sil_length] = 0
            sad[0, -sad_start_end_sil_length:] = 0
        return sad

    def _do_feature_extraction(self, s):
        """ Feature preparation
        Steps:
        1. Apply feature extraction to waveform
        2. Convert amplitude to dB if required
        3. Append delta and delta-delta features
        :param s: waveform tensor as returned by ``_read_audio``
        :return: the feature tensor transposed, so that frames are on the first axis
        """
        feature_type = self.args['feature_type']

        if 'ComParE_2016' in feature_type:
            import tempfile

            # torchaudio.save expects a (channels, samples) tensor
            wav = s if s.dim() > 1 else s.unsqueeze(0)
            # openSMILE reads from a file: use a private temporary directory so the name
            # cannot collide with existing files and is removed even if extraction fails
            with tempfile.TemporaryDirectory() as tmp_dir:
                wav_path = os.path.join(tmp_dir, 'audio.wav')
                torchaudio.save(wav_path, wav, sample_rate=self.resampling_rate)
                F = self.feature_transform.process_file(wav_path)

            # columns based selection
            columns = self._COMPARE_SUBSETS[feature_type]
            if columns is None:
                columns = list(F.columns)

            F = F[columns].to_numpy()
            F = np.nan_to_num(F)
            F = torch.from_numpy(F).T
        elif feature_type == 'logMelSpec':
            F = self.feature_transform(s)
            F = torchaudio.functional.amplitude_to_DB(F, multiplier=10, amin=1e-10, db_multiplier=0)
        elif feature_type in ('MelSpec', 'MFCC'):
            F = self.feature_transform(s)

        want_deltas = self.args.get('compute_deltas', False)
        want_delta_deltas = self.args.get('compute_delta_deltas', False)
        if want_deltas or want_delta_deltas:
            # The deltas are needed for the delta-deltas even when they are not appended
            FD = torchaudio.functional.compute_deltas(F)
            if want_deltas:
                F = torch.cat((F, FD), dim=0)
            if want_delta_deltas:
                FDD = torchaudio.functional.compute_deltas(FD)
                F = torch.cat((F, FDD), dim=0)
        return F.T

    def extract(self, filepath):
        """ Interface to other codes for this class
        Steps:
        1. Read audio
        2. Do feature extraction
        :param filepath: path to the audio file
        :return: the features computed by ``_do_feature_extraction``
        """
        self.audio_path = filepath
        s, fs = self._read_audio(filepath)
        return self._do_feature_extraction(s)

In [78]:
# Feature Extractor
def make_feats(file_list, labels_file, feats_config):
    """
    Extract the features and labels of every labelled file of a list.

    :param file_list: path to a text file with one ``<file_id> <audio_path>`` pair per line
    :param labels_file: path to a text file with one ``<file_id> <label>`` pair per line
    :param feats_config: configuration dictionary handed to ``FeatureExtractor``
    :return: tuple ``(features, labels)`` of float arrays; ``features`` stacks the feature
             rows of all files and ``labels`` repeats the file label once per row
    :raises ValueError: if no listed file has a label (nothing to stack), or if a
                        non-empty line does not hold exactly two fields
    """
    # read the list of files (context managers close the handles; blank lines are ignored)
    with open(file_list) as handle:
        entries = [line.split() for line in handle if line.strip()]

    # read labels
    with open(labels_file) as handle:
        labels = dict(line.split() for line in handle if line.strip())

    # The configuration is the same for every file, so one extractor is enough
    extractor = FeatureExtractor(feats_config)

    # make examples
    feats, targets = [], []
    for fil, path in entries:
        label = labels.get(fil, None)
        if label is None:
            # Unlabelled files are skipped before any feature is computed
            continue
        frames = np.asarray(extractor.extract(path), dtype=float)
        feats.append(frames)
        # Features and labels are kept apart: joining them in one array would turn the
        # whole matrix into strings because the labels are read as text
        targets.append(np.full(frames.shape[0], float(label)))

    if not feats:
        raise ValueError('No labelled files found: nothing to extract')

    return np.vstack(feats), np.concatenate(targets)

# Feature configuration: 64-band log-Mel spectrogram with delta and delta-delta features
feature_config = dict(
    feature_type='logMelSpec',
    resampling_rate=44100,
    n_mels=64,
    f_max=22050,
    window_size=25,  # milliseconds
    hop_length=10,   # milliseconds
    compute_deltas=True,
    compute_delta_deltas=True,
)

#TODO: MAKE SCP FROM DATAFRAME
# Get Feats
train_feats, train_labels = make_feats(
    file_list=os.path.join(path_notebooks, 'train.scp'),
    labels_file=os.path.join(path_notebooks, 'train_labels'),
    feats_config=feature_config,
)

In [79]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


# Torch utils
def activations(act):
    """
    Interface to fetch activations

    :param act: name of the activation, one of 'Tanh', 'ReLU' or 'Sigmoid'
    :return: a new instance of the matching ``torch.nn`` activation module
    :raises ValueError: if ``act`` is not a known activation name
    """
    # Map names to classes so that only the requested module is instantiated
    activ = {'Tanh': nn.Tanh, 'ReLU': nn.ReLU, 'Sigmoid': nn.Sigmoid}
    # ``.get`` lets an unknown name reach the ValueError below instead of a bare KeyError
    factory = activ.get(act)
    if factory is None:
        raise ValueError('Unknown activation, add it in activations dictionary in models.py')
    return factory()

class bce_loss(nn.Module):
    """
    Module wrapper around ``nn.BCEWithLogitsLoss``.
    With the default arguments the loss is averaged ('mean' reduction) and the
    positive-class weight is 1.
    """

    def __init__(self, reduction='mean', pos_weight=torch.tensor([1])):
        super().__init__()
        loss_fn = nn.BCEWithLogitsLoss(reduction=reduction, pos_weight=pos_weight)
        self.criterion = loss_fn

    def forward(self, net_out, ref):
        # net_out: network outputs, ref: reference targets; both go straight to the criterion
        loss = self.criterion(net_out, ref)
        return loss

class FFClassificationHead(nn.Module):
    """
    Feed-forward classification head that ends in a single output unit.
    args: dictionary with the entries
    input_dimension: (integer), size of the input vectors
    units: (list of integers), sizes of the hidden layers; entries <= 0 are ignored
    activation: (string), activation name understood by ``activations``
    dropout: (float), dropout probability applied after every hidden layer
    """

    def __init__(self, args):
        super().__init__()

        self.inDim = args['input_dimension']
        hidden_sizes = [width for width in args['units'] if width > 0]
        self.units = [self.inDim] + hidden_sizes
        self.num_layers = len(self.units) - 1

        self.activation_type = args['activation']
        self.dropout_p = args['dropout']

        # Layers are registered under numbered attribute names (linearlayer_<i>, dropout_<i>)
        for idx, (fan_in, fan_out) in enumerate(zip(self.units[:-1], self.units[1:])):
            setattr(self, 'linearlayer_' + str(idx), nn.Linear(fan_in, fan_out))
            setattr(self, 'dropout_' + str(idx), nn.Dropout(self.dropout_p))
        self.linearOut = nn.Linear(self.units[-1], 1)
        self.activation = activations(self.activation_type)

    def forward(self, inputs):
        """
        inputs: a sequence of tensors that is stacked row-wise into one batch
        returns: a list with one output tensor per row of the batch
        """
        hidden = torch.vstack(inputs)

        for idx in range(self.num_layers):
            linear = getattr(self, 'linearlayer_' + str(idx))
            drop = getattr(self, 'dropout_' + str(idx))
            hidden = drop(self.activation(linear(hidden)))
        out = self.linearOut(hidden)
        return [out[row, ] for row in range(out.shape[0])]

# LSTM ENCODER classifier
class LSTMEncoder(nn.Module):
    """ Stacked (B)LSTM Encoder
    Arguments:
    args: Dictionary with below entries
    input_dimension: (integer), Dimension of the feature vector input
    units: (integer), Number of LSTM units. Default: 128
    num_layers: (integer), Number of layers in the stacked LSTM. Default: 2
    bidirectional: (bool), if True biLSTM will be used. Default: False
    apply_mean_norm: (bool), subtract the example level mean. Default: False
    apply_var_norm: (bool), normalize by standard deviation. Default: False
    pooltype: (['average' or 'last']). Default: 'average'
    ----> 'average': average of the LSTM output along time dimension is the embedding
    ----> 'last': LSTM hidden state at the last time-step of the last layer is the embedding
    dropout: (float), Dropout probability. Default: 0
    """

    # Lower bound for the standard deviation used by the variance normalisation
    _STD_FLOOR = 1e-8

    def __init__(self, args):
        super(LSTMEncoder, self).__init__()
        self.inDim = args['input_dimension']
        self.units = args.get('units', 128)
        self.num_layers = args.get('num_layers', 2)
        self.bidirectional = args.get('bidirectional', False)

        self.apply_mean_norm = args.get('apply_mean_norm', False)
        self.apply_var_norm = args.get('apply_var_norm', False)
        self.dropout_p = args.get('dropout', 0)
        assert self.dropout_p < 1

        # The documented default is 'average'; the former fallback (False) could never
        # pass the assertion below, so omitting 'pooltype' always failed
        self.pooltype = args.get('pooltype', 'average')
        assert self.pooltype in ['average', 'last']

        self.LSTM = nn.LSTM(self.inDim,
                            self.units,
                            num_layers=self.num_layers,
                            bidirectional=self.bidirectional,
                            batch_first=True,
                            dropout=self.dropout_p)

    def forward(self, inputs):
        """
        inputs: a list of torch tensors
        The tensors can be of varying length.
        returns: a list with one embedding of shape (1, embedding_size) per input tensor
        """
        inlens = [x.shape[0] for x in inputs]
        if self.apply_mean_norm:
            inputs = [F - torch.mean(F, dim=0) for F in inputs]
        if self.apply_var_norm:
            # Floor the deviation so that constant feature dimensions give 0 instead of NaN/inf
            inputs = [F / torch.std(F, dim=0).clamp_min(self._STD_FLOOR) for F in inputs]

        x = pad_sequence(inputs, batch_first=True)
        x = pack_padded_sequence(x, inlens, batch_first=True, enforce_sorted=False)
        x, hc = self.LSTM(x)

        if self.pooltype == 'average':
            # Padded positions come back as zeros, so sum / true length is the time average
            x, _ = pad_packed_sequence(x, batch_first=True)
            lengths = torch.tensor(inlens, dtype=x.dtype, device=x.device).unsqueeze(1)
            x = torch.sum(x, dim=1) / lengths
        elif self.pooltype == 'last':
            if self.bidirectional:
                # Concatenate the final hidden states of both directions of the last layer
                x = hc[0][-2:, :, :].transpose(0, 1).reshape(hc[0].shape[1], 2 * hc[0].shape[2])
            else:
                x = hc[0][-1, :, :]
        else:
            raise ValueError('Unknown pooling method')

        return [x[i, :].view(1, x.shape[1]) for i in range(x.shape[0])]

# LSTM classifier
class LSTMClassifier(nn.Module):
    """
    Classifier built from an ``LSTMEncoder`` followed by an ``FFClassificationHead``
    and trained with ``bce_loss``.
    """

    def __init__(self, args):
        super().__init__()

        # Encoder settings: every key below is mandatory in ``args``
        self.input_dimension = args['input_dimension']
        self.lstm_encoder_units = args['lstm_encoder_units']
        self.lstm_num_layers = args['lstm_num_layers']
        self.lstm_bidirectional = args['lstm_bidirectional']
        self.lstm_dropout_p = args['lstm_dropout']
        self.lstm_pooling = args['lstm_pooling']
        self.apply_mean_norm = args['apply_mean_norm']
        self.apply_var_norm = args['apply_var_norm']

        self.encoder = LSTMEncoder({
            'input_dimension': self.input_dimension,
            'units': self.lstm_encoder_units,
            'num_layers': self.lstm_num_layers,
            'bidirectional': self.lstm_bidirectional,
            'apply_mean_norm': self.apply_mean_norm,
            'apply_var_norm': self.apply_var_norm,
            'dropout': self.lstm_dropout_p,
            'pooltype': self.lstm_pooling,
        })

        # Classification-head settings; a single size is wrapped into a list
        head_units = args['classifier_units']
        self.classifier_units = head_units if type(head_units) is list else [head_units]
        self.classifier_activation = args['classifier_activation']
        self.classifier_dropout_p = args['classifier_dropout']

        # A bidirectional encoder produces embeddings twice as wide
        if self.lstm_bidirectional:
            cls_idim = 2 * self.lstm_encoder_units
        else:
            cls_idim = self.lstm_encoder_units

        self.classifier = FFClassificationHead({
            'input_dimension': cls_idim,
            'units': self.classifier_units,
            'dropout': self.classifier_dropout_p,
            'activation': self.classifier_activation,
        })
        self.criterion = bce_loss()

    def init_encoder(self, params):
        """
        Load a pre-trained state dict into the feature encoder
        """
        self.encoder.load_state_dict(params)

    def init_classifier(self, params):
        """
        Load a pre-trained state dict into the classification head
        """
        self.classifier.load_state_dict(params)

    def predict(self, inputs):
        """
        Classifier scores: the encoder embeddings passed through the classification head
        """
        embeddings = self.encoder(inputs)
        return self.classifier(embeddings)

    def predict_proba(self, inputs):
        """
        Sigmoid of every classifier score
        """
        scores = self.predict(inputs)
        return [torch.sigmoid(score) for score in scores]

    def forward(self, inputs, targets):
        """
        Run the network on ``inputs`` and return the loss against ``targets``
        """
        stacked_scores = torch.stack(self.predict(inputs))
        stacked_targets = torch.stack(targets)
        return self.criterion(stacked_scores, stacked_targets)

# Models configurations
ALL_MODELS = dict(
    LogisticRegression=dict(
        c=0.01,
        max_iter=40,
        solver='liblinear',
        penalty='l2',
        class_weight='balanced', random_state=SEED, verbose=True,
    ),
    RandomForest=dict(
        n_estimators=20,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        class_weight='balanced', random_state=SEED, verbose=True,
    ),
    MLP=dict(
        learning_rate_init=0.001,
        alpha=0.001,
        solver='adam',
        hidden_layer_sizes=[20, 20],
        max_iter=500,
        activation='tanh',
        class_weight='balanced', random_state=SEED, verbose=True,
    ),
    linearSVM=dict(
        penalty='l2',
        loss='squared_hinge',
        c=0.01,
        tol=1e-4,
        max_iter=100,
        class_weight='balanced', random_state=SEED, verbose=True,
    ),
)

# Training function
def config_model(model_name, training_feats, training_labels):
    """
    Build the (unfitted) scikit-learn estimator configured in ``ALL_MODELS``.

    :param model_name: 'LogisticRegression', 'RandomForest', 'MLP' or 'LinearSVM'
                       (the spelling 'linearSVM' used as key of ``ALL_MODELS`` is accepted too)
    :param training_feats: 2-D array with one feature row per training example
    :param training_labels: array with one label per training example
    :return: tuple ``(model, training_feats, training_labels)``; the data are returned
             unchanged except for 'MLP' with ``class_weight == 'balanced'``, where the rows
             labelled 1 are replicated and all rows are shuffled
    :raises ValueError: if ``model_name`` is not a supported model
    """
    # LinearSVC is the estimator that accepts the 'penalty' and 'loss' settings of the config
    from sklearn.svm import LinearSVC

    # The configuration is looked up with the argument (not with a notebook global) and
    # both spellings of the linear SVM name are resolved to whichever key exists
    aliases = {'LinearSVM': 'linearSVM', 'linearSVM': 'LinearSVM'}
    if model_name in ALL_MODELS:
        config_key = model_name
    elif aliases.get(model_name) in ALL_MODELS:
        config_key = aliases[model_name]
    else:
        raise ValueError("Not implementation of the model: " + str(model_name))
    model_args = ALL_MODELS[config_key]

    if model_name == 'LogisticRegression':
        model = LogisticRegression(C=float(model_args['c']),
                                   max_iter=int(model_args['max_iter']),
                                   solver=model_args['solver'],
                                   penalty=model_args['penalty'],
                                   class_weight=model_args['class_weight'],
                                   random_state=model_args['random_state'],
                                   verbose=model_args.get('verbose', True))

    elif model_name == 'RandomForest':
        model = RandomForestClassifier(n_estimators=model_args['n_estimators'],
                                       criterion=model_args['criterion'],
                                       max_depth=model_args['max_depth'],
                                       min_samples_split=model_args['min_samples_split'],
                                       min_samples_leaf=model_args['min_samples_leaf'],
                                       max_features=model_args['max_features'],
                                       class_weight=model_args['class_weight'],
                                       random_state=model_args['random_state'])

    elif model_name in aliases:
        model = LinearSVC(penalty=model_args['penalty'],
                          loss=model_args['loss'],
                          C=float(model_args['c']),
                          tol=model_args['tol'],
                          max_iter=model_args['max_iter'],
                          verbose=model_args['verbose'],
                          class_weight=model_args['class_weight'],
                          random_state=model_args['random_state'])

    elif model_name == 'MLP':
        model = MLPClassifier(hidden_layer_sizes=model_args['hidden_layer_sizes'],
                              solver=model_args['solver'], alpha=model_args['alpha'],
                              learning_rate_init=model_args['learning_rate_init'],
                              verbose=model_args['verbose'], activation=model_args['activation'],
                              max_iter=model_args['max_iter'], random_state=model_args['random_state'])

        # MLPClassifier is built without a class_weight argument, so the classes are
        # balanced by replicating the rows labelled 1
        if model_args['class_weight'] == 'balanced':
            flat_labels = np.asarray(training_labels).reshape(-1)
            positives = np.where(flat_labels == 1)[0]
            n_positives = len(positives)
            n_negatives = flat_labels.shape[0] - n_positives
            # No positives (or more positives than negatives) means nothing to replicate
            up_sample_factor = max(n_negatives // n_positives - 1, 0) if n_positives else 0
            row_ids = np.concatenate((np.arange(flat_labels.shape[0]),
                                      np.tile(positives, up_sample_factor)))
            # Seeded shuffle so that repeated runs yield the same training order
            row_ids = np.random.default_rng(model_args['random_state']).permutation(row_ids)
            training_feats = np.asarray(training_feats)[row_ids]
            training_labels = flat_labels[row_ids]

    else:
        raise ValueError("Not implementation of the model: " + str(model_name))
    return model, training_feats, training_labels

In [80]:
# Choose which entry of ALL_MODELS is used
model_exp = 'LogisticRegression'
# Build the estimator (config_model does not fit it) and get the data to train it on
model, x_train, y_train = config_model(model_name=model_exp,
                                       training_feats=train_feats,
                                       training_labels=train_labels)

In [81]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate

METRICS = ['roc_auc', 'accuracy', 'f1', 'precision', 'recall']
cv = StratifiedKFold(n_splits=K_NUM, random_state=SEED, shuffle=True)

# NOTE(review): x_train holds the feature rows of all files stacked together (see
# make_feats), so a shuffled StratifiedKFold can put rows of the same recording in both
# the training and the validation fold; the scores below may therefore be optimistic.
# A split grouped by recording/patient would need group ids, which make_feats does not return.

# One cross_validate call scores every metric on the same fitted models, instead of
# refitting the model K_NUM times per metric with repeated cross_val_score calls.
cv_results = cross_validate(model, x_train, y_train, scoring=METRICS, cv=cv)
scores = {metric: list(cv_results['test_' + metric]) for metric in METRICS}

print(scores)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]{'roc_auc': [1.0, 1.0, 0.9976744186046511, 1.0, 0.9995210727969349], 'accuracy': [0.9910714285714286, 0.990990990990991, 0.9819819819819819, 0.990990990990991, 0.990990990990991], 'f1': [0.9803921568627451, 0.9803921568627451, 0.9615384615384615, 0.9795918367346939, 0.9795918367346939], 'precision': [0.9615384615384616, 0.9615384615384616, 0.9259259259259259, 1.0, 0.96], 'recall': [1.0, 1.0, 1.0, 0.96, 1.0]}
