# i-vectors with bob package


In [11]:
# BOB idiap library impots
import bob.learn.em
import bob.bio.gmm
import multiprocessing.pool

# Basic ML libraries
import numpy
import pandas as pd 
import numpy as np

# Importing basic functions from other notebooks 
import import_ipynb
from analyze_data_cleaned import define_data_type
from analyze_data_cleaned import interesting_patients
from analyze_data_cleaned import apply_mask

In [15]:
data_dir='/home/sjoshi/codes/python/BeatPD/data/BeatPD/'

In [7]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")

DEBUG:root:test


In [51]:
data_type='cis'

path_train_data, df_train_label = define_data_type(data_type=data_type)

display(df_train_label)
# Filter df_train_label according to the measurement_id we are most interested in
#df_train_label = interesting_patients(df_train_label=df_train_label, list_measurement_id=list_measurement_id)

def prepare_features(df_train_label, nb_features, velocity_path=None, mask_path=None):
    """
    Prepare the features to train the UBM and i-vectors 
    
    Keyword arguments:
    - df_train_label: The labels and the list of measurement_id we want to load and use to train
    - nb_features: Nb of features to be used. 
       - 3 for original X, Y, Z of the accelerometer
       - 3 for high pass filtered X, Y, Z of the velocity obtained with derivative of original accelerometer
    - velocity_path: Optional. If provided, then we will use the velocity features  
    - mask_path: Optinal. If provided, then we will use the mask feature.
                 If velocity uses high pass filter, then we must provide the mask path 
    """
    # Array concatenating the training data so we can train the ubm with everything
    nd_train_data = np.empty((0,nb_features), float)
    
    # List of the "classes" which every measurement_id is a class for us
    list_train_data = []
    
    for idx in df_train_label.index:
        df_train_data=pd.read_csv(path_train_data+df_train_label["measurement_id"][idx]+'.csv')
        
        # Apply mask on df_train_data if provided
        if mask_path is not None:
            df_train_data = apply_mask(df_train_label["measurement_id"][idx], mask_path)
            print(len(df_train_data))
        # For now the velocity is always filtered by the high pass so that's why the following if
        # is inside the mask condition. To have only velocity without filter, it would be required
        # to first save these files in a folder 
        if velocity_path is not None and mask_path is not None:
            df_velocity=pd.read_csv(data_dir+
                                velocity_path+
                                df_train_label["measurement_id"][idx]+'.csv')
            df_velocity = df_velocity.iloc[:-3,:]
            # Concatenate the velocity and the original training data
            result = pd.concat([df_train_data, df_velocity], axis=1)
            display(result)
                
        # confirm result is accessible
        x = result.iloc[:,-nb_features:]
       
        # Normalize the data around mean & std because that is what bob library is expecting 
        normed_x = (x - x.mean(axis=0)) / x.std(axis=0)
        list_train_data.append(normed_x.to_numpy())

        nd_train_data = np.append(nd_train_data, normed_x.to_numpy(), axis=0)

    return list_train_data, nd_train_data

Unnamed: 0,measurement_id,subject_id,on_off,dyskinesia,tremor
0,cc7b822c-e310-46f0-a8ea-98c95fdb67a1,1004,1.0,1.0,1.0
1,5163afe8-a6b0-4ea4-b2ba-9b4501dd5912,1004,0.0,0.0,0.0
2,5cf68c8e-0b7a-4b73-ad4f-015c7a20fb5a,1004,1.0,1.0,1.0
3,fb188ae2-2173-4137-9236-19a137a402c2,1004,3.0,3.0,3.0
4,19a3e9ea-fce1-40b7-9457-2618970beb7b,1004,1.0,1.0,1.0
...,...,...,...,...,...
1853,ab618d1f-eb27-442b-a3b7-e438fde70db1,1051,0.0,,
1854,25e8bca2-051f-4216-826b-810bbddfdb2e,1051,0.0,,
1855,7f7bb7b9-8656-40dd-94f9-9d546ac75722,1051,0.0,,
1856,c29c2d91-c294-4655-a2a7-d4c1f456c3a2,1051,1.0,,


In [42]:
def train_save_ubm(nd_train_data, ubm_name, g=256):
    """
    Trains and save the UBM in a file to be later used when training ivectors
    
    Keyword Arguments:
    - nd_train_data: Array of all the training data concatenated
    - ubm_name: Name of the file that we are going to save with the UBM trained
    - g: Nb of gaussians
    """
    # Training UBM
    gmm = bob.bio.gmm.algorithm.GMM(number_of_gaussians=g)
    gmm.train_ubm(nd_train_data)
    gmm.project_ubm(nd_train_data)
    prior_gmm = gmm.ubm
    print(gmm)
    # Save the UBM 
    gmm.save_ubm(data_dir+ubm_name)

def train_ivectors(list_train_data, ubm_path, ivector_machine_path, g=256, subspace_dimension_of_t=2, nb_features=6):
    """
    Train an ivector and saves the iVector Machine to be used later 
    
    Keyword arguments:
    - list_train_data: List of numpy arrays containing the features for each measurement_id
       (which are the classes)
    - ubm_path: Full path to where the UBM was trained & save 
    - ivector_machine_path: Path and name of the file where we want to save the iVectorMachine
    - g: Nb of gaussians. Optional. Default is 256. 
    - subspace_dimension_of_t: ??? 
    - nb_features: Number of features we are using. Can be 3 or 6.
    """
    gmm = bob.bio.gmm.algorithm.GMM(number_of_gaussians=g)
#     print('before')
#     print(gmm.ubm)
    # FIXME not sure if I should use load_projector or load_ubm as load_projector calls load_ubm anyway? 
    #gmm.load_ubm(ubm_path)
    gmm.load_projector(ubm_path)
    prior_gmm = gmm.ubm
#     print('after')
#     print(prior_gmm)
    
    # The input the the TV Training is the statistics of the GMM
    gmm_stats_per_class = []

    # Get the gmm stats for each class
    for d in list_train_data:
        for i in d:
            gmm_stats_container = bob.learn.em.GMMStats(g, nb_features)
            prior_gmm.acc_statistics(i, gmm_stats_container)
            gmm_stats_per_class.append(gmm_stats_container)


    ### Finally doing the TV training
    ivector_trainer = bob.learn.em.IVectorTrainer(update_sigma=True)
    # IVectorMachine: Statistical model for the Total Variability training for more
    ivector_machine = bob.learn.em.IVectorMachine(
            prior_gmm, subspace_dimension_of_t, 10e-5)

    pool = multiprocessing.ThreadPool(8)

    # train IVector model
    bob.learn.em.train(ivector_trainer, ivector_machine,
                            gmm_stats_per_class, 500, pool=pool)

    # Printing the session offset w.r.t each Gaussian component
    # Returns the Total Variability matrix, T
    print(ivector_machine.t)

    print(ivector_machine.ubm)

    ivector_machine.save(ivector_machine_path)


In [43]:
list_train_data, nd_train_data = prepare_features(df_train_label,
                                    nb_features=6,
                                    mask_path='/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.training_data.high_pass_mask/',
                                    velocity_path='cis-pd.training_data.velocity_original_data/')

train_save_ubm(nd_train_data,
               ubm_name='gmm.hdf5',
               g=256)

train_ivectors(list_train_data,
               ubm_path=data_dir+'gmm.hdf5',
               ivector_machine_path=data_dir+'ivectormachine_bob.hdf5',
               g=256,
               subspace_dimension_of_t=2,
               nb_features=6)

measurement_id  cc7b822c-e310-46f0-a8ea-98c95fdb67a1
27551


Unnamed: 0,Timestamp,X,Y,Z,X_velocity,Y_velocity,Z_velocity
0,0.000,-1.090332,-0.237793,0.395508,-0.256008,-0.018363,0.007159
1,0.020,-1.267822,-0.102051,0.172852,-0.217250,-0.001718,-0.022199
2,0.040,-1.185791,-0.089600,0.017090,-0.136815,0.017299,-0.042725
3,0.060,-1.176270,-0.077637,-0.002197,-0.256008,-0.018363,0.007159
4,0.080,-1.211670,-0.013672,-0.079590,-0.217250,-0.001718,-0.022199
...,...,...,...,...,...,...,...
27546,549.433,0.025879,-0.031250,-0.985352,-0.000035,-0.000052,0.000035
27547,549.453,0.027832,-0.032227,-0.985840,0.000140,-0.000157,0.000070
27548,549.473,0.026367,-0.032715,-0.985840,0.000087,-0.000157,0.000017
27549,549.493,0.025391,-0.031738,-0.985840,0.000122,-0.000122,-0.000157


<class 'bob.bio.gmm.algorithm.GMM'>(number_of_gaussians=256, kmeans_training_iterations=25, gmm_training_iterations=25, training_threshold=0.0005, variance_threshold=0.0005, update_weights=True, update_means=True, update_variances=True, relevance_factor=4, gmm_enroll_iterations=1, responsibility_threshold=0, INIT_SEED=5489, scoring_function='<built-in function linear_scoring>', multiple_probe_scoring='average')
before
None
after
<bob.learn.em.GMMMachine object at 0x7fe5a7e92410>


NameError: name 'multiprocessing' is not defined

In [None]:
# ivectors projected 
# ivector_machine.project(gmm_stats_per_class)