In [111]:
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
import csv
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from random import random
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import math

In [223]:
def final_score(mse_per_subjectid, nb_files_per_subject_id, training_or_test=''):
    """
    Compute the final score for the challenge given the arguments 
    
    Keyword arguments:
    - mse_per_subjectid: list of the mse per subject_id 
    - nb_files_per_subject_id: list of the number of files per subject_id 
    - training_or_test: string just for the purpose of printing the result 
    """
    numerator = np.sum([nb_file * mse for nb_file, mse in zip(np.sqrt(nb_files_per_subject_id), mse_per_subjectid)])
    denominator = np.sum(np.sqrt(nb_files_per_subject_id))
    #FIXME : Refactor so it's not printing by default 
    print(training_or_test+'Final score : ', np.divide(numerator, denominator))
    return np.divide(numerator, denominator)

In [224]:
def get_final_score(vPredictions, vParID, vTrueLabels):
    """
    Compute the final score from the challenge and print the result
    Keyword arguments: 
    - vPredictions: Numpy array containing the predictions 
    - VParID: list containing the subject_id 
    - vTrueLabels: list containing the true labels 
    """
    mse_per_subjectId = []
    nb_files_per_subjectId = []
    for subject_id in np.unique(vParID):
        #print('--- SUBJECT ID ', subject_id, '---')
        vSubjectId = (vParID == subject_id)
        #print(vSubjectId)
        vPredictions_subjectId = vPredictions[vSubjectId]
        vTrueLabels_subjectId = np.array(vTrueLabels)[vSubjectId]
        mse_per_subjectId.append(mean_squared_error(vTrueLabels_subjectId, vPredictions_subjectId))
        nb_files_per_subjectId.append(len(vPredictions_subjectId))
        #print('MSE : ', mean_squared_error(vTrueLabels_subjectId, vPredictions_subjectId))
    print('--- MSEscore ---')
    final_score(mse_per_subjectId, nb_files_per_subjectId)


In [225]:
def RandomForest_fusion_Cross (lFilesPred, sFileLabels, iEstimators, nudge, rLR, iMD,iRS):
    
    # Read file labels (true labels)
    
    with open(sFileLabels, mode='r') as infile:
        reader = csv.reader(infile)
        dLabels= {rows[0]:rows[1] for rows in reader} #participantID:label
    #print('thisis dLabels-------')
    #print(dLabels)
    
    # Training-testing data
    iNumFiles=len(lFilesPred)
#mPredictions=[] #np.zeros((1,iNumFiles))
#mPredictions=np.asarray(mPredictions)
    vPredIter=np.zeros((1,iNumFiles))
    vLabels=[] #true label
    lDicts=[] 

    for sFilePred in lFilesPred:
        with open(sFilePred, mode='r') as infile:
            reader = csv.reader(infile)
            dPred = {rows[0]:rows[1] for rows in reader} #Prediction from the different classifiers (acoustic, w-embed)
        lDicts.append(dPred)
   
    #creation of the matrix containing prediction from all classifiers
    bEnter=1
    for k in lDicts[0]: #first dictionary will be the lead
        if k!='measurement_id':
            #print(k)
            #print(dID[k])
            if dLabels[k]!='NA':
                vLabels.append(float(np.asarray(dLabels[k]))) #true labels
                vPredIter[0,0]=float(np.asarray(lDicts[0][k])) #first predicted value
                #print(vPredIter)
                for j in range(1, iNumFiles):
                    fPred=lDicts[j].get(k)
                if fPred:
                    vPredIter[0,j]=float(np.asarray(fPred))
                else:
                    print(['Unkwnown key:' + k])
                    vPredIter[0,j]=float(np.asarray(lDicts[0][k]))
                    
            if bEnter==1:
                mPredictions=vPredIter # mPredictions initialization.
                bEnter=0
            else:
                mPredictions=np.append(mPredictions,vPredIter,axis=0)
         
    print('mPredictions and vLabels')
    print(mPredictions.shape)
    print(len(vLabels))
    
    vRes1=np.transpose(mPredictions[:,[0]])
    vRes2=np.transpose(mPredictions[:,[1]])
 
    # Random forest training - regression
    
    
#clf=RandomForestClassifier(n_estimators)
#clf = clf.fit(X, Y)
#clf.score(X_test, y_test)

    #scores = cross_val_score(clf, mPredictions, vLabels, cv=15)
    #print('Cross-validation score:')
    #print(scores.mean())

    
    vIndex=np.arange(0, len(vLabels)-1, nudge).tolist()
    vPredCross=[];
    vTrueCross=[];
    
    for ind in range(len(vIndex)-1):
        #print('New iteration:')
        #print(ind)
        mPredAux=np.delete(mPredictions, slice(vIndex[ind], vIndex[ind+1]),axis=0)
        vLabelsAux=np.delete(vLabels, slice(vIndex[ind], vIndex[ind+1]),axis=0)
        mPredTest=mPredictions[vIndex[ind]:vIndex[ind+1],:]
        clf=GradientBoostingRegressor(n_estimators=iEstimators, learning_rate=rLR, max_depth=iMD,\
                                      random_state=iRS, loss='ls').fit(mPredAux, vLabelsAux)
        vLabelSel=vLabels[vIndex[ind]:vIndex[ind+1]] # Contains the selected labels
        vNewPred=clf.predict(mPredTest)
        vPredCross.extend(vNewPred)
        vTrueCross.extend(vLabelSel)
        #print(len(vNewPred))
        #print(len(vLabelsAux))
    
   
    # we will include the testing data here
    return (np.array(vPredCross), np.array(vTrueCross), np.array(vLabels), vRes1, vRes2)
#print(mPredictions)

In [226]:
def average_fusion_cross(lFilesPred, sFileLabels, nudge, bRound):
    
    with open(sFileLabels, mode='r') as infile:
        reader = csv.reader(infile)
        dLabels= {rows[0]:rows[1] for rows in reader} #participantID:label
    #print('thisis dLabels-------')
    #print(dLabels)
    
    # Training-testing data
    iNumFiles=len(lFilesPred)
#mPredictions=[] #np.zeros((1,iNumFiles))
#mPredictions=np.asarray(mPredictions)
    vPredIter=np.zeros((1,iNumFiles))
    vLabels=[] #true label
    lDicts=[] 

    for sFilePred in lFilesPred:
        with open(sFilePred, mode='r') as infile:
            reader = csv.reader(infile)
            dPred = {rows[0]:rows[1] for rows in reader} #Prediction from the different classifiers (acoustic, w-embed)
        lDicts.append(dPred)
   
    #creation of the matrix containing prediction from all classifiers
    bEnter=1
    for k in lDicts[0]: #first dictionary will be the lead
        if k!='measurement_id':
            #print(k)
            #print(dID[k])
            if dLabels[k]!='NA':
                vLabels.append(float(np.asarray(dLabels[k]))) #true labels
                vPredIter[0,0]=float(np.asarray(lDicts[0][k])) #first predicted value
                #print(vPredIter)
                for j in range(1, iNumFiles):
                    fPred=lDicts[j].get(k)
                if fPred:
                    vPredIter[0,j]=float(np.asarray(fPred))
                else:
                    print(['Unkwnown key:' + k])
                    vPredIter[0,j]=float(np.asarray(lDicts[0][k]))
                    
            if bEnter==1:
                mPredictions=vPredIter # mPredictions initialization.
                bEnter=0
            else:
                mPredictions=np.append(mPredictions,vPredIter,axis=0)
         
    print('mPredictions and vLabels')
    print(mPredictions.shape)
    print(len(vLabels))
    if bRound==1:
        mPredictions=np.round(mPredictions)
        
    
    vRes1=np.transpose(mPredictions[:,[0]])
    vRes2=np.transpose(mPredictions[:,[1]])
 
    # Random forest training - regression
    
    
#clf=RandomForestClassifier(n_estimators)
#clf = clf.fit(X, Y)
#clf.score(X_test, y_test)

    #scores = cross_val_score(clf, mPredictions, vLabels, cv=15)
    #print('Cross-validation score:')
    #print(scores.mean())

    
    vIndex=np.arange(0, len(vLabels)-1, nudge).tolist()
    vPredCross=[];
    vTrueCross=[];
    
    for ind in range(len(vIndex)-1):
        #print('New iteration:')
        #print(ind)
        mPredAux=np.delete(mPredictions, slice(vIndex[ind], vIndex[ind+1]),axis=0)
        vLabelsAux=np.delete(vLabels, slice(vIndex[ind], vIndex[ind+1]),axis=0)
        mPredTest=mPredictions[vIndex[ind]:vIndex[ind+1],:]
        vLabelSel=vLabels[vIndex[ind]:vIndex[ind+1]] # Contains the selected labels
        vNewPred=np.mean(mPredTest, axis=1)
        vPredCross.extend(vNewPred)
        vTrueCross.extend(vLabelSel)
        #print(len(vNewPred))
        #print(len(vLabelsAux))
    
   
    # we will include the testing data here
    return (np.array(vPredCross), np.array(vTrueCross), np.array(vLabels), vRes1, vRes2)
#print(mPredictions)

In [227]:
def average_fusion_cross_2(lFilesPred, sFileLabels, nudge, bRound):
    
    with open(sFileLabels, mode='r') as infile:
        reader = csv.reader(infile)
        dLabels= {rows[0]:rows[1] for rows in reader} #participantID:label
    #print('thisis dLabels-------')
    #print(dLabels)
    
    # Training-testing data
    iNumFiles=len(lFilesPred)
#mPredictions=[] #np.zeros((1,iNumFiles))
#mPredictions=np.asarray(mPredictions)
    vPredIter=np.zeros((1,iNumFiles))
    vLabels=[] #true label
    lDicts=[] 

    for sFilePred in lFilesPred:
        with open(sFilePred, mode='r') as infile:
            reader = csv.reader(infile)
            dPred = {rows[0]:rows[1] for rows in reader} #Prediction from the different classifiers (acoustic, w-embed)
        lDicts.append(dPred)
   
    #creation of the matrix containing prediction from all classifiers
    bEnter=1
    for k in lDicts[0]: #first dictionary will be the lead
        if k!='measurement_id':
            #print(k)
            #print(dID[k])
            if dLabels[k]!='NA':
                vLabels.append(float(np.asarray(dLabels[k]))) #true labels
                vPredIter[0,0]=float(np.asarray(lDicts[0][k])) #first predicted value
                #print(vPredIter)
                for j in range(1, iNumFiles):
                    fPred=lDicts[j].get(k)
                if fPred:
                    vPredIter[0,j]=30*float(np.asarray(fPred)) 
                else:
                    print(['Unkwnown key:' + k])
                    vPredIter[0,j]=float(np.asarray(lDicts[0][k]))
                    
            if bEnter==1:
                mPredictions=vPredIter # mPredictions initialization.
                bEnter=0
            else:
                mPredictions=np.append(mPredictions,vPredIter,axis=0)
         
    print('mPredictions and vLabels')
    print(mPredictions.shape)
    print(len(vLabels))
    if bRound==1:
        mPredictions=np.round(mPredictions)
        
    
    vRes1=np.transpose(mPredictions[:,[0]])
    vRes2=np.transpose(mPredictions[:,[1]])
 
    # Random forest training - regression
    
    
#clf=RandomForestClassifier(n_estimators)
#clf = clf.fit(X, Y)
#clf.score(X_test, y_test)

    #scores = cross_val_score(clf, mPredictions, vLabels, cv=15)
    #print('Cross-validation score:')
    #print(scores.mean())

    
    vIndex=np.arange(0, len(vLabels)-1, nudge).tolist()
    vPredCross=[];
    vTrueCross=[];
    
    for ind in range(len(vIndex)-1):
        #print('New iteration:')
        #print(ind)
        mPredAux=np.delete(mPredictions, slice(vIndex[ind], vIndex[ind+1]),axis=0)
        vLabelsAux=np.delete(vLabels, slice(vIndex[ind], vIndex[ind+1]),axis=0)
        mPredTest=mPredictions[vIndex[ind]:vIndex[ind+1],:]
        vLabelSel=vLabels[vIndex[ind]:vIndex[ind+1]] # Contains the selected labels
        vNewPred=np.mean(mPredTest, axis=1)
        vPredCross.extend(vNewPred)
        vTrueCross.extend(vLabelSel)
        #print(len(vNewPred))
        #print(len(vLabelsAux))
    
   
    # we will include the testing data here
    return (np.array(vPredCross), np.array(vTrueCross), np.array(vLabels), vRes1, vRes2)
#print(mPredictions)

In [228]:
def average_fusion (lFilesPred, dest_dir, fileName, bRound):
    
    # Read file labels (true labels)
    
    iNumFiles=len(lFilesPred)
    print('Number of analyzed file inputs: '+ str(iNumFiles))
#mPredictions=[] #np.zeros((1,iNumFiles))
#mPredictions=np.asarray(mPredictions)
    vPredIter=np.zeros((1,iNumFiles))
    lID=[] #true label
    lDicts=[] 

    for sFilePred in lFilesPred:
        with open(sFilePred, mode='r') as infile:
            reader = csv.reader(infile)
            dPred = {rows[0]:rows[1] for rows in reader} #Prediction from the different classifiers (acoustic, w-embed)
        lDicts.append(dPred)
   
    #creation of the matrix containing prediction from all classifiers
    bEnter=1
    for k in lDicts[0]: #first dictionary will be the lead
        if k!='measurement_id':
            lID.append(k)
            vPredIter[0,0]=float(np.asarray(lDicts[0][k])) #first predicted value
            #print(vPredIter)
            for j in range(1, iNumFiles):
                fPred=lDicts[j].get(k)
                if fPred:
                    vPredIter[0,j]=float(np.asarray(fPred))
                else:
                    print(['Unkwnown key:' + k])
                    vPredIter[0,j]=float(np.asarray(lDicts[0][k]))

            if bEnter==1:
                mPredictions=vPredIter # mPredictions initialization.
                bEnter=0
            else:
                mPredictions=np.append(mPredictions,vPredIter,axis=0)

    print('mPredictions and vID')
    print(mPredictions.shape)
    print(len(lID))
    print(mPredictions)
    vAverage=np.mean(mPredictions,axis=1)
    if bRound==1:
        vPrediction=np.round(vAverage)
    else:
        vPrediction=vAverage
    
    
          
    # Random forest training - regression
    
    
#clf=RandomForestClassifier(n_estimators)
#clf = clf.fit(X, Y)
#clf.score(X_test, y_test)

    #scores = cross_val_score(clf, mPredictions, vLabels, cv=15)
    #print('Cross-validation score:')
    #print(scores.mean())
    lID, vPrediction=zip(*sorted(zip(lID, vPrediction)))
    df = pd.DataFrame({'ID': lID, 'Prediction':vPrediction})
    np.savetxt(dest_dir+fileName+'.txt', df.values, fmt='%s', delimiter=' ; ')
   
   
    # we will include the testing data here
    
#print(mPredictions)
    return vAverage, lID

In [222]:
def average_fusion_tocsv (lFilesPred, dest_dir, fileName, bRound):
    
    # Read file labels (true labels)
    
    iNumFiles=len(lFilesPred)
    print('Number of analyzed file inputs: '+ str(iNumFiles))
#mPredictions=[] #np.zeros((1,iNumFiles))
#mPredictions=np.asarray(mPredictions)
    vPredIter=np.zeros((1,iNumFiles))
    lID=[] #true label
    lDicts=[] 

    for sFilePred in lFilesPred:
        with open(sFilePred, mode='r') as infile:
            reader = csv.reader(infile)
            dPred = {rows[0]:rows[1] for rows in reader} #Prediction from the different classifiers (acoustic, w-embed)
        lDicts.append(dPred)
   
    #creation of the matrix containing prediction from all classifiers
    bEnter=1
    for k in lDicts[0]: #first dictionary will be the lead
        if k!='measurement_id':
            lID.append(k)
            vPredIter[0,0]=float(np.asarray(lDicts[0][k])) #first predicted value
            #print(vPredIter)
            for j in range(1, iNumFiles):
                fPred=lDicts[j].get(k)
                if fPred:
                    vPredIter[0,j]=float(np.asarray(fPred))
                else:
                    print(['Unkwnown key:' + k])
                    vPredIter[0,j]=float(np.asarray(lDicts[0][k]))

            if bEnter==1:
                mPredictions=vPredIter # mPredictions initialization.
                bEnter=0
            else:
                mPredictions=np.append(mPredictions,vPredIter,axis=0)

    print('mPredictions and vID')
    print(mPredictions.shape)
    print(len(lID))
    print(mPredictions)
    vAverage=np.mean(mPredictions,axis=1)
    if bRound==1:
        vPrediction=np.round(vAverage)
        vPrediction=vPrediction.clip(min=0)
    else:
        vPrediction=vAverage*30
    
    lID, vPrediction=zip(*sorted(zip(lID, vPrediction)))
          
    # Random forest training - regression
    
    
#clf=RandomForestClassifier(n_estimators)
#clf = clf.fit(X, Y)
#clf.score(X_test, y_test)

    #scores = cross_val_score(clf, mPredictions, vLabels, cv=15)
    #print('Cross-validation score:')
    #print(scores.mean())

    

    df = pd.DataFrame({'measurement_id': lID, 'prediction':vPrediction})
    df.to_csv(dest_dir+fileName+'.csv', index=False)
    print(dest_dir+fileName+'.csv')
   
    # we will include the testing data here
    
#print(mPredictions)
    return vAverage, lID

In [6]:
def calculateaccu(vPred, vTrue):

    accu=1-np.mean(np.absolute(np.round(vPred)-vTrue))
    print('Accuracy: ' + str(accu))

In [120]:
def calculateMSE(vPred,vTrue):
    MSE=math.sqrt(np.mean(np.square(vPred-vTrue)))
    print('MSE:'+str(MSE))
    

In [30]:
# Using Raghu's acoustic features - DETECTION
sFilePred1='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/AcousticScores.csv'
sFilePred2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertScores.csv'
sFileLabels='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/TrueLabels.csv'

lFilesPred=[sFilePred1,sFilePred2];

nudge=1
iEstimators=900
rLR=0.1
iMD=1
iRS=0

vPredCross, vTrueCross, vLabels, vRes1, vRes2=\
RandomForest_fusion_Cross (lFilesPred, sFileLabels, iEstimators, nudge, rLR, iMD,iRS)

print('Global: ')
calculateaccu(vPredCross, vTrueCross)
print('Acoustic model: ')
calculateaccu(vRes1, vLabels)

print('NLP model: ')
calculateaccu(vRes2, vLabels)


mPredictions and vLabels
(108, 2)
108
Global: 
Accuracy: 0.7924528301886793
Acoustic model: 
Accuracy: 0.6388888888888888
NLP model: 
Accuracy: 0.7777777777777778


In [31]:
# Using xvector-plda acoustic features -DETECTION
sFilePred1='/export/c08/lmorove1/kaldi/egs/xVecAD/v1/exp/3ann/resBestxVecFold_all/objs_35_.csv'
sFilePred2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertScores.csv'
sFileLabels='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/TrueLabels.csv'

lFilesPred=[sFilePred1,sFilePred2];

nudge=1
iEstimators=900
rLR=0.1
iMD=1
iRS=0

vPredCross, vTrueCross, vLabels, vRes1, vRes2=\
RandomForest_fusion_Cross (lFilesPred, sFileLabels, iEstimators, nudge, rLR, iMD,iRS)

print('Global: ')
calculateaccu(vPredCross, vTrueCross)

print('Acoustic model: ')
calculateaccu(vRes1, vLabels)

print('NLP model: ')
calculateaccu(vRes2, vLabels)

mPredictions and vLabels
(108, 2)
108
Global: 
Accuracy: 0.7641509433962264
Acoustic model: 
Accuracy: 0.7314814814814814
NLP model: 
Accuracy: 0.7777777777777778


In [124]:
# Using xvector-plda acoustic features - MMSE
sFilePred1='/export/c08/lmorove1/kaldi/egs/xVecAD/v1/exp/3ann/resBestxVecFold_all/MMSE_crossValidation.csv'
sFilePred2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertMMSE_crossval.csv'
sFileLabels='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/TrueMSE.csv'

lFilesPred=[sFilePred1,sFilePred2];

nudge=1
iEstimators=900
rLR=0.1
iMD=1
iRS=0

vPredCross, vTrueCross, vLabels, vRes1, vRes2=\
RandomForest_fusion_Cross (lFilesPred, sFileLabels, iEstimators, nudge, rLR, iMD,iRS)

print('Global: ')
calculateMSE(vPredCross, vTrueCross)

print('Acoustic model: ')
calculateMSE(vRes1, vLabels)

print('NLP model: ')
calculateMSE(30*vRes2, vLabels)

mPredictions and vLabels
(107, 2)
107
Global: 
MSE:6.356175549296717
Acoustic model: 
MSE:6.296548828856566
NLP model: 
MSE:6.521458232233364


In [140]:
def orderFiles(lDicts,dLabels, iNumFiles):
    vPredIter=np.zeros((1,iNumFiles))
    vLabels=[] #true label
    bEnter=1
    for k in lDicts[0]: #first dictionary will be the lead
        if k!='measurement_id':
            #print(k)
            #print(dID[k])
            if dLabels[k]!='NA':
                vLabels=np.append(vLabels,float(np.asarray(dLabels[k]))) #true labels
                vPredIter[0,0]=float(np.asarray(lDicts[0][k])) #first predicted value
                #print(vPredIter)
                for j in range(1, iNumFiles):
                    fPred=lDicts[j].get(k)
                if fPred:
                    vPredIter[0,j]=float(np.asarray(fPred))
                else:
                    print(['Unkwnown key:' + k])
                    vPredIter[0,j]=float(np.asarray(lDicts[0][k]))
                    
            if bEnter==1:
                mPredictions=vPredIter # mPredictions initialization.
                bEnter=0
            else:
                mPredictions=np.append(mPredictions,vPredIter,axis=0)
         
    return mPredictions, vLabels

In [156]:
def RandomForest_fusion_Cross_perfold (lFilesTrai,lFilesPred, sFileLabels, iEstimators, rLR, iMD,iRS):
    
    # Read file labels (true labels)
    
    with open(sFileLabels, mode='r') as infile:
        reader = csv.reader(infile)
        dLabels= {rows[0]:rows[1] for rows in reader} #participantID:label
    #print('thisis dLabels-------')
    #print(dLabels)
    
    # Training-testing data
    iNumFilesTest=len(lFilesPred)
    iNumFilesTrai=len(lFilesTrai)
#mPredictions=[] #np.zeros((1,iNumFiles))
#mPredictions=np.asarray(mPredictions)
    lDicts=[] 
    lDictsTrai=[]

    for sFilePred in lFilesPred:
        with open(sFilePred, mode='r') as infile:
            reader = csv.reader(infile)
            dPred = {rows[0]:rows[1] for rows in reader} #Prediction from the different classifiers (acoustic, w-embed)
        lDicts.append(dPred)
        
    for sFileTrai in lFilesTrai:
        with open(sFileTrai, mode='r') as infile:
            reader = csv.reader(infile)
            dTrai = {rows[0]:rows[1] for rows in reader} #Prediction from the different classifiers (acoustic, w-embed)
        lDictsTrai.append(dTrai)
   
    #creation of the matrix containing prediction from all classifiers

    mTrain, vLTrai=orderFiles(lDictsTrai, dLabels, iNumFilesTrai)
    mTest, vLTest=orderFiles(lDicts, dLabels, iNumFilesTest)
    
    print('mTrain and vLabels')
    print(mTrain.shape)
    print(len(vLTrai))
    
    vRes1=np.transpose(mTest[:,[0]])
    vRes2=np.transpose(mTest[:,[1]])
 
    # Random forest training - regression
    

    clf=GradientBoostingRegressor(n_estimators=iEstimators, learning_rate=rLR, max_depth=iMD,\
                                  random_state=iRS, loss='ls').fit(mTrain, vLTrai)
    
    vNewPred=clf.predict(mTest)

   
    # we will include the testing data here
    return (np.array(vNewPred), np.array(vLTest), np.array(vRes1), np.array(vRes2))
#print(mPredictions)

In [None]:
# Fusion per fold DETECTION

In [167]:
# Using xvector-finetuned acoustic features +BERT - Detection

sFileTrai1='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/AcousticScores_crossvalperfoldTrainfold_'
sFileTest1='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/AcousticScores_crossvalperfoldTestfold_'

sFileTrai2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertScores_crossvalperfoldTrainfold_'
sFileTest2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertScores_crossvalperfoldDevfold_'

sFileLabels='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/TrueLabels.csv'

vGlobalLabels=[]
vGlobalPreds=[]

vGlobalPreds1=[]

vGlobalPreds2=[]

for fold in range(1,11):
    lFilesTrai=[sFileTrai1+str(fold)+'.csv',sFileTrai2+str(fold)+'.csv']
    lFilesTest=[sFileTest1+str(fold)+'.csv',sFileTest2+str(fold)+'.csv']

    iEstimators=900
    rLR=0.1
    iMD=1
    iRS=0

    vPred, vLabels, vRes1, vRes2=\
    RandomForest_fusion_Cross_perfold (lFilesTrai,lFilesTest, sFileLabels, iEstimators, rLR, iMD,iRS)
    
    vGlobalPreds.extend(vPred)
    vGlobalLabels.extend(vLabels)
    
    vGlobalPreds1.extend(np.array(vRes1))
    vGlobalPreds2.extend(np.array(vRes2))
    
    print('Fold: '+str(fold))
    print('Global: ')
    calculateaccu(vPred, vLabels)

    print('Acoustic model: ')
    calculateaccu(vRes1, vLabels)

    print('NLP model: ')
    calculateaccu(vRes2, vLabels)
    
    
print('GLOBAL RESULTS:')
print('Global: ')
calculateaccu(np.array(vGlobalPreds), np.array(vGlobalLabels))

print('Acoustic model: ')
calculateaccu(np.hstack(vGlobalPreds1), np.array(vGlobalLabels))

print('NLP model: ')
calculateaccu(np.hstack(vGlobalPreds2), np.array(vGlobalLabels))

mTrain and vLabels
(86, 2)
86
Fold: 1
Global: 
Accuracy: 0.4545454545454546
Acoustic model: 
Accuracy: 0.4545454545454546
NLP model: 
Accuracy: 0.6363636363636364
mTrain and vLabels
(87, 2)
87
Fold: 2
Global: 
Accuracy: 0.6363636363636364
Acoustic model: 
Accuracy: 0.6363636363636364
NLP model: 
Accuracy: 0.8181818181818181
mTrain and vLabels
(87, 2)
87
Fold: 3
Global: 
Accuracy: 0.8
Acoustic model: 
Accuracy: 0.6
NLP model: 
Accuracy: 0.8
mTrain and vLabels
(86, 2)
86
Fold: 4
Global: 
Accuracy: 0.7272727272727273
Acoustic model: 
Accuracy: 0.6363636363636364
NLP model: 
Accuracy: 0.9090909090909091
mTrain and vLabels
(86, 2)
86
Fold: 5
Global: 
Accuracy: 0.8181818181818181
Acoustic model: 
Accuracy: 0.6363636363636364
NLP model: 
Accuracy: 0.8181818181818181
mTrain and vLabels
(86, 2)
86
Fold: 6
Global: 
Accuracy: 0.7272727272727273
Acoustic model: 
Accuracy: 0.8181818181818181
NLP model: 
Accuracy: 0.6363636363636364
mTrain and vLabels
(86, 2)
86
Fold: 7
Global: 
Accuracy: 0.63636363

In [None]:
# AVERAGE FUSION

In [108]:
# Using Raghu's acoustic features - Detection
sFilePred1='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/AcousticScores_crossval.csv'
sFilePred2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertScores_crossval.csv'
sFileLabels='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/TrueLabels.csv'

lFilesPred=[sFilePred1,sFilePred2];

nudge=1
iEstimators=900
rLR=0.1
iMD=1
iRS=0

vPredCross, vTrueCross, vLabels, vRes1, vRes2=\
average_fusion_cross(lFilesPred, sFileLabels, nudge,0)

print('Global: ')
calculateaccu(vPredCross, vTrueCross)
print('Acoustic model: ')
calculateaccu(vRes1, vLabels)

print('NLP model: ')
calculateaccu(vRes2, vLabels)


mPredictions and vLabels
(108, 2)
108
Global: 
Accuracy: 0.7452830188679245
Acoustic model: 
Accuracy: 0.6388888888888888
NLP model: 
Accuracy: 0.7777777777777778


In [107]:
# Using xVectors-PLDA - Detection
sFilePred1='/export/c08/lmorove1/kaldi/egs/xVecAD/v1/exp/3ann/resBestxVecFold_all/kFoldsResults_detection.csv'
sFilePred2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertScores_crossval.csv'
sFileLabels='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/TrueLabels.csv'

lFilesPred=[sFilePred1,sFilePred2];

nudge=1
iEstimators=900
rLR=0.1
iMD=1
iRS=0

vPredCross, vTrueCross, vLabels, vRes1, vRes2=\
average_fusion_cross(lFilesPred, sFileLabels, nudge,1)

print('Global: ')
calculateaccu(vPredCross, vTrueCross)
print('Acoustic model: ')
calculateaccu(vRes1, vLabels)

print('NLP model: ')
calculateaccu(vRes2, vLabels)


mPredictions and vLabels
(108, 2)
108
Global: 
Accuracy: 0.7169811320754718
Acoustic model: 
Accuracy: 0.7314814814814814
NLP model: 
Accuracy: 0.7777777777777778


In [128]:
# Using xVectors-PLDA MMSE
sFilePred1='/export/c08/lmorove1/kaldi/egs/xVecAD/v1/exp/3ann/resBestxVecFold_all/MMSE_crossValidation.csv'
sFilePred2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertMMSE_crossval.csv'
sFileLabels='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/TrueMSE.csv'

lFilesPred=[sFilePred1,sFilePred2];

nudge=1
iEstimators=900
rLR=0.1
iMD=1
iRS=0

vPredCross, vTrueCross, vLabels, vRes1, vRes2=\
average_fusion_cross_2(lFilesPred, sFileLabels, nudge,0) # This function multiplies sFIlePred2 x 30

print('Global: ')
calculateMSE(vPredCross, vTrueCross)

print('Acoustic model: ')
calculateMSE(vRes1, vLabels)

print('NLP model: ')
calculateMSE(vRes2, vLabels)

mPredictions and vLabels
(107, 2)
107
Global: 
MSE:5.930824005894585
Acoustic model: 
MSE:6.296548828856566
NLP model: 
MSE:6.521458232233364


In [215]:
#Submission round 1. 
# CLASS PREDICTION
# Average of BERT
sFilePred='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertScores_round1_'
lFilesPred=[]
for fold in range(1,11):
    lFilesPred.append(sFilePred+str(fold)+'.csv')
    
dest_dir='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/'
fileName='Round1Detection'
vPred, lID= average_fusion (lFilesPred, dest_dir, fileName, 1)


Number of analyzed file inputs: 10
mPredictions and vID
(48, 10)
48
[[0.72559565 0.28560138 0.06513522 0.1651601  0.07015276 0.03334882
  0.03571316 0.04052589 0.02987922 0.08371879]
 [0.72559565 0.28560138 0.06513522 0.1651601  0.07015276 0.03334882
  0.03571316 0.04052589 0.02987922 0.08371879]
 [0.79538405 0.8518368  0.94358134 0.93523407 0.89284587 0.86066794
  0.88467431 0.87550122 0.74595386 0.77696401]
 [0.79469365 0.86883652 0.57957286 0.68774277 0.13125151 0.03630467
  0.70297796 0.20872954 0.58943719 0.90434134]
 [0.80023128 0.86913019 0.94007659 0.93650758 0.89865911 0.0793495
  0.88406891 0.87186927 0.88059449 0.24052742]
 [0.79091942 0.87220013 0.94781786 0.9677155  0.89762121 0.28887475
  0.88238418 0.8705864  0.88658065 0.63545769]
 [0.78026366 0.37817809 0.79832673 0.73456442 0.13907486 0.03252541
  0.03684756 0.69943172 0.05709572 0.27978647]
 [0.77085972 0.82998526 0.71294636 0.94995075 0.80935282 0.03326242
  0.80169404 0.85270995 0.29835427 0.87070936]
 [0.79870576 

In [216]:
# CLASS PREDICTION
# Average of BERT detection -round 1 into CSV
sFilePred='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertScores_round1_'
lFilesPred=[]
for fold in range(1,11):
    lFilesPred.append(sFilePred+str(fold)+'.csv')
    
dest_dir='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/'
fileName='Detection_Bert1average'
vPred, lID= average_fusion_tocsv (lFilesPred, dest_dir, fileName, 1)



Number of analyzed file inputs: 10
mPredictions and vID
(48, 10)
48
[[0.72559565 0.28560138 0.06513522 0.1651601  0.07015276 0.03334882
  0.03571316 0.04052589 0.02987922 0.08371879]
 [0.72559565 0.28560138 0.06513522 0.1651601  0.07015276 0.03334882
  0.03571316 0.04052589 0.02987922 0.08371879]
 [0.79538405 0.8518368  0.94358134 0.93523407 0.89284587 0.86066794
  0.88467431 0.87550122 0.74595386 0.77696401]
 [0.79469365 0.86883652 0.57957286 0.68774277 0.13125151 0.03630467
  0.70297796 0.20872954 0.58943719 0.90434134]
 [0.80023128 0.86913019 0.94007659 0.93650758 0.89865911 0.0793495
  0.88406891 0.87186927 0.88059449 0.24052742]
 [0.79091942 0.87220013 0.94781786 0.9677155  0.89762121 0.28887475
  0.88238418 0.8705864  0.88658065 0.63545769]
 [0.78026366 0.37817809 0.79832673 0.73456442 0.13907486 0.03252541
  0.03684756 0.69943172 0.05709572 0.27978647]
 [0.77085972 0.82998526 0.71294636 0.94995075 0.80935282 0.03326242
  0.80169404 0.85270995 0.29835427 0.87070936]
 [0.79870576 

In [217]:
# CLASS PREDICTION acoustic model - average
# Average of acoustic Model detection -round 4 into CSV
sFilePred='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/AcousticScores_round4_'
lFilesPred=[]
for fold in range(1,11):
    lFilesPred.append(sFilePred+str(fold)+'.csv')
    
dest_dir='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/'
fileName='Detection_Acoustic4average'
vPred, lID= average_fusion_tocsv (lFilesPred, dest_dir, fileName, 1)



Number of analyzed file inputs: 10
mPredictions and vID
(48, 10)
48
[[0.51763761 0.2578373  0.14571771 0.5064835  0.74108922 0.35053664
  0.7714172  0.76346982 0.06099356 0.11149926]
 [0.51763761 0.2578373  0.14571771 0.5064835  0.74108922 0.35053664
  0.7714172  0.76346982 0.06099356 0.11149926]
 [0.23065864 0.2212508  0.16180502 0.71250218 0.53240657 0.46810126
  0.35216439 0.40503079 0.08730643 0.30864367]
 [0.82566488 0.72241318 0.569121   0.69916546 0.67688084 0.63440645
  0.75774312 0.82172    0.44276404 0.70732516]
 [0.50534004 0.36173421 0.38202581 0.30435023 0.11345185 0.50515288
  0.36811358 0.54699355 0.12518969 0.4687393 ]
 [0.58118993 0.8056044  0.81595731 0.53918624 0.38087618 0.5194236
  0.22149722 0.78720903 0.39380825 0.58077335]
 [0.37688807 0.21372975 0.17327426 0.23326358 0.41138721 0.22827107
  0.26233089 0.41358587 0.03383163 0.10419046]
 [0.5976162  0.38408363 0.46803305 0.58773547 0.59004605 0.65957624
  0.56131685 0.77182859 0.23941948 0.54479343]
 [0.97237223 

In [229]:
#Submission round 1. 
# MMSE
# Average of BERT
sFilePred='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertMMSE_round1_'
lFilesPred=[]
for fold in range(1,11):
    lFilesPred.append(sFilePred+str(fold)+'.csv')
    
dest_dir='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/'
fileName='Round1MMSE'
vPred, lID= average_fusion (lFilesPred, dest_dir, fileName, 0)


Number of analyzed file inputs: 10
mPredictions and vID
(48, 10)
48
[[23.60571563 23.46631408 28.7753123  21.74768686 24.67330635 24.62428808
  23.05798531 22.08063483 23.8766849  22.74387538]
 [23.60571563 23.46631408 28.7753123  21.74768686 24.67330635 24.62428808
  23.05798531 22.08063483 23.8766849  22.74387538]
 [24.06462371 24.09753263 28.55808198 22.72558987 23.79175186 24.84918237
  21.5843457  22.6585865  22.61762023 23.11316013]
 [23.17144454 23.04545581 25.82623422 18.72650564 20.50527871 22.04661369
  20.73980749 19.14469063 18.96748245 22.25191891]
 [23.3160764  23.20010662 27.01225877 19.48866963 22.45022178 23.60229671
  22.20326543 20.05918801 19.65840518 22.43344903]
 [23.48778427 23.3510685  26.58626139 20.39819062 22.10777342 23.43059242
  21.48945451 20.85258365 20.53010523 22.52137721]
 [24.10005033 24.00746047 29.35370564 22.0120883  24.57393587 24.64343727
  22.39683867 22.18738317 23.73419881 22.83809066]
 [23.66023242 23.44177008 28.80832851 21.55219316 24.2657

In [230]:
# Submission Round 3
# MMSE average of BERT and x-vectors PLDA
sFilePred1='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/finalMMSE-xvectorsplda.csv'
sFilePred2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertMMSE_round1_'
lFilesPred=[]
for fold in range(1,11):
    lFilesPred.append(sFilePred2+str(fold)+'.csv')
    lFilesPred.append(sFilePred1)
    
dest_dir='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/'
fileName='Round3MMSE'
vPred, lID= average_fusion(lFilesPred, dest_dir, fileName, 0)
    

Number of analyzed file inputs: 20
mPredictions and vID
(48, 20)
48
[[23.60571563 20.97770694 23.46631408 20.97770694 28.7753123  20.97770694
  21.74768686 20.97770694 24.67330635 20.97770694 24.62428808 20.97770694
  23.05798531 20.97770694 22.08063483 20.97770694 23.8766849  20.97770694
  22.74387538 20.97770694]
 [23.60571563 20.97770694 23.46631408 20.97770694 28.7753123  20.97770694
  21.74768686 20.97770694 24.67330635 20.97770694 24.62428808 20.97770694
  23.05798531 20.97770694 22.08063483 20.97770694 23.8766849  20.97770694
  22.74387538 20.97770694]
 [24.06462371 33.30118939 24.09753263 33.30118939 28.55808198 33.30118939
  22.72558987 33.30118939 23.79175186 33.30118939 24.84918237 33.30118939
  21.5843457  33.30118939 22.6585865  33.30118939 22.61762023 33.30118939
  23.11316013 33.30118939]
 [23.17144454 21.58944575 23.04545581 21.58944575 25.82623422 21.58944575
  18.72650564 21.58944575 20.50527871 21.58944575 22.04661369 21.58944575
  20.73980749 21.58944575 19.14469063

In [232]:
# Submission Round 4
# MMSE average of BERT and x-vectors PLDA
sFilePred1='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/finalMMSE-xvectorsplda.csv'
sFilePred2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertMMSE_round1_'
lFilesPred=[]
for fold in range(3,11):
    lFilesPred.append(sFilePred2+str(fold)+'.csv')
    
lFilesPred.append(sFilePred1)

dest_dir='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/'
fileName='Round4MMSE'
vPred, lID= average_fusion (lFilesPred, dest_dir, fileName, 0)
    

Number of analyzed file inputs: 9
mPredictions and vID
(48, 9)
48
[[28.7753123  21.74768686 24.67330635 24.62428808 23.05798531 22.08063483
  23.8766849  22.74387538 20.97770694]
 [28.7753123  21.74768686 24.67330635 24.62428808 23.05798531 22.08063483
  23.8766849  22.74387538 20.97770694]
 [28.55808198 22.72558987 23.79175186 24.84918237 21.5843457  22.6585865
  22.61762023 23.11316013 33.30118939]
 [25.82623422 18.72650564 20.50527871 22.04661369 20.73980749 19.14469063
  18.96748245 22.25191891 21.58944575]
 [27.01225877 19.48866963 22.45022178 23.60229671 22.20326543 20.05918801
  19.65840518 22.43344903 25.04373755]
 [26.58626139 20.39819062 22.10777342 23.43059242 21.48945451 20.85258365
  20.53010523 22.52137721 24.46814765]
 [29.35370564 22.0120883  24.57393587 24.64343727 22.39683867 22.18738317
  23.73419881 22.83809066 27.9617009 ]
 [28.80832851 21.55219316 24.26579297 24.27590847 22.85587549 21.66037917
  22.89114296 22.66313732 27.09034778]
 [23.14789474 17.79807329 19.36

In [233]:
def orderFiles_final(lDicts, iNumFiles):
    vPredIter=np.zeros((1,iNumFiles))
    vID=[] #true label
    bEnter=1
    for k in lDicts[0]: #first dictionary will be the lead
        if k!='measurement_id':
            #print(k)
            #print(dID[k])

            vPredIter[0,0]=float(np.asarray(lDicts[0][k])) #first predicted value
            #print(vPredIter)
            for j in range(1, iNumFiles):
                fPred=lDicts[j].get(k)
            if fPred:
                vID.append(k)
                vPredIter[0,j]=float(np.asarray(fPred))
            else:
                print(['Unkwnown key:' + k])
                vPredIter[0,j]=float(np.asarray(lDicts[0][k]))

            if bEnter==1:
                mPredictions=vPredIter # mPredictions initialization.
                bEnter=0
            else:
                mPredictions=np.append(mPredictions,vPredIter,axis=0)
         
    return mPredictions, vID

In [234]:
def RandomForest_fusion_FINAL (lFilesTrai,lFilesPred, sFileLabels, iEstimators, rLR, iMD,iRS, dest_dir,sNameOut):
    
    # Read file labels (true labels)
    
    with open(sFileLabels, mode='r') as infile:
        reader = csv.reader(infile)
        dLabels= {rows[0]:rows[1] for rows in reader} #participantID:label
    #print('thisis dLabels-------')
    #print(dLabels)
    
    # Training-testing data
    iNumFilesTest=len(lFilesPred)
    iNumFilesTrai=len(lFilesTrai)
#mPredictions=[] #np.zeros((1,iNumFiles))
#mPredictions=np.asarray(mPredictions)
    lDicts=[] 
    lDictsTrai=[]

    for sFilePred in lFilesPred:
        with open(sFilePred, mode='r') as infile:
            reader = csv.reader(infile)
            dPred = {rows[0]:rows[1] for rows in reader} #Prediction from the different classifiers (acoustic, w-embed)
        lDicts.append(dPred)
        
    for sFileTrai in lFilesTrai:
        with open(sFileTrai, mode='r') as infile:
            reader = csv.reader(infile)
            dTrai = {rows[0]:rows[1] for rows in reader} #Prediction from the different classifiers (acoustic, w-embed)
        lDictsTrai.append(dTrai)
   
    #creation of the matrix containing prediction from all classifiers

    mTrain, vLTrai=orderFiles(lDictsTrai, dLabels, iNumFilesTrai)
    mTest, vIDTest=orderFiles_final(lDicts, iNumFilesTest)
    
    print('mTrain and vLabels')
    print(mTrain.shape)
    print(len(vLTrai))
    
    vRes1=np.transpose(mTest[:,[0]])
    vRes2=np.transpose(mTest[:,[1]])
 
    # Random forest training - regression
    

    clf=GradientBoostingRegressor(n_estimators=iEstimators, learning_rate=rLR, max_depth=iMD,\
                                  random_state=iRS, loss='ls').fit(mTrain, vLTrai)
    
    vNewPred=clf.predict(mTest)
    
    vIDTest, vNewPred=zip(*sorted(zip(vIDTest, vNewPred)))
          
   
    df = pd.DataFrame({'measurement_id': vIDTest, 'prediction':np.round(np.array(vNewPred))})
    np.savetxt(dest_dir+sNameOut+'.txt', df.values, fmt='%s', delimiter=' ; ')
    print(dest_dir+sNameOut+'.txt')
    # we will include the testing data here
    return (np.round(np.array(vNewPred)), vIDTest, np.array(vRes1), np.array(vRes2))
       
    
#print(mPredictions)

In [251]:
# Submission Round 3, DETECTION: fusion of xvectorsplda + Bert
sFilePred1='/export/c08/lmorove1/kaldi/egs/xVecAD/v1/exp/3ann/resBestxVecFold_all/kFoldsResults_detection.csv'
sFilePred2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertScores_crossval.csv'
sFileLabels='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/TrueLabels.csv'

sFileFinal1='/export/c08/lmorove1/kaldi/egs/xVecAD/v1/exp/3ann/resBestxVecFold_all/FinalxvectorsDetection.csv'

sFileFinal2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/Detection_Bert1average.csv'

dest_dir='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/'
sNameOut='Round3Detection'
lFilesTrai=[sFilePred1,sFilePred2];

lFilesFinal=[sFileFinal1,sFileFinal2]

nudge=1
iEstimators=900
rLR=0.1
iMD=1
iRS=0

vPred, vID, vRes1, vRes2=\
RandomForest_fusion_FINAL (lFilesTrai,lFilesFinal,sFileLabels, iEstimators, rLR, iMD,iRS,dest_dir, sNameOut)



mTrain and vLabels
(108, 2)
108
/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/Round3Detection.txt


In [253]:
# Submission Round 4, DETECTION: fusion of xvectorfine tuned + Bert
sFilePred1='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/AcousticScores_crossval.csv'
sFilePred2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/BertScores_crossval.csv'
sFileLabels='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/TrueLabels.csv'

sFileFinal1='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/Detection_Acoustic4average.csv'

sFileFinal2='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/Detection_Bert1average.csv'

dest_dir='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/'
sNameOut='Round4Detection'
lFilesTrai=[sFilePred1,sFilePred2];

lFilesFinal=[sFileFinal1,sFileFinal2]

nudge=1
iEstimators=900
rLR=0.1
iMD=1
iRS=0

vPred, vID, vRes1, vRes2=\
RandomForest_fusion_FINAL (lFilesTrai,lFilesFinal,sFileLabels, iEstimators, rLR, iMD,iRS,dest_dir, sNameOut)


mTrain and vLabels
(108, 2)
108
/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/Round4Detection.txt


In [250]:
sFile='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/Old_sent1/Detection_'
rang=range(1,6)
dest_dir='/export/c08/lmorove1/kaldi/egs/beatPDivec/jupyternotebooks/Adress/DataResults/newDetection/'

for i in rang:
    
    sInfile=sFile+str(i)+'.txt'
    filename='test_results-classif-'+str(i)
    lID=[]
    vLabel=[]

    with open(sInfile,'r') as foo:
        for each_line in foo:
            print(each_line)
            lines = each_line.split(";")

            lID.append(lines[0].strip())
            #print(lines[0])
            #print(lines[2].strip())
            if lines[0]!='ID   ':
                newLabel=str(1-float(lines[1].strip()))
            else:
                newLabel=lines[1].strip()
            #print(newLabel)
            vLabel.append(newLabel)

    lID, vPrediction=zip(*sorted(zip(lID, vLabel)))
    df = pd.DataFrame({'ID': lID, 'Prediction':vLabel})
    np.savetxt(dest_dir+filename+'.txt', df.values, fmt='%s', delimiter=' ; ')


ID   ; Prediction

S160 ; 1.0

S161 ; 1.0

S162 ; 1.0

S163 ; 1.0

S164 ; 0.0

S165 ; 0.0

S166 ; 1.0

S167 ; 0.0

S168 ; 1.0

S169 ; 1.0

S170 ; 0.0

S171 ; 0.0

S172 ; 1.0

S173 ; 0.0

S174 ; 1.0

S175 ; 0.0

S176 ; 0.0

S177 ; 1.0

S178 ; 1.0

S179 ; 0.0

S180 ; 1.0

S181 ; 0.0

S182 ; 0.0

S183 ; 1.0

S184 ; 0.0

S185 ; 1.0

S186 ; 1.0

S187 ; 0.0

S188 ; 0.0

S189 ; 0.0

S190 ; 0.0

S191 ; 0.0

S192 ; 0.0

S193 ; 0.0

S194 ; 0.0

S195 ; 0.0

S196 ; 1.0

S197 ; 0.0

S198 ; 0.0

S199 ; 1.0

S200 ; 0.0

S201 ; 0.0

S202 ; 0.0

S203 ; 0.0

S204 ; 1.0

S205 ; 0.0

S206 ; 1.0

S207 ; 0.0

ID   ; Prediction

S160 ; 1.0

S161 ; 1.0

S162 ; 1.0

S163 ; 0.0

S164 ; 0.0

S165 ; 0.0

S166 ; 1.0

S167 ; 1.0

S168 ; 1.0

S169 ; 0.0

S170 ; 1.0

S171 ; 0.0

S172 ; 1.0

S173 ; 0.0

S174 ; 1.0

S175 ; 1.0

S176 ; 1.0

S177 ; 0.0

S178 ; 0.0

S179 ; 0.0

S180 ; 0.0

S181 ; 0.0

S182 ; 0.0

S183 ; 0.0

S184 ; 0.0

S185 ; 0.0

S186 ; 1.0

S187 ; 0.0

S188 ; 0.0

S189 ; 0.0

S190 ; 0.0

S191 ; 0.0

S1