# Voting classifiers: hard vs. soft voting


In [2]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [3]:
def loadCSV(pathSamples, pathMatrix):
    """Load the feature matrix and the eligibility labels from two CSV files.

    Parameters
    ----------
    pathSamples : str or path-like
        Comma-separated file that contains an ``Eligible`` column. Only that
        column is used; it is cast to ``int`` and returned as the target.
    pathMatrix : str or path-like
        Comma-separated file whose columns are all used as features.

    Returns
    -------
    X : numpy.ndarray
        Every value of ``pathMatrix``, one row per CSV row.
    Y : pandas.Series
        The ``Eligible`` column of ``pathSamples`` as integers.

    Raises
    ------
    KeyError
        If ``pathSamples`` has no ``Eligible`` column.
    ValueError
        If the two files do not contain the same number of rows.
    """
    features = pd.read_csv(pathMatrix)
    samples = pd.read_csv(pathSamples)

    # Fail fast here rather than deep inside scikit-learn when the label file
    # and the feature matrix are not row-aligned.
    if len(features) != len(samples):
        raise ValueError(
            "Row count mismatch: {!r} has {} rows but {!r} has {} rows".format(
                pathMatrix, len(features), pathSamples, len(samples)))

    Y = samples['Eligible'].astype(int)
    X = features.values
    return X, Y

In [4]:
# Load the CSV files: labels come from the samples file, features from the
# document-topic matrix.
X, Y = loadCSV(
    pathSamples="../../Dataset/10k_1Col_NoCarEsp_LSA.csv",
    pathMatrix="../../Tables/docsTopicsLSA1200.csv",
)

In [6]:
# Base estimators for the voting ensembles.
# A fixed seed makes the stochastic estimators reproducible: the 'saga' solver
# shuffles the data and the random forest bootstraps rows / samples features,
# so without it every run reports slightly different scores.
RANDOM_STATE = 0
LR = LogisticRegression(C=.5, solver='saga', max_iter=200,
                        random_state=RANDOM_STATE)
nb = GaussianNB()  # has no random_state parameter
RF = RandomForestClassifier(n_estimators=100, max_features=100, n_jobs=-1,
                            random_state=RANDOM_STATE)

In [7]:
# Hard- and soft-voting ensembles over the same two base estimators.
# VotingClassifier fits clones of the estimators it is given, so LR and RF can
# safely be shared by both ensembles.
#
# The forest is registered as 'rf'; it was previously labelled 'gnb' even
# though no GaussianNB takes part in the vote, which made
# `named_estimators_` misleading.
#
# NOTE: with only two voters, hard voting ties whenever they disagree, and
# scikit-learn breaks ties in favour of the lowest class label. Add a third
# estimator if an unbiased majority vote is wanted.
base_estimators = [('lr', LR), ('rf', RF)]
eclf_hard = VotingClassifier(estimators=list(base_estimators), voting='hard')
eclf_soft = VotingClassifier(estimators=list(base_estimators), voting='soft')

In [8]:
# Four seeded random 80/20 train/test splits.
cv = ShuffleSplit(n_splits=4, test_size=0.2, random_state=0)

# Arguments shared by both runs. No `scoring` is passed, so the estimator's
# default score is used (reported below as accuracy). A single training size
# of 0.2 is evaluated instead of a full curve.
curve_kwargs = dict(cv=cv, n_jobs=-1, train_sizes=[.2], return_times=True)

## Hard
start_hard = time.time()
(train_sizes_hard, train_scores_hard, test_scores_hard,
 fit_times_hard, _) = learning_curve(eclf_hard, X, Y, **curve_kwargs)
end_hard = time.time()

## Soft
start_soft = time.time()
(train_sizes_soft, train_scores_soft, test_scores_soft,
 fit_times_soft, _) = learning_curve(eclf_soft, X, Y, **curve_kwargs)
end_soft = time.time()

# Mean +/- standard deviation over the CV splits. The time is the wall-clock
# duration of the whole learning_curve call, so the train and test lines of
# one ensemble show the same value.
elapsed_hard = end_hard - start_hard
elapsed_soft = end_soft - start_soft
for label, scores, elapsed in (
        ('Hard train accuracy: ', train_scores_hard, elapsed_hard),
        ('Hard test accuracy: ', test_scores_hard, elapsed_hard),
        ('Soft train accuracy: ', train_scores_soft, elapsed_soft),
        ('Soft test accuracy: ', test_scores_soft, elapsed_soft),
):
    print(label, np.mean(scores), ' +/- ', np.std(scores), ". Time: ", elapsed)

Hard train accuracy:  0.94859375  +/-  0.0042705157402238726 . Time:  41.158541440963745
Hard test accuracy:  0.709125  +/-  0.004668712349245789 . Time:  41.158541440963745
Soft train accuracy:  0.99375  +/-  0.0010825317547305251 . Time:  41.68452286720276
Soft test accuracy:  0.745625  +/-  0.0044211847959568385 . Time:  41.68452286720276


In [9]:
# Binary F1 with class 0 treated as the positive class; every other f1_score
# argument keeps its default.
# NOTE(review): confirm that class 0 is the intended positive label.
scorer = make_scorer(f1_score, pos_label=0)

# Four seeded random 80/20 train/test splits.
cv = ShuffleSplit(n_splits=4, test_size=0.2, random_state=0)

# Arguments shared by both runs; a single training size of 0.2 is evaluated.
curve_kwargs = dict(cv=cv, n_jobs=-1, train_sizes=[.2], return_times=True,
                    scoring=scorer)

## Hard
start_hard = time.time()
(train_sizes_hard, train_scores_hard, test_scores_hard,
 fit_times_hard, _) = learning_curve(eclf_hard, X, Y, **curve_kwargs)
end_hard = time.time()

## Soft
start_soft = time.time()
(train_sizes_soft, train_scores_soft, test_scores_soft,
 fit_times_soft, _) = learning_curve(eclf_soft, X, Y, **curve_kwargs)
end_soft = time.time()

# Mean +/- standard deviation over the CV splits. The time is the wall-clock
# duration of the whole learning_curve call, so the train and test lines of
# one ensemble show the same value.
elapsed_hard = end_hard - start_hard
elapsed_soft = end_soft - start_soft
for label, scores, elapsed in (
        ('Hard train F1 score: ', train_scores_hard, elapsed_hard),
        ('Hard test F1 score: ', test_scores_hard, elapsed_hard),
        ('Soft train F1 score: ', train_scores_soft, elapsed_soft),
        ('Soft test F1 score: ', test_scores_soft, elapsed_soft),
):
    print(label, np.mean(scores), ' +/- ', np.std(scores), ". Time: ", elapsed)

Hard train F1 score:  0.9515355722714898  +/-  0.004056959729707652 . Time:  30.08770751953125
Hard test F1 score:  0.7446185438742998  +/-  0.004990784551127898 . Time:  30.08770751953125
Soft train F1 score:  0.9933493409375109  +/-  0.0020148281838788555 . Time:  26.99445676803589
Soft test F1 score:  0.7517461164623338  +/-  0.004286740872537101 . Time:  26.99445676803589
