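# Voting

Hard- and soft-voting ensembles of logistic regression, Gaussian naive Bayes and random forest classifiers, scored by accuracy and F1 with shuffle-split cross-validation.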


In [23]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [3]:
def loadCSV(pathSamples, pathMatrix):
    """Load the feature matrix and the integer labels from two CSV files.

    Parameters
    ----------
    pathSamples : str or path-like
        Comma-separated file that contains an ``Eligible`` column; that
        column is cast to ``int`` and returned as the labels.
    pathMatrix : str or path-like
        Comma-separated file whose columns are all used as features,
        one row per sample.

    Returns
    -------
    X : numpy.ndarray
        All values of ``pathMatrix``, shape ``(n_rows, n_columns)``.
    Y : pandas.Series
        The ``Eligible`` column of ``pathSamples`` as integers.

    Raises
    ------
    KeyError
        If ``pathSamples`` has no ``Eligible`` column.
    ValueError
        If the two files do not contain the same number of rows.
    """
    features = pd.read_csv(pathMatrix)
    samples = pd.read_csv(pathSamples)

    Y = samples['Eligible'].astype(int)
    X = features.values

    # Fail fast with a message that names both files, instead of letting the
    # mismatch surface later inside an estimator's fit call.
    if len(X) != len(Y):
        raise ValueError(
            "Row count mismatch: %r has %d rows but %r has %d rows"
            % (pathMatrix, len(X), pathSamples, len(Y))
        )
    return X, Y

In [4]:
# Load the CSV files: labels come from the samples file, features from the matrix file.
samples_path = "../../Dataset/10k_1Col_NoCarEsp_LSA.csv"
matrix_path = "../../Tables/docsTopicsLSA1200.csv"
X, Y = loadCSV(samples_path, matrix_path)

In [5]:
# Base learners that the voting ensembles combine.
# Both the 'saga' solver (it shuffles the data) and the random forest
# (bootstrap samples and feature subsets) are stochastic, so they get a fixed
# seed; otherwise the reported scores change from run to run even though the
# cross-validation splitter itself is seeded.
RANDOM_STATE = 0
LR = LogisticRegression(C=.5, solver='saga', max_iter=200,
                        random_state=RANDOM_STATE)
nb = GaussianNB()
RF = RandomForestClassifier(n_estimators=100, max_features=100, n_jobs=-1,
                            random_state=RANDOM_STATE)

In [6]:
# Each label must name the estimator it is attached to: the labels become the
# keys of `named_estimators_` and the prefixes used by `set_params`
# (e.g. `rf__n_estimators`), so a swapped label silently targets the wrong model.
base_estimators = [('lr', LR), ('gnb', nb), ('rf', RF)]

# Same base learners, two ways of combining them: 'hard' takes a majority vote
# over predicted labels, 'soft' picks the class with the highest summed
# predicted probability. Each ensemble gets its own copy of the list.
eclf_hard = VotingClassifier(estimators=list(base_estimators), voting='hard')
eclf_soft = VotingClassifier(estimators=list(base_estimators), voting='soft')

In [29]:
# Evaluate both ensembles on the same shuffle-split folds. No `scoring` is
# passed, so learning_curve reports the estimator's own default score (labelled
# "accuracy" in the printout below).
cv = ShuffleSplit(n_splits=4, test_size=0.2, random_state=0)
curve_options = dict(cv=cv, n_jobs=-1, train_sizes=[.2], return_times=True)

# Hard voting; the timer wraps the whole learning_curve call.
start_hard = time.time()
(train_sizes_hard, train_scores_hard, test_scores_hard,
 fit_times_hard, _) = learning_curve(eclf_hard, X, Y, **curve_options)
end_hard = time.time()

# Soft voting, timed the same way.
start_soft = time.time()
(train_sizes_soft, train_scores_soft, test_scores_soft,
 fit_times_soft, _) = learning_curve(eclf_soft, X, Y, **curve_options)
end_soft = time.time()

# One row per printed line: (heading, per-fold scores, wall-clock seconds).
# Train and test rows of one ensemble share the same elapsed time because both
# come from a single learning_curve call.
summary_rows = (
    ('Hard train accuracy: ', train_scores_hard, end_hard - start_hard),
    ('Hard test accuracy: ', test_scores_hard, end_hard - start_hard),
    ('Soft train accuracy: ', train_scores_soft, end_soft - start_soft),
    ('Soft test accuracy: ', test_scores_soft, end_soft - start_soft),
)
for heading, fold_scores, elapsed in summary_rows:
    print(heading, np.mean(fold_scores), ' +/- ', np.std(fold_scores), ". Time: ", elapsed)

Hard train accuracy:  0.9346875000000001  +/-  0.00396518048139048 . Time:  18.442303895950317
Hard test accuracy:  0.708625  +/-  0.008897857888278515 . Time:  18.442303895950317
Soft train accuracy:  0.8293750000000001  +/-  0.013571281719130317 . Time:  19.937783241271973
Soft test accuracy:  0.651125  +/-  0.01529859062136117 . Time:  19.937783241271973


In [30]:
# F1 scorer that treats label 0 as the positive class.
# make_scorer forwards extra keyword arguments to the metric, so no lambda
# wrapper is needed; passing the function directly also keeps the scorer
# picklable by the standard pickler. `labels` and `sample_weight` are left at
# their defaults (None), exactly as before.
# NOTE(review): confirm that class 0 (rather than Eligible == 1) is the class
# the F1 score is meant to target.
scorer = make_scorer(f1_score, pos_label=0, average='binary')

# Repeat the evaluation of both ensembles on the same shuffle-split folds,
# this time scored with the custom `scorer` instead of the default score.
cv = ShuffleSplit(n_splits=4, test_size=0.2, random_state=0)
curve_options = dict(cv=cv, n_jobs=-1, train_sizes=[.2],
                     return_times=True, scoring=scorer)

# Hard voting; the timer wraps the whole learning_curve call.
start_hard = time.time()
(train_sizes_hard, train_scores_hard, test_scores_hard,
 fit_times_hard, _) = learning_curve(eclf_hard, X, Y, **curve_options)
end_hard = time.time()

# Soft voting, timed the same way.
start_soft = time.time()
(train_sizes_soft, train_scores_soft, test_scores_soft,
 fit_times_soft, _) = learning_curve(eclf_soft, X, Y, **curve_options)
end_soft = time.time()

# One row per printed line: (heading, per-fold scores, wall-clock seconds).
# Train and test rows of one ensemble share the same elapsed time because both
# come from a single learning_curve call.
summary_rows = (
    ('Hard train F1 score: ', train_scores_hard, end_hard - start_hard),
    ('Hard test F1 score: ', test_scores_hard, end_hard - start_hard),
    ('Soft train F1 score: ', train_scores_soft, end_soft - start_soft),
    ('Soft test F1 score: ', test_scores_soft, end_soft - start_soft),
)
for heading, fold_scores, elapsed in summary_rows:
    print(heading, np.mean(fold_scores), ' +/- ', np.std(fold_scores), ". Time: ", elapsed)

Hard train F1 score:  0.9369086346246075  +/-  0.003925959362210268 . Time:  20.114948272705078
Hard test F1 score:  0.7303542980511957  +/-  0.005675625132246841 . Time:  20.114948272705078
Soft train F1 score:  0.8416615037707937  +/-  0.011780081027623375 . Time:  18.36276936531067
Soft test F1 score:  0.6905909107148147  +/-  0.012149342708924123 . Time:  18.36276936531067
