In [1]:
import pandas as pd
from sklearn import preprocessing
from collections import defaultdict
import numpy as np
import time
import qgrid

In [2]:
def loadCSV(pathSamples, pathMatrix):
    """Read the feature matrix and the sample labels from two comma-separated files.

    Parameters
    ----------
    pathSamples : str
        File that contains an ``Eligible`` column; that column becomes ``Y``.
    pathMatrix : str
        File whose cell values become the feature matrix ``X``.

    Returns
    -------
    tuple
        ``(X, Y, columns)``: the values of the matrix file as an array, the
        ``Eligible`` column cast to ``int``, and the column labels of the
        matrix file.
    """
    matrix_frame = pd.read_table(pathMatrix, sep=',')
    samples_frame = pd.read_table(pathSamples, sep=',')

    feature_names = matrix_frame.columns
    # Full slice over every row and column of the matrix values.
    features = matrix_frame.values[:, :]

    labels = samples_frame['Eligible'].astype(int)
    return features, labels, feature_names

In [3]:
# First path is the samples file (read for its 'Eligible' column), second is the feature-matrix file.
X, Y, columns = loadCSV("../../Dataset/10k_1Col_NoCarEsp_LSA.csv", "../../Tables/docsTopicsLSA1500.csv") # Load the CSV files

In [4]:
from sklearn.model_selection import train_test_split 
# Hold out 20% of the rows as a test set.
# NOTE(review): no other visible cell reads Xtrain/Xtest/Ytrain/Ytest; xgBoost() performs its own split.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=.2) # TODO: change the validation methodology

In [5]:
import xgboost
from sklearn.metrics import accuracy_score

def xgBoost(X,Y,learning_rate=0.1,  colsample_bytree = 0.3, max_depth = 5, n_estimators = 20, reg_alpha = 10,
            random_state=None):
    """Train an XGBoost binary classifier on an 80/20 split and report its accuracy.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    Y : array-like
        Binary labels; a pandas Series or a plain array both work.
    learning_rate, colsample_bytree, max_depth, n_estimators, reg_alpha
        Forwarded unchanged to ``xgboost.XGBClassifier``.
    random_state : int or None, optional
        Seed for the train/test split. The default ``None`` keeps the original
        behaviour of a different random split on every call; pass an integer to
        make runs comparable.

    Returns
    -------
    tuple
        ``(model, elapsed, accuracyTrain, accuracyTest)`` where ``elapsed`` is the
        number of seconds spent fitting and predicting, formatted as a string,
        and both accuracies are fractions as returned by ``accuracy_score``.
    """
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=.2, random_state=random_state)

    # The timer starts after the split, so it covers fitting and prediction only.
    start_time = time.time()

    # eval_metric is given to the constructor: passing it to fit() is deprecated
    # in recent XGBoost releases and rejected by the newest ones.
    model = xgboost.XGBClassifier(objective ='binary:logistic', colsample_bytree = colsample_bytree, learning_rate = learning_rate,
                    max_depth = max_depth, n_estimators = n_estimators, booster = 'gbtree', reg_alpha = reg_alpha,
                    eval_metric = 'logloss')

    model.fit(Xtrain, Ytrain)

    pred_test = model.predict(Xtest)
    pred_train = model.predict(Xtrain)

    # accuracy_score takes Series and arrays alike, so no `.values` is needed
    # (which would fail for labels that are already a numpy array).
    accuracyTrain = accuracy_score(Ytrain, pred_train)
    accuracyTest = accuracy_score(Ytest, pred_test)

    return model, str(time.time() - start_time), accuracyTrain, accuracyTest

In [57]:
import time
import qgrid

randn = np.random.randn  # not used in this cell; left defined in case another cell references it

# Names of the hyper-parameters; they double as xgBoost() keyword arguments
# and as the levels of the result table's index.
param_names = ['learning_rate', 'colsample_bytree', 'max_depth', 'n_estimators', 'reg_alpha']
result_names = ["Accuracy train", "Accuracy test", "Tiempo de ejecución"]

# One row per configuration to evaluate.
df_params = pd.DataFrame({
    'learning_rate' : [0.2,0.2,0.15,0.3],
    'colsample_bytree' : [0.4,0.8,0.6,0.3],
    'max_depth' : [12,10,3,4],
    'reg_alpha' : [50,100,50,0],
    'n_estimators' : [40,40,10,20]})

# Results are collected as plain records and turned into a frame once at the end.
# The previous version wrote them with chained indexing (df[col][key] = value)
# while silencing pandas' chained-assignment warning; with copy-on-write that
# pattern assigns to a temporary and leaves the table empty.
records = []
for config in df_params[param_names].to_dict('records'):
    modelo, time_finish, accuracyTrain, accuracyTest = xgBoost(X, Y, **config)

    record = dict(config)
    record["Accuracy train"] = accuracyTrain * 100.0   # percentages
    record["Accuracy test"] = accuracyTest * 100.0
    record["Tiempo de ejecución"] = time_finish         # seconds, as the string returned by xgBoost()
    records.append(record)

df_types = pd.DataFrame(records, columns=param_names + result_names).set_index(param_names)

qgrid_widget = qgrid.show_grid(df_types, show_toolbar=False)
qgrid_widget.get_changed_df()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Accuracy train,Accuracy test,Tiempo de ejecución
learning_rate,colsample_bytree,max_depth,n_estimators,reg_alpha,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.2,0.4,12,40,50,89.25,75.25,7.038994550704956
0.2,0.8,10,40,100,80.825,71.35,8.253281354904175
0.15,0.6,3,10,50,68.425,67.45,1.327953815460205
0.3,0.3,4,20,0,82.85,71.5,1.5586626529693604


In [6]:
def grid_search(parameters):
    """Run an exhaustive grid search of an XGBoost classifier, scored by F1.

    The search is fitted on the notebook-level ``X`` and ``Y`` (they are not
    arguments of this function), uses every available core (``n_jobs=-1``) and
    also records the training scores.

    Parameters
    ----------
    parameters : dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    base_model = xgboost.XGBClassifier(objective ='binary:logistic', booster = 'gbtree')
    search = GridSearchCV(
        base_model,
        parameters,
        scoring='f1',
        n_jobs=-1,
        return_train_score=True,
        verbose=1,
    )
    search.fit(X,Y)
    return search

In [7]:
def plot_grid_search(clf, parameters, x_param='K', curve_param='metric'):
    """Plot the mean cross-validation score of a two-parameter grid search.

    One curve is drawn per value of ``curve_param``, with the values of
    ``x_param`` on the x-axis.

    Each score is placed by reading the candidate's own entry in
    ``clf.cv_results_['params']``. Reshaping the flat score array directly is
    not reliable, because the order in which candidates are enumerated depends
    on the sorted parameter names rather than on the order used here.

    Parameters
    ----------
    clf : fitted search object exposing ``cv_results_`` with the keys
        ``'params'`` and ``'mean_test_score'``.
    parameters : dict
        The grid that was searched; must contain ``x_param`` and ``curve_param``.
    x_param : str, optional
        Grid key plotted on the x-axis (default ``'K'``).
    curve_param : str, optional
        Grid key that selects the curve (default ``'metric'``).

    Raises
    ------
    ValueError
        If the number of candidates differs from
        ``len(parameters[curve_param]) * len(parameters[x_param])``, i.e. the
        search varied something other than exactly these two parameters.

    Notes
    -----
    Expects ``matplotlib.pyplot`` to be available as ``plt`` in the notebook
    namespace. NOTE(review): no such import is visible in this notebook.
    """
    x_values = list(parameters[x_param])
    curve_values = list(parameters[curve_param])

    candidates = clf.cv_results_['params']
    mean_scores = clf.cv_results_['mean_test_score']

    expected = len(curve_values) * len(x_values)
    if len(candidates) != expected:
        raise ValueError(
            "grid search has %d candidates but %r x %r spans %d combinations"
            % (len(candidates), curve_param, x_param, expected)
        )

    # Rows follow curve_values, columns follow x_values; cells stay NaN if a
    # combination is missing from the results.
    scores_mean = np.full((len(curve_values), len(x_values)), np.nan)
    for candidate, score in zip(candidates, mean_scores):
        row = curve_values.index(candidate[curve_param])
        col = x_values.index(candidate[x_param])
        scores_mean[row, col] = score

    # Plot Grid search scores
    _, ax = plt.subplots(1,1, figsize=(20, 10))

    for idx, val in enumerate(curve_values):
        ax.plot(x_values, scores_mean[idx,:], '-o', label= curve_param + ': ' + str(val))

    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(x_param, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="best", fontsize=15)
    ax.grid(True)

In [15]:
def saveModel(clf, name):
    """Dump ``clf`` with joblib to ``../../Models/<name>.pkl``.

    Parameters
    ----------
    clf : object to serialise.
    name : str
        File name without extension; ``.pkl`` is appended.
    """
    target = "".join(("../../Models/", name, ".pkl"))
    joblib.dump(clf, target)

In [10]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid handed to grid_search(): 4 * 4 * 3 * 3 * 4 = 576 combinations.
parameters = {
    'learning_rate' : [0.01,0.2,0.4,0.9], 
    'colsample_bytree': [0.01,0.3,0.4,0.6],
    'max_depth': [5,9,12],
    'reg_alpha': [0,50,100],
    'n_estimators': [10,25,40,50]
}


# Expensive cell: the recorded run reports 2880 fits and about 92 minutes.
clf = grid_search(parameters)
# Displays the complete cv_results_ dictionary (one array entry per candidate).
clf.cv_results_ 

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   51.5s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 18.9min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 39.0min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed: 67.9min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 92.4min finished


{'mean_fit_time': array([  1.97065387,   1.96031594,   1.79910722,   2.58922844,
          2.57011857,   2.32763562,   3.20142446,   3.12878194,
          2.91485991,   3.49475207,   3.45403371,   3.3169765 ,
          2.03805785,   1.87535238,   1.93069558,   2.75424085,
          2.62703142,   2.43460207,   3.58398829,   3.39394627,
          3.18785815,   4.2417006 ,   3.92867861,   3.63931856,
          2.00514731,   1.95610609,   1.81447716,   2.92851944,
          2.75983448,   2.51359286,   3.97510695,   3.68075185,
          3.40618234,   4.81743469,   4.26180191,   3.96272917,
          1.66512527,   1.77743883,   1.75094123,   2.37503247,
          2.42062736,   2.29611526,   3.09083991,   3.03744369,
          2.87955971,   3.5104918 ,   3.44348211,   3.30484719,
          1.90485282,   1.84755673,   1.8232955 ,   2.75135841,
          2.54276538,   2.42592487,   3.52051892,   3.37840843,
          3.136905  ,   4.17883086,   3.85129509,   3.53527613,
          1.95562968,  

In [12]:
# Show the estimator configured with the best parameter combination found by the search.
clf.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=12,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=50, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [13]:
# Best mean cross-validated score; grid_search() scores candidates with 'f1'.
clf.best_score_

0.759400158071584

In [16]:
import joblib

# Persists the whole fitted GridSearchCV object (not only its best estimator)
# to ../../Models/XGB_LSA_1500.pkl.
saveModel(clf, "XGB_LSA_1500") 