In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from data_prep import create_x_y_data

In [2]:
# Load the train/test splits returned by data_prep.create_x_y_data, plus X and y
# (presumably the unsplit feature matrix and labels — confirm in data_prep).
X_train, y_train, X_test, y_test, X, y = create_x_y_data()

# Show the shape of the training and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(93, 1984)
(44, 1984)


In [3]:
# import models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import  RandomForestClassifier
from xgboost import XGBClassifier

# Using the ConsensusML class

## Overview:

1) Finds intersection of features deemed important by Lasso Regression, XGBoost, and Random Forest

2) For each model: tunes parameters with 5-fold CV -> fits the tuned model with 5-fold CV -> identifies features deemed "important" in all 5 folds

3) Runs a random forest Boruta test

4) Combines the discovered features in five different ways (Boruta only, Boruta + Intersection, Boruta + Lasso KF, Boruta + Lasso/XGB KF, Intersection + Lasso/XGB KF) then fits a model to see which features are best

In [4]:
from consensus import ConsensusML

In [6]:
# One seed shared by every stochastic estimator in this cell, so a
# Restart & Run All reproduces the same fits.
SEED = 8

# Base estimators handed to ConsensusML.
# The 'saga' solver shuffles the data while fitting, so the L1-penalised
# logistic regression is seeded as well (it previously had no random_state,
# unlike the two tree-based models).
lasso = LogisticRegression(penalty='l1', solver='saga', max_iter=10000, random_state=SEED)
rf = RandomForestClassifier(n_estimators=1000, max_depth=20, random_state=SEED, n_jobs=-1)
xgb = XGBClassifier(learning_rate=0.01, max_depth=3, n_estimators=700, random_state=SEED, n_jobs=-1)

# Build the consensus object from the three estimators and the full data.
# TODO (original author's note): this should just initialize, then go to cm.add_models
cm = ConsensusML(lasso, rf, xgb, X, y)

# Fit the models on the train/test split -> log loss / genes / total genes
model_dict = cm.feat_selection3(X_train, X_test, y_train, y_test)

# Find the intersection of the features selected by the fitted models
gene_intersection, lasso_weights, xgb_feature_importance = cm.feature_intersection_weights()

# Parameter grids for cross-validated tuning, listed in the order (lasso, xgb, rf)
param_list = [
    {'C': range(1, 4, 1)},
    {'max_depth': range(3, 7), 'n_estimators': range(200, 800, 200)},
    {'max_depth': range(3, 10)},
]

# Tune each model with its grid from param_list
cm.kfold_tune(param_list)

# Fit the tuned models on k-fold -> return weights / log loss
performances, weights = cm.kfold_weights()

# Run the Boruta test
cm.boruta_selection()

# Intersection of all previous tests
final_feats = cm.feature_consensus()

# Model used to compare the candidate feature combinations
model = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=SEED, n_jobs=-1)

# Find the best combination of consensus features.
# NOTE(review): in the recorded run this cell ended with
# "NameError: name 'np' is not defined" after best_combo had printed its first
# result; numpy is imported in this notebook, so the missing import is
# presumably inside the consensus module — confirm.
feature_performances, best_dataframe = cm.best_combo(model)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.7min finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   26.0s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:  4.6min finished


Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	1984
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	1984
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	1984
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	1984
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	1984
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	1984
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	1984
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	581
Rejected: 	1403


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	9 / 100
Confirmed: 	65
Tentative: 	516
Rejected: 	1403


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	10 / 100
Confirmed: 	65
Tentative: 	516
Rejected: 	1403


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	11 / 100
Confirmed: 	65
Tentative: 	516
Rejected: 	1403


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	12 / 100
Confirmed: 	71
Tentative: 	342
Rejected: 	1571


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	13 / 100
Confirmed: 	71
Tentative: 	342
Rejected: 	1571


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	14 / 100
Confirmed: 	71
Tentative: 	342
Rejected: 	1571


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	15 / 100
Confirmed: 	71
Tentative: 	342
Rejected: 	1571


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	16 / 100
Confirmed: 	75
Tentative: 	259
Rejected: 	1650


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	17 / 100
Confirmed: 	75
Tentative: 	259
Rejected: 	1650


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	18 / 100
Confirmed: 	75
Tentative: 	259
Rejected: 	1650


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	19 / 100
Confirmed: 	77
Tentative: 	218
Rejected: 	1689


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	20 / 100
Confirmed: 	77
Tentative: 	218
Rejected: 	1689


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	21 / 100
Confirmed: 	77
Tentative: 	218
Rejected: 	1689


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	22 / 100
Confirmed: 	77
Tentative: 	190
Rejected: 	1717


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	23 / 100
Confirmed: 	77
Tentative: 	190
Rejected: 	1717


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	24 / 100
Confirmed: 	77
Tentative: 	190
Rejected: 	1717


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	25 / 100
Confirmed: 	77
Tentative: 	190
Rejected: 	1717


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	26 / 100
Confirmed: 	77
Tentative: 	176
Rejected: 	1731


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	27 / 100
Confirmed: 	77
Tentative: 	176
Rejected: 	1731


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	28 / 100
Confirmed: 	77
Tentative: 	176
Rejected: 	1731


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	29 / 100
Confirmed: 	79
Tentative: 	160
Rejected: 	1745


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	30 / 100
Confirmed: 	79
Tentative: 	160
Rejected: 	1745


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	31 / 100
Confirmed: 	79
Tentative: 	160
Rejected: 	1745


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	32 / 100
Confirmed: 	79
Tentative: 	154
Rejected: 	1751


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	33 / 100
Confirmed: 	79
Tentative: 	154
Rejected: 	1751


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	34 / 100
Confirmed: 	80
Tentative: 	142
Rejected: 	1762


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	35 / 100
Confirmed: 	80
Tentative: 	142
Rejected: 	1762


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	36 / 100
Confirmed: 	80
Tentative: 	142
Rejected: 	1762


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	37 / 100
Confirmed: 	82
Tentative: 	133
Rejected: 	1769


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	38 / 100
Confirmed: 	82
Tentative: 	133
Rejected: 	1769


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	39 / 100
Confirmed: 	82
Tentative: 	133
Rejected: 	1769


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	40 / 100
Confirmed: 	84
Tentative: 	124
Rejected: 	1776


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	41 / 100
Confirmed: 	84
Tentative: 	124
Rejected: 	1776


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	42 / 100
Confirmed: 	84
Tentative: 	124
Rejected: 	1776


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	43 / 100
Confirmed: 	84
Tentative: 	124
Rejected: 	1776


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	44 / 100
Confirmed: 	84
Tentative: 	121
Rejected: 	1779


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	45 / 100
Confirmed: 	84
Tentative: 	121
Rejected: 	1779


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	46 / 100
Confirmed: 	84
Tentative: 	116
Rejected: 	1784


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	47 / 100
Confirmed: 	84
Tentative: 	116
Rejected: 	1784


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	48 / 100
Confirmed: 	84
Tentative: 	116
Rejected: 	1784


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	49 / 100
Confirmed: 	84
Tentative: 	111
Rejected: 	1789


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	50 / 100
Confirmed: 	84
Tentative: 	111
Rejected: 	1789


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	51 / 100
Confirmed: 	84
Tentative: 	105
Rejected: 	1795


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	52 / 100
Confirmed: 	84
Tentative: 	105
Rejected: 	1795


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	53 / 100
Confirmed: 	84
Tentative: 	105
Rejected: 	1795


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	54 / 100
Confirmed: 	84
Tentative: 	103
Rejected: 	1797


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	55 / 100
Confirmed: 	84
Tentative: 	103
Rejected: 	1797


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	56 / 100
Confirmed: 	84
Tentative: 	103
Rejected: 	1797


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	57 / 100
Confirmed: 	84
Tentative: 	100
Rejected: 	1800


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	58 / 100
Confirmed: 	84
Tentative: 	100
Rejected: 	1800


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	59 / 100
Confirmed: 	84
Tentative: 	95
Rejected: 	1805


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	60 / 100
Confirmed: 	84
Tentative: 	95
Rejected: 	1805


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	61 / 100
Confirmed: 	84
Tentative: 	95
Rejected: 	1805


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	62 / 100
Confirmed: 	84
Tentative: 	94
Rejected: 	1806


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	63 / 100
Confirmed: 	84
Tentative: 	94
Rejected: 	1806


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	64 / 100
Confirmed: 	84
Tentative: 	94
Rejected: 	1806


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	65 / 100
Confirmed: 	84
Tentative: 	91
Rejected: 	1809


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	66 / 100
Confirmed: 	84
Tentative: 	91
Rejected: 	1809


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	67 / 100
Confirmed: 	84
Tentative: 	82
Rejected: 	1818


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	68 / 100
Confirmed: 	84
Tentative: 	82
Rejected: 	1818


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	69 / 100
Confirmed: 	84
Tentative: 	82
Rejected: 	1818


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	70 / 100
Confirmed: 	84
Tentative: 	80
Rejected: 	1820


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	71 / 100
Confirmed: 	84
Tentative: 	80
Rejected: 	1820


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	72 / 100
Confirmed: 	84
Tentative: 	77
Rejected: 	1823


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	73 / 100
Confirmed: 	84
Tentative: 	77
Rejected: 	1823


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	74 / 100
Confirmed: 	84
Tentative: 	77
Rejected: 	1823


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	75 / 100
Confirmed: 	84
Tentative: 	73
Rejected: 	1827


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	76 / 100
Confirmed: 	84
Tentative: 	73
Rejected: 	1827


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	77 / 100
Confirmed: 	84
Tentative: 	71
Rejected: 	1829


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	78 / 100
Confirmed: 	84
Tentative: 	71
Rejected: 	1829


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	79 / 100
Confirmed: 	84
Tentative: 	71
Rejected: 	1829


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	80 / 100
Confirmed: 	84
Tentative: 	68
Rejected: 	1832


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	81 / 100
Confirmed: 	84
Tentative: 	68
Rejected: 	1832


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	82 / 100
Confirmed: 	84
Tentative: 	68
Rejected: 	1832


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	83 / 100
Confirmed: 	84
Tentative: 	67
Rejected: 	1833


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	84 / 100
Confirmed: 	84
Tentative: 	67
Rejected: 	1833


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	85 / 100
Confirmed: 	84
Tentative: 	67
Rejected: 	1833


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	86 / 100
Confirmed: 	84
Tentative: 	67
Rejected: 	1833


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	87 / 100
Confirmed: 	84
Tentative: 	67
Rejected: 	1833


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	88 / 100
Confirmed: 	84
Tentative: 	66
Rejected: 	1834


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	89 / 100
Confirmed: 	84
Tentative: 	66
Rejected: 	1834


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	90 / 100
Confirmed: 	84
Tentative: 	62
Rejected: 	1838


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	91 / 100
Confirmed: 	84
Tentative: 	62
Rejected: 	1838


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	92 / 100
Confirmed: 	84
Tentative: 	62
Rejected: 	1838


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	93 / 100
Confirmed: 	85
Tentative: 	60
Rejected: 	1839


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	94 / 100
Confirmed: 	85
Tentative: 	60
Rejected: 	1839


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	95 / 100
Confirmed: 	85
Tentative: 	60
Rejected: 	1839


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	96 / 100
Confirmed: 	85
Tentative: 	60
Rejected: 	1839


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	97 / 100
Confirmed: 	85
Tentative: 	60
Rejected: 	1839


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	98 / 100
Confirmed: 	85
Tentative: 	59
Rejected: 	1840


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	99 / 100
Confirmed: 	85
Tentative: 	59
Rejected: 	1840


BorutaPy finished running.

Iteration: 	100 / 100
Confirmed: 	85
Tentative: 	14
Rejected: 	1840
(137, 85)
[('Boruta', 0.2320984242668982)]


NameError: name 'np' is not defined

In [8]:
# Evaluate the consensus feature combinations with the comparison model and
# keep both returned values.
(feature_performances, best_dataframe) = cm.best_combo(model)

[('Boruta', 0.17996442482125813)]


In [12]:
# Display the per-combination results returned by cm.best_combo
# (each entry holds a 'Log Loss' value and a 'Model Weights' array).
feature_performances

{'Boruta': {'Log Loss': 0.17996442482125813,
  'Model Weights': array([0.00316054, 0.02037491, 0.00329472, 0.00359504, 0.00146195,
         0.01374504, 0.0074932 , 0.01502901, 0.00640196, 0.00302314,
         0.03476624, 0.00764533, 0.06997857, 0.00773679, 0.00834866,
         0.00320989, 0.03149242, 0.05610372, 0.00709896, 0.02344204,
         0.00195659, 0.03590717, 0.01051769, 0.00507905, 0.0110827 ,
         0.0046177 , 0.00638565, 0.00554783, 0.03865334, 0.00219166,
         0.02310788, 0.00239922, 0.00483039, 0.00576532, 0.01214881,
         0.00641859, 0.03958241, 0.03898611, 0.01622602, 0.00417117,
         0.01505853, 0.00725237, 0.01098657, 0.00360894, 0.00935465,
         0.01016532, 0.05718348, 0.00562912, 0.00926734, 0.05008643,
         0.00418294, 0.01508668, 0.006296  , 0.01965817, 0.01536025,
         0.04671163, 0.00563171, 0.00992244, 0.01120009, 0.06483991,
         0.01412703, 0.00281009, 0.00260289])}}