In [1]:
import os
import pickle
from datetime import datetime
from itertools import product

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RandomizedSearchCV

from config import rand_var,ml_dir,ml_model_filename,ensemble_dir
from loaddata import defineTestSet,defineResponse,defineFeatures,defineTrainingSets,defineSplits
from tools.model_roc import get_model_roc

In [2]:
# --- Run configuration -------------------------------------------------
whichFeats = 'chemo'  # name of the feature set handed to defineFeatures
her2 = 0              # forwarded as her2= to every loaddata helper (presumably a HER2-status selector; confirm in loaddata)
rcut = 1              # NOTE(review): not referenced anywhere else in the visible cells

feats = defineFeatures(whichFeats, her2=her2)

# --- Load the dictionary of models -------------------------------------
# ml_dict is indexed by model name in later cells.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
model_path = ml_dir + ml_model_filename
with open(model_path, 'rb') as model_file:
    ml_dict = pickle.load(model_file)

In [3]:


# Read the training table from disk.
training_csv = 'inputs/training_df.csv'
df_train = pd.read_csv(training_csv)

# Split the table into the feature matrix and the targets returned by loaddata.
(
    Xtrain,
    ytrainCateg,
    ytrainScore,
    ytrainID,
) = defineTrainingSets(df_train, feats, her2=her2)

# Cross-validation splits, built from the features and the categorical target;
# reused below as the `cv` argument of the weight search and in the fold loop.
splits = defineSplits(Xtrain, ytrainCateg)

# 'pCR' response column, used as the target when fitting the ensemble.
ytrain_pCR = defineResponse(df_train, 'pCR', her2=her2)

In [14]:
# Names of the base models that take part in the voting ensemble.
# Each name is used both as the estimator label and as the ml_dict key, so the
# two can no longer drift apart. The original cell labelled an entry
# 'Gradient Boosting' but looked up ml_dict['Support Vector Classifier'],
# which silently put the SVC in the ensemble twice.
# NOTE(review): assumes ml_dict has a 'Gradient Boosting' key — confirm against
# the pickled dictionary; a missing key now fails loudly with KeyError.
ensemble_member_names = [
    'Logistic Regression',
    'Random Forest Classifier',
    'Support Vector Classifier',
    'Gradient Boosting',
    'Gaussean Naive Bayes',
    'Adaptive Boosting Classifier',
    'k-Nearest Neighbors',
]

# (label, estimator) pairs in the format VotingClassifier expects.
used_model_list = [(name, ml_dict[name]) for name in ensemble_member_names]

# Candidate integer weights per model. max_weight is EXCLUSIVE (range
# semantics), so the candidate values are min_weight .. max_weight - 1.
min_weight = 0
max_weight = 3

weights_of_models = range(min_weight, max_weight, 1)

# Every weight vector with one entry per ensemble member. product() iterates
# with the last position varying fastest, i.e. the same order as one nested
# `for` clause per model, so the candidate order seen by the search is unchanged.
possible_combinations = [
    list(combo)
    for combo in product(weights_of_models, repeat=len(used_model_list))
]

# Drop the all-zero vector: an ensemble in which no model has any weight is meaningless.
filtered_combinations = [combo for combo in possible_combinations if any(combo)]

# Search space for the weight search: only the voting weights are tuned.
param_grid = {
    'weights': filtered_combinations,
}

In [15]:
# Soft-voting ensemble over the selected base models; its weights are tuned below.
ensemble_model_weighted = VotingClassifier(estimators=used_model_list, voting='soft')

# Settings for the randomized search over candidate weight vectors,
# scored by ROC AUC on the predefined splits.
search_options = dict(
    cv=splits,
    scoring='roc_auc',
    return_train_score=True,
    n_jobs=-1,
    verbose=0,
    n_iter=1500,
    random_state=rand_var,
)
ensemble_search = RandomizedSearchCV(ensemble_model_weighted, param_grid, **search_options)
ensemble_search.fit(Xtrain, ytrain_pCR)

# Keep the refitted winner and the weight vector that produced it.
best_ensemble_model = ensemble_search.best_estimator_
ensemble_model_weight = ensemble_search.best_params_['weights']

print(ensemble_model_weight)



In [18]:


# Persist the tuned ensemble with a descriptive, timestamped file name.
# Make sure the output directory exists before writing into it.
os.makedirs(ensemble_dir, exist_ok=True)

tm = datetime.now()

# Build the path from ensemble_dir, i.e. the directory that was just created.
# The original hard-coded "ensemble_models_made/", which only works when
# ensemble_dir happens to point at that same folder.
# NOTE(review): Modelnum records len(ml_dict), the number of models in the
# loaded dictionary, not the number of models used in the ensemble — confirm
# that this is intended.
model_basename = "Modelnum_{}_random_{}_Feats_{}_Date_{}_{}_{}_{}.p".format(
    len(ml_dict),
    rand_var,
    whichFeats,
    tm.year,
    tm.month,
    tm.day,
    tm.strftime("%H_%M_%S"),
)
filename = os.path.join(ensemble_dir, model_basename)

with open(filename, 'wb') as w:
    pickle.dump(best_ensemble_model, w)

In [4]:
# Prepare the held-out test data, one cohort at a time.
# NOTE(review): the CSV names say her2pos / her2neg while the variables say
# pCR_pos / pCR_neg — confirm which split these frames actually represent.

# First cohort: features and 'pCR' response.
df_test_pCR_pos = pd.read_csv('inputs//testing_her2pos_df.csv')
x_test_pCR_pos = defineTestSet(df_test_pCR_pos, feats, her2=her2)
y_test_pCR_pos = defineResponse(df_test_pCR_pos, 'pCR', her2=her2)

# Second cohort: features and 'pCR' response.
df_test_pCR_neg = pd.read_csv('inputs//testing_her2neg_df.csv')
x_test_pCR_neg = defineTestSet(df_test_pCR_neg, feats, her2=her2)
y_test_pCR_neg = defineResponse(df_test_pCR_neg, 'pCR', her2=her2)

# Combined response: first cohort followed by second cohort.
y_test_comb = pd.concat((y_test_pCR_pos, y_test_pCR_neg))



In [5]:
# ROC results from get_model_roc, given both test feature sets, the combined
# response and the dictionary of models.
model_roc_results = get_model_roc(
    x_test_pCR_pos,
    x_test_pCR_neg,
    y_test_comb,
    ml_dict,
)



    

In [11]:


# Per-fold sanity check: fit two base models and the tuned ensemble on the
# training part of a split and predict on its held-out part.

# `ensemble_model` was never defined in this notebook, so the original cell
# raised NameError on a fresh kernel. Use an unfitted copy of the tuned
# ensemble: it keeps the selected weights, and fitting it on a single fold
# does not overwrite the fitted `best_ensemble_model`.
ensemble_model = clone(best_ensemble_model)

for i, (tr, ts) in enumerate(splits):

    # Training and held-out rows of this split (positional indices).
    xtrain_tr = Xtrain.iloc[tr, :]
    ytrain_tr = ytrain_pCR.iloc[tr]
    xtrain_ts = Xtrain.iloc[ts, :]

    # NOTE(review): these fit() calls refit the estimators stored in ml_dict
    # in place, on one fold only; anything that uses ml_dict after this cell
    # sees the fold-trained models.
    ml_dict["Logistic Regression"].fit(xtrain_tr, ytrain_tr)
    ml_dict["Random Forest Classifier"].fit(xtrain_tr, ytrain_tr)
    ensemble_model.fit(xtrain_tr, ytrain_tr)

    y_pred_logres = ml_dict["Logistic Regression"].predict(xtrain_ts)
    # Despite the `_gb` suffix, these are the Random Forest predictions.
    y_pred_gb = ml_dict["Random Forest Classifier"].predict(xtrain_ts)

    # TODO: predict with ensemble_model on xtrain_ts and compute a per-fold
    # ROC AUC against ytrain_pCR.iloc[ts] (needs roc_curve / auc to be imported).

    # Only the first split is evaluated.
    break


