In [11]:
args =  ["classifier.py","/Users/leahbriscoe/Documents/MicroBatch/microbatch_vc", "CRC_k6&CRC_k7", "kmer",\
         "BatchCorrected",  "bin_crc_normal",1,0,1,1] 


In [62]:
import sys
import pandas as pd
import utils
import numpy as np
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection 
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score
import statsmodels.formula.api as sm
from sklearn.metrics import roc_auc_score
from collections import Counter



In [58]:
greater_folder = args[1] # what folder do you save your different datasets in
study_names = args[2].split("&")  # what is the name of the dataset (tells the program which folder to check)
data_type = args[3] # type of data. kmer vs OTU

prefix_name = args[4] # what is the prefix of the file name
column_of_interest = args[5] # what is the phenotype you are predicting (use the same name in the column of the metadata you want to predict), this programs reads from metadata.txt

norm_input = bool(int(args[6]))
map_with_accession = bool(int(args[7]))

if len(args) > 5:
    label_pos_or_neg = int(args[8]) # do you want to treat CRC as positive class or negative class? 
    target_label = args[9] # phenotype representing positive class or negative class? eg. CRC eg. H
    print(target_label)
else:
    label_pos_or_neg = 1
    target_label = 1
use_domain_pheno = False # for when running raw to compare to domain pheno
data_folders = [greater_folder + "/data/" + study_name + "/" for study_name in study_names]   

num_pcs = 20

rf_params = ['criterion','max_features','min_samples_leaf', 'n_estimators']

1


In [14]:
#########################################################################
###### COMMENTARY: load data from your k-mer matrix, load metadata ######
#########################################################################

feature_table_dict = utils.load_feature_table(data_folders,data_type = data_type)
metadata = pd.read_csv(data_folders[0] + "metadata.txt",delimiter="\t")

if norm_input:
    for d in range(len(study_names)):
        #feature_table_dict[d] = normalize(np.array(feature_table_dict[d].transpose()), axis = 1, norm = 'l1')
        temp = pd.DataFrame(normalize(feature_table_dict[d].transpose(), axis = 1, norm = 'l1').transpose())
        temp.index = feature_table_dict[d].index
        temp.columns = feature_table_dict[d].columns
        feature_table_dict[d] = temp



In [15]:
if "AGP" in study_names[0]:
    for d in range(len(study_names)):
        tissue_samples = metadata.index[metadata['body_habitat.x'] == "UBERON:feces"]

        feature_table_dict[d] = feature_table_dict[d][tissue_samples]
        metadata = metadata.loc[tissue_samples]



In [16]:
labels = metadata[column_of_interest]


In [32]:
def pca_regression(y,X):
    model = sm.OLS(y,X)
    results = model.fit()
    predictedValues = results.predict()
    residuals = y - predictedValues
    return(residuals)

def RF_grid_search(data,labels,param_dict):
    rf = RandomForestClassifier()
    clf = GridSearchCV(rf, param_dict,scoring="roc_auc")
    clf.fit(data, labels)

    print("Best parameters set found on development set:")
    best_params = clf.best_params_)
    return clf,best_params

def RF_cv(data,labels,param_dict):
    clf = RandomForestClassifier(max_depth=5, random_state=0,n_estimators = param_dict['n_estimators'],\
            criterion = param_dict['criterion'],min_samples_leaf = param_dict['min_samples_leaf'],\
                           max_features = param_dict['max_features'])
    results = cross_val_score(clf,X=data,y=labels,scoring="roc_auc")
    return clf
   

In [18]:
# outline
# for data_table:
    # test train split
    # for train
    # for PC number:
        # regress out PCs
        # for grid cell:
            # get accuracy:
        # get max accuracy
    # get max accuracy across PCs
    # for test
    # get the test accuracy with best parameters
    # save best grid and best PCs

In [19]:
# get PC scores
pc_table_dict = dict()
feature_table_np = dict()
labels_np = dict()
pca = PCA(n_components=num_pcs,svd_solver='randomized')
for d in range(len(study_names)):
    temp = feature_table_dict[d].transpose()
    pca.fit(temp)
    pc_table_dict[d] = pca.transform(temp)
    
    feature_table_np[d] = np.array(feature_table_dict[0])
    labels_np[d] = np.array(labels)
    
  


In [20]:
n_splits = 2
n_repeats = 1
rskf = model_selection.RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=123)
parameter_dict = {'n_estimators':[100, 1000],'criterion': ['entropy','gini'],'min_samples_leaf': [5],\
                 'max_features':[0.3]}    

In [77]:
# Regress out PCs

# For each dataset (kmer size)
for d in range(1):#len(study_names)):
    
    results_dict = dict()
    X = feature_table_np[d].transpose()
    y = labels_np[d]
    pc_scores = pc_table_dict[d] # get pc scores
    na_mask = pd.isna(y)
    
    X = X[~na_mask,:]
    y = y[~na_mask]
    pc_scores = pc_scores[~na_mask,:]
    # for each test train split in 5 fold cross validation
    for train_index, test_index in rskf.split(X, y):
        print(train_index)
        print(test_index)
        
        X_train, X_test = X[train_index,], X[test_index,]
        y_train, y_test = y[train_index], y[test_index]
        pc_scores_train, pc_scores_test =  pc_scores[train_index], pc_scores[test_index]
        
        # for each PC we regress out 
        for p in range(0,num_pcs):
            if p == 0:
                X_train_corrected = X_train
            else:
                X_train_corrected = pca_regression(X_train,pc_scores_train[:,0:p])
            
            # perform grid search on train
            best_train_model, best_params = RF_grid_search(X_train_corrected, y_train,parameter_dict)
            y_train_pred_prob = best_train_model.predict_proba(X_train_corrected)
            results_dict['train_auc'] = roc_auc_score(y_true = y_train, y_score = y_train_pred_prob[:,1])
            
            # perform cv result on test with best param
            #best_train_model, best_params = RF_grid_search(X_train_corrected, y_train,parameter_dict)
            #y_train_pred_prob = best_train_model.predict_proba(X_train_corrected)
            
            #results_dict['test_auc'] = roc_auc_score(y_true = y_test, y_score = y_test_pred_prob[:,1])


        
        
        
    

[  0   1   2   3   4   6   7  12  14  19  22  23  25  29  30  31  32  33
  36  39  42  44  45  46  47  48  52  55  57  62  64  65  70  71  72  73
  76  78  79  80  81  82  84  88  89  90  91  92  93  97  98  99 100 101
 102 103 106 107 108 109 112 118 119 121 123 124 125 126 128 135 136 138
 143 144 146 149 151 153 157 160 161 162 170 171 174 177 178 180 182 184
 185 188 189 191 193 194 195 196 198 200 203 204 205 207 208 209 210 212
 214 215 216 217 218 220 221 222 228 229 231 232 233 237 240 241 242 243
 246 247 251 253 256 257 258 261 262 263 264 270 271 275 278 279 281 282
 283 284 285 288 295 297 301 303 307 309 311 313 314 316 318 319 320 322
 323 324 325 327 328 330 331 332 335 336 337 339 342 343 345 346 349 350
 354 359 360 361 364 365 370 371 373 376 381 382 385 386 388 391 394 396
 398 402 403 408 412 413 418 419 421 422 423 425 426 431 433 434 436 438
 439 444 445 446 448 449 450 453 454 455 457 459 461 462 463 464]
[  5   8   9  10  11  13  15  16  17  18  20  21  24  26  

NameError: name 'RF_grid_search' is not defined

In [66]:
clf = RandomForestClassifier(max_depth=5, random_state=0,n_estimators = param_dict['n_estimators'],\
            criterion = param_dict['criterion'],min_samples_leaf = param_dict['min_samples_leaf'],\
                           max_features = param_dict['max_features'])
    results = cross_val_score(clf,X=data,y=labels,scoring="roc_auc")



Counter({1.0: 99, 0.0: 134})

In [74]:
y_train_pred[0:5]

array([1., 1., 1., 1., 1.])

In [65]:
Counter(y_train_pred)

Counter({1.0: 98, 0.0: 135})

ValueError: bad input shape (233, 2)