### Doublet Classifier

In [93]:
import numpy as np
import json
from collections import OrderedDict
from scipy.spatial import distance
from numpy.linalg import eigh
from matplotlib import pyplot as pl
import sklearn.metrics as metrics
import xgboost as xgb
import numpy.random as random
import sys
import pandas as pd
np.random.seed(132)

In [102]:
def run_xgboost(dtrain, dtest, labels_train, labels_test):
    xg_params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric" : "logloss",
        "eta": random.uniform(0.01, 0.3),
        "max_depth": random.randint(2, 4),
        "subsample": random.uniform(0.5, 0.95),
        "colsample_bytree": random.uniform(0.5, 0.95),
        "silent": 1,
        "seed": 0,
        "nthread" : 5
   }
    num_boost_round = 1000
    early_stopping_rounds = 25  
    evallist = [(dtest, 'test')]
    
    bst = xgb.train(xg_params, dtrain, 1000, evals=evallist, early_stopping_rounds = 20, verbose_eval=False)    
    log_loss = bst.best_score
    
    prediction = bst.predict(dtrain, ntree_limit = bst.best_ntree_limit)
    # print('   log_loss_train', logloss(prediction, labels_train))
    print('   auc_train', auc(prediction, labels_train))
    prediction = bst.predict(dtest, ntree_limit = bst.best_ntree_limit)
    # print('   log_loss_test', logloss(prediction, labels_test))
    print('   auc_test', auc(prediction, labels_test))
    
    return log_loss, xg_params, bst

def logloss(prediction, label):
    eps = 1e-7
    prediction = np.maximum(np.minimum(prediction, 1-eps), eps)
    return -np.mean(label*np.log(prediction) + (1-label)*np.log(1-prediction))

def precision_recall(prediction, label):
    precision = metrics.precision_score(label, prediction)
    recall = metrics.precision_score(label, prediction)    
    return precision, recall

def auc(prediction, label):
    fpr, tpr, thresholds = metrics.roc_curve(label, prediction)
    return metrics.auc(fpr, tpr)

def sort_and_reverse_1Darray(array):
    return np.sort(array)[::-1]

def run(X_train, labels_train, X_test,  labels_test,
        bin_size=0.05, kernel_width=0.2, xg_max_depth=2, xg_eta=1, xg_num_round=2):

    dtrain = xgb.DMatrix(X_train.copy(), label=labels_train.copy())
    dtest  = xgb.DMatrix(X_test.copy(), label=labels_test.copy())   
    
    scores_params_bsts = []
    for i in range(5):
        score_params_bst = run_xgboost(dtrain, dtest, labels_train, labels_test)
        scores_params_bsts.append(score_params_bst)
    
    sorted_by_score = sorted(scores_params_bsts, key=lambda tup: tup[0])
    print("best_score: ", sorted_by_score[0][0])
    print("best params: ", sorted_by_score[0][1])
    best_bst = sorted_by_score[0][2]

    print('-----------------------')
    predictions_test = best_bst.predict(dtest, ntree_limit = best_bst.best_ntree_limit)
    logloss_test = logloss(np.array(predictions_test), labels_test)
    print('log_loss_test', logloss(predictions_test, labels_test))
    print('auc_test', auc(predictions_test, labels_test))    

    print('-----------------------')
    # predict holdout
#     sample_submission = pd.read_csv(sample_submission_lst_path)
#     dpred  = xgb.DMatrix(data['ho']['all_candidates_scores'].copy())
#     predictions_sub = best_bst.predict(dpred, ntree_limit = best_bst.best_ntree_limit)
#     for patient in sample_submission['id'].values.tolist():
#         sample_submission['cancer'][sample_submission['id']==patient] = float(predictions_sub[data['ho']['patients']==patient][0])
#     sample_submission.to_csv('submission.csv',index=False,columns=['id','cancer'])

In [103]:
df1 = pd.read_csv('shared_scratch/group6/feature/rahul.dataset1_train.txt', delimiter=' ')
print(len(df1))
df1.head()

5020


Unnamed: 0,rUMI,V2,cs,prob
AAACGGGTCATATCGG,1.019075,singlet,15,0.886161
AGGCCGTCAGCTGTGC,0.703065,singlet,4,0.961916
GTCATTTAGTTGAGAT,0.757705,singlet,0,0.965773
ATCGAGTGTGTTCTTT,1.446094,singlet,9,0.828168
CGAGCACGTAACGACG,3.659541,doublet,41,0.013753


In [104]:
df2 = pd.read_csv('shared_scratch/group6/feature/dataset1_sat_stats.csv')
df2.set_index('sampleID', inplace=True)
print(len(df2))
df2.head()

10000


Unnamed: 0_level_0,umiCount,geneCount,meanReadsPerUmi,expUmiCount,expGeneCount,umiSatRatio,geneSatRatio,targetReads,fractionUniqueUmi,expUmiFitA,expUmiFitB,expUmiFitC,expGeneFitA,expGeneFitB,expGeneFitC
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AAACCTGAGAAGGCCT,1783,753,1.756029,2265.296833,861.992728,0.787093,0.873557,8532,0.0129,2274.373239,2122.41729,-9.076406,829.964259,1270.972111,32.028469
AAACCTGAGAGCTGCA,3377,1194,1.799526,4354.730608,1362.173068,0.775479,0.876541,16573,0.021321,4373.884792,4120.614022,-19.154184,1306.437278,2570.864732,55.735789
AAACCTGAGAGTCTGG,3313,1233,1.797766,4260.179059,1404.559014,0.777667,0.877856,16304,0.015092,4276.660072,4056.343393,-16.481013,1353.897461,2450.735771,50.661553
AAACCTGAGGATGCGT,3349,1255,1.799343,4318.817952,1436.70639,0.775444,0.873526,16458,0.016721,4340.772977,4088.633152,-21.955026,1379.255812,2587.925511,57.450578
AAACCTGAGTTCGCAT,1554,592,1.790219,1956.861375,670.377751,0.794129,0.883084,7272,0.019305,1966.580495,1806.829664,-9.719119,634.650303,1124.116,35.727448


In [105]:
df2_test = df2.loc[df1.index, :]

In [111]:
df = df1
df[df2.columns] = df2[df2.columns]

In [112]:
features = ['rUMI', 'cs'] + list(df2.columns)
print(features)

['rUMI', 'cs', 'umiCount', 'geneCount', 'meanReadsPerUmi', 'expUmiCount', 'expGeneCount', 'umiSatRatio', 'geneSatRatio', 'targetReads', 'fractionUniqueUmi', 'expUmiFitA', 'expUmiFitB', 'expUmiFitC', 'expGeneFitA', 'expGeneFitB', 'expGeneFitC']


In [113]:
training_size = int(0.8 * len(df))
split_indices = np.zeros(len(df), dtype=bool)
split_indices[np.random.choice(len(df), size=training_size, replace=False)] = True

X_train = df[features].values[split_indices]
labels_train = df[['V2']].values[split_indices] == 'doublet'
X_validation = df[features].values[~split_indices]
labels_validation = df[['V2']].values[~split_indices] == 'doublet'

In [114]:
run(X_train, labels_train, X_validation,  labels_validation)

   auc_train 0.95505225263
   auc_test 0.91300643918
   auc_train 0.950656808063
   auc_test 0.911772654634
   auc_train 0.967806013403
   auc_test 0.914766050949
   auc_train 0.948564598246
   auc_test 0.907839231998
   auc_train 0.949136977544
   auc_test 0.912084038353
best_score:  0.271063
best params:  {'eta': 0.15123781285502755, 'nthread': 5, 'seed': 0, 'max_depth': 3, 'eval_metric': 'logloss', 'subsample': 0.6013568595703785, 'booster': 'gbtree', 'silent': 1, 'colsample_bytree': 0.8506024688369762, 'objective': 'binary:logistic'}
-----------------------
log_loss_test 0.875266518681
auc_test 0.91300643918
-----------------------
