In [37]:
import numpy as np
import pandas as pd
import math
import random
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# preprocessing
import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve, ShuffleSplit
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score, confusion_matrix, explained_variance_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE, chi2
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings("ignore")

## IMPORT DATASET

In [38]:
df1 = pd.read_csv("chipVariantCalling_run1.tsv", sep='\t')
df2 = pd.read_csv("chipVariantCalling_run2.tsv", sep='\t')

df1

Unnamed: 0,d.barcode,DP,VD,AF,HIAF,IMPACT,SYMBOL,loci,sampleTimePt,gender,MSID,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl
0,4010289633,7281,26,0.0036,0.0033,MODERATE,GNB1,chr1:1747196_T/C,Baseline,Male,MS2083,2:2,3644:3596,12:14,34.0,1.182210,CHIP
1,4010289633,7281,26,0.0036,0.0033,MODERATE,GNB1,chr1:1747196_T/C,Baseline,Male,MS2083,2:2,3644:3596,12:14,34.0,1.182210,CHIP
2,4010289633,7281,26,0.0036,0.0033,MODERATE,GNB1,chr1:1747196_T/C,Baseline,Male,MS2083,2:2,3644:3596,12:14,34.0,1.182210,CHIP
3,4010289633,7281,26,0.0036,0.0033,MODERATE,GNB1,chr1:1747196_T/C,Baseline,Male,MS2083,2:2,3644:3596,12:14,34.0,1.182210,CHIP
4,4010289633,7282,29,0.0040,0.0036,MODERATE,GNB1,chr1:1747250_T/C,Baseline,Male,MS2083,2:2,3639:3606,15:14,32.7,1.061729,CHIP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1137944,4010290016,731,2,0.0027,0.0028,MODERATE,ZFY,chrY:2848011_C/T,Blank,Blank,,2:2,371:357,1:1,31.0,1.039170,
1137945,4010290016,731,2,0.0027,0.0028,MODERATE,ZFY,chrY:2848029_C/T,Blank,Blank,,2:2,367:359,1:1,37.0,1.022260,
1137946,4010290016,731,2,0.0027,0.0028,MODERATE,ZFY,chrY:2848029_C/T,Blank,Blank,,2:2,367:359,1:1,37.0,1.022260,
1137947,4010290016,731,2,0.0027,0.0028,MODERATE,ZFY,chrY:2848029_C/T,Blank,Blank,,2:2,367:359,1:1,37.0,1.022260,


In [39]:
unknown1 = df1[(df1.chipOrControl == "Unknown")]
unknown2 = df2[(df2.chipOrControl == "Unknown")]

In [40]:
df1 = df1[(df1.chipOrControl != "Blank") & (df1.chipOrControl != "Unknown")]
df1 = df1.dropna(subset=['chipOrControl'])
df1.drop(['MSID', 'd.barcode'], axis=1, inplace=True)
# df.drop(['sampleTimePt'], axis = 1, inplace=True)

df2 = df2[(df2.chipOrControl != "Blank") & (df2.chipOrControl != "Unknown")]
df2 = df2.dropna(subset=['chipOrControl'])
df2.drop(['MSID', 'd.barcode'], axis=1, inplace=True)
# df2.drop(['sampleTimePt'], axis = 1, inplace=True)

In [41]:
genes = df1.SYMBOL.unique()
genes2 = df2.SYMBOL.unique()

### Columns `BIAS`, `REFBIAS` and `VARBIAS` are strings. Change to floats.

In [42]:
def ratio_to_int(string):
    a, b = string.split(":")
    if int(b) == 0:
        return 0
    else:
        return int(a) / int(b)

In [43]:
# for dataset 1
bias = []
refbias = []
varbias = []

for ratio in df1.BIAS.array:
    bias.append(ratio_to_int(ratio)) 

for ratio in df1.REFBIAS.array:
    refbias.append(ratio_to_int(ratio)) 

for ratio in df1.VARBIAS.array:
    varbias.append(ratio_to_int(ratio)) 
    
bias = pd.Series(bias)
refbias = pd.Series(refbias)
varbias = pd.Series(varbias)

df1['BIAS'] = bias.values
df1['REFBIAS'] = refbias.values
df1['VARBIAS'] = varbias.values

In [44]:
# for dataset 2
bias = []
refbias = []
varbias = []

for ratio in df2.BIAS.array:
    bias.append(ratio_to_int(ratio)) 

for ratio in df2.REFBIAS.array:
    refbias.append(ratio_to_int(ratio)) 

for ratio in df2.VARBIAS.array:
    varbias.append(ratio_to_int(ratio)) 
    
bias = pd.Series(bias)
refbias = pd.Series(refbias)
varbias = pd.Series(varbias)

df2['BIAS'] = bias.values
df2['REFBIAS'] = refbias.values
df2['VARBIAS'] = varbias.values

In [45]:
objs = {}
lst = []
for i in df1.columns:
    if df1.dtypes[i] == object:
        if len(df1[f"{i}"].unique()) <= 100:
            objs[i] = len(df1[f"{i}"].unique())
            lst.append(i)

In [46]:
objs = {}
lst = []
for i in df2.columns:
    if df2.dtypes[i] == object:
        if len(df2[f"{i}"].unique()) <= 100:
            objs[i] = len(df2[f"{i}"].unique())
            lst.append(i)

In [47]:
for i in lst:
    k = i
    dict = {}
    df_new = df1
    for ix, i in zip(range(len(df_new[i].unique())), df_new[i].unique() ):
        dict[i] = ix
    df1 = df1.replace({f"{k}": dict})
    df1[f"{k}"] = df1[f"{k}"].astype(str).astype(float)

In [48]:
for i in lst:
    k = i
    dict = {}
    df2_new = df2
    for ix, i in zip(range(len(df2_new[i].unique())), df2_new[i].unique() ):
        dict[i] = ix
    df2 = df2.replace({f"{k}": dict})
    df2[f"{k}"] = df2[f"{k}"].astype(str).astype(float)

In [49]:
# LOCI not required in final model
df1.drop(['loci'], axis=1, inplace=True)
df2.drop(['loci'], axis=1, inplace=True)

In [50]:
df1.drop_duplicates(inplace=True)
df2.drop_duplicates(inplace=True)

### `REFBIAS` Outliers

In [51]:
df1['REFBIAS_naturalLog'] = np.log(df1['REFBIAS'])
df2['REFBIAS_naturalLog'] = np.log(df2['REFBIAS'])
df1

Unnamed: 0,DP,VD,AF,HIAF,IMPACT,SYMBOL,sampleTimePt,gender,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl,REFBIAS_naturalLog
0,7281,26,0.0036,0.0033,0.0,0.0,0.0,0.0,1.0,1.013348,0.857143,34.0,1.182210,0.0,0.013260
4,7282,29,0.0040,0.0036,0.0,0.0,0.0,0.0,1.0,1.009151,1.071429,32.7,1.061729,0.0,0.009110
8,7282,24,0.0033,0.0033,0.0,0.0,0.0,0.0,1.0,1.005268,1.000000,34.9,1.005270,0.0,0.005254
12,178,2,0.0112,0.0120,0.0,1.0,0.0,0.0,1.0,1.000000,1.000000,37.0,1.000000,0.0,0.000000
13,1773,6,0.0034,0.0035,0.0,1.0,0.0,0.0,1.0,1.017162,1.000000,37.0,1.017150,0.0,0.017017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536717,711,4,0.0056,0.0057,0.0,28.0,1.0,1.0,1.0,1.008523,1.000000,37.0,1.008510,1.0,0.008487
536725,711,2,0.0028,0.0028,0.0,28.0,1.0,1.0,1.0,1.008499,1.000000,31.0,1.008490,1.0,0.008463
536733,711,2,0.0028,0.0029,1.0,28.0,1.0,1.0,1.0,1.005666,1.000000,37.0,1.005660,1.0,0.005650
536734,711,2,0.0028,0.0029,0.0,28.0,1.0,1.0,1.0,1.008499,1.000000,37.0,1.008490,1.0,0.008463


In [52]:
df1.describe()

Unnamed: 0,DP,VD,AF,HIAF,IMPACT,SYMBOL,sampleTimePt,gender,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl,REFBIAS_naturalLog
count,111514.0,111514.0,111514.0,111514.0,111514.0,111514.0,111514.0,111514.0,111514.0,111514.0,111514.0,111514.0,111514.0,111514.0,111514.0
mean,5738.085613,140.261922,0.011021,0.011052,0.052128,12.275885,0.523782,0.481841,0.963749,15.77326,0.988824,35.283367,1.675642,0.446392,-inf
std,13170.485678,2950.343617,0.067937,0.068319,0.222286,7.944495,0.499436,0.499672,0.184458,362.471265,2.225198,2.636294,45.541201,0.49712,
min,2.0,2.0,0.0025,0.0015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.5,0.0,0.0,-inf
25%,555.0,2.0,0.0028,0.0028,0.0,7.0,0.0,0.0,1.0,1.012605,1.0,34.4,1.01318,0.0,0.01252626
50%,1209.0,4.0,0.0035,0.0035,0.0,9.0,1.0,0.0,1.0,1.020056,1.0,37.0,1.02299,0.0,0.01985775
75%,5037.0,16.0,0.0052,0.0052,0.0,20.0,1.0,1.0,1.0,1.028777,1.0,37.0,1.04131,1.0,0.0283707
max,455487.0,319533.0,1.0,1.0,1.0,39.0,1.0,1.0,2.0,30215.0,724.5,37.0,14285.714286,1.0,10.31609


In [53]:
# With dataset 1
#upper
print("Above the median quantile")
print(df1["REFBIAS"].quantile(0.97))
print(df1["REFBIAS"].quantile(0.98))
print(df1["REFBIAS"].quantile(0.99))
print()
print(df1["REFBIAS_naturalLog"].quantile(0.97))
print(df1["REFBIAS_naturalLog"].quantile(0.98))
print(df1["REFBIAS_naturalLog"].quantile(0.99))
print()
#lower
print("Below the median quantile")
print(df1["REFBIAS"].quantile(0.1))
print(df1["REFBIAS"].quantile(0.05))
print(df1["REFBIAS"].quantile(0.01))
print()
print(df1["REFBIAS_naturalLog"].quantile(0.1))
print(df1["REFBIAS_naturalLog"].quantile(0.05))
print(df1["REFBIAS_naturalLog"].quantile(0.01))

Above the median quantile
1.0576923076923077
1.0690527335697206
54.97636363636279

0.056089466651043585
0.06677296055839184
4.00690272126298

Below the median quantile
1.0037735849056604
0.9946808510638298
0.0

0.0037664827954768648
-0.005333345975362617
-inf


In [54]:
#upper cut off at 0.067?
q = df1["REFBIAS_naturalLog"].quantile(0.98)
df1 = df1[df1["REFBIAS_naturalLog"] < q]
#lower cut off at 0.0038
q = df1["REFBIAS_naturalLog"].quantile(0.10)
df1 = df1[df1["REFBIAS_naturalLog"] > q]
df1.describe()

Unnamed: 0,DP,VD,AF,HIAF,IMPACT,SYMBOL,sampleTimePt,gender,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl,REFBIAS_naturalLog
count,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0
mean,6205.569642,81.073795,0.006401,0.006376,0.052088,12.179079,0.520543,0.481185,0.997926,1.023061,1.017476,35.244242,1.104391,0.443281,0.022736
std,13655.098214,1684.753488,0.030048,0.030128,0.222205,7.863109,0.49958,0.499648,0.045496,0.011456,0.341282,2.635327,0.390715,0.496775,0.011141
min,33.0,2.0,0.0025,0.0015,0.0,0.0,0.0,0.0,0.0,1.003521,0.0,22.5,0.0,0.0,0.003515
25%,601.0,2.0,0.0028,0.0028,0.0,7.0,0.0,0.0,1.0,1.014793,1.0,34.4,1.01573,0.0,0.014685
50%,1356.0,5.0,0.0034,0.0034,0.0,9.0,1.0,0.0,1.0,1.021186,1.0,37.0,1.0247,0.0,0.020965
75%,5701.0,18.0,0.0049,0.0049,0.0,20.0,1.0,1.0,1.0,1.02924,1.0,37.0,1.04299,1.0,0.02882
max,455487.0,137339.0,0.9902,0.9962,1.0,38.0,1.0,1.0,1.0,1.06903,18.0,37.0,17.780939,1.0,0.066752


In [55]:
df2.describe()

Unnamed: 0,DP,VD,AF,HIAF,IMPACT,SYMBOL,sampleTimePt,gender,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl,REFBIAS_naturalLog
count,157033.0,157033.0,157033.0,157033.0,157033.0,157033.0,157033.0,157033.0,157033.0,157033.0,157033.0,157033.0,157033.0,157033.0,157033.0
mean,2993.601905,53.156107,0.011415,0.011345,0.956168,14.827851,0.525234,0.496138,0.849844,9.747043,1.542255,32.155388,2.357794,0.478237,-inf
std,5145.917259,1076.017675,0.057943,0.059323,0.204721,9.381112,0.499364,0.499987,0.376606,160.293513,4.741635,4.414058,18.181966,0.499528,
min,2.0,2.0,0.0025,0.0015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.5,0.0,0.0,-inf
25%,729.0,3.0,0.0029,0.0028,1.0,8.0,0.0,0.0,1.0,0.862069,0.5,28.3,1.01657,0.0,-0.14842
50%,1567.0,6.0,0.0037,0.0037,1.0,13.0,1.0,0.0,1.0,0.979557,1.0,33.0,1.10014,0.0,-0.02065478
75%,3331.0,13.0,0.006,0.0058,1.0,23.0,1.0,1.0,1.0,1.026915,1.0,37.0,1.652,1.0,0.02655927
max,216036.0,173172.0,1.0,1.0,1.0,40.0,1.0,1.0,2.0,13870.0,575.0,37.0,3571.428571,1.0,9.537484


In [56]:
# With dataset 2
#upper
print("Above the median quantile")
print(df2["REFBIAS"].quantile(0.97))
print(df2["REFBIAS"].quantile(0.98))
print(df2["REFBIAS"].quantile(0.99))
print()
print(df2["REFBIAS_naturalLog"].quantile(0.97))
print(df2["REFBIAS_naturalLog"].quantile(0.98))
print(df2["REFBIAS_naturalLog"].quantile(0.99))
print()
#lower
print("Below the median quantile")
print(df2["REFBIAS"].quantile(0.1))
print(df2["REFBIAS"].quantile(0.05))
print(df2["REFBIAS"].quantile(0.01))
print()
print(df2["REFBIAS_naturalLog"].quantile(0.6))
print(df2["REFBIAS_naturalLog"].quantile(0.05))
print(df2["REFBIAS_naturalLog"].quantile(0.01))

Above the median quantile
1.1111111111111112
1.1422222626258425
12.266153846148743

0.10536051565782635
0.1329757179848386
2.5064518185870805

Below the median quantile
0.5623794086487293
0.23593930482230804
0.0

0.0009696972738023772
-1.4441806907359855
-inf


In [57]:
#upper cut off at 0.142?
q = df2["REFBIAS_naturalLog"].quantile(0.98)
df2 = df2[df2["REFBIAS_naturalLog"] < q]
#lower cut off at -1.444
q = df2["REFBIAS_naturalLog"].quantile(0.05)
df2 = df2[df2["REFBIAS_naturalLog"] > q]
df2.describe()

Unnamed: 0,DP,VD,AF,HIAF,IMPACT,SYMBOL,sampleTimePt,gender,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl,REFBIAS_naturalLog
count,146197.0,146197.0,146197.0,146197.0,146197.0,146197.0,146197.0,146197.0,146197.0,146197.0,146197.0,146197.0,146197.0,146197.0,146197.0
mean,3055.650232,36.680301,0.009267,0.00907,0.955649,14.84471,0.525072,0.496679,0.887576,0.928065,1.624989,32.122261,2.176161,0.478006,-0.09572
std,5203.571381,612.635943,0.041306,0.042181,0.205875,9.441809,0.499373,0.499991,0.337901,0.159505,4.729975,4.40141,8.748302,0.499518,0.226791
min,4.0,2.0,0.0025,0.0015,0.0,0.0,0.0,0.0,0.0,0.226827,0.0,22.5,0.0,0.0,-1.483569
25%,738.0,3.0,0.0029,0.0028,1.0,8.0,0.0,0.0,1.0,0.885196,0.666667,28.3,1.02372,0.0,-0.121946
50%,1601.0,6.0,0.0037,0.0037,1.0,13.0,1.0,0.0,1.0,0.983389,1.0,32.8,1.11465,0.0,-0.016751
75%,3435.0,13.0,0.006,0.0058,1.0,23.0,1.0,1.0,1.0,1.026415,1.083333,37.0,1.685573,1.0,0.026072
max,216036.0,69194.0,0.9869,1.0,1.0,40.0,1.0,1.0,2.0,1.142212,575.0,37.0,1234.567901,1.0,0.132966


## `DP` AND `VD` OUTLIERS

In [58]:
(df1.describe())

Unnamed: 0,DP,VD,AF,HIAF,IMPACT,SYMBOL,sampleTimePt,gender,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl,REFBIAS_naturalLog
count,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0,98353.0
mean,6205.569642,81.073795,0.006401,0.006376,0.052088,12.179079,0.520543,0.481185,0.997926,1.023061,1.017476,35.244242,1.104391,0.443281,0.022736
std,13655.098214,1684.753488,0.030048,0.030128,0.222205,7.863109,0.49958,0.499648,0.045496,0.011456,0.341282,2.635327,0.390715,0.496775,0.011141
min,33.0,2.0,0.0025,0.0015,0.0,0.0,0.0,0.0,0.0,1.003521,0.0,22.5,0.0,0.0,0.003515
25%,601.0,2.0,0.0028,0.0028,0.0,7.0,0.0,0.0,1.0,1.014793,1.0,34.4,1.01573,0.0,0.014685
50%,1356.0,5.0,0.0034,0.0034,0.0,9.0,1.0,0.0,1.0,1.021186,1.0,37.0,1.0247,0.0,0.020965
75%,5701.0,18.0,0.0049,0.0049,0.0,20.0,1.0,1.0,1.0,1.02924,1.0,37.0,1.04299,1.0,0.02882
max,455487.0,137339.0,0.9902,0.9962,1.0,38.0,1.0,1.0,1.0,1.06903,18.0,37.0,17.780939,1.0,0.066752


In [59]:
print("DATASET 1")
print("DEPTH")
print("DP at 97% quantile: " + str(df1.DP.quantile(0.97)))
print("DP at 98% quantile: " + str(df1.DP.quantile(0.98)))
print("DP at 99% quantile: " + str(df1.DP.quantile(0.99)))
print("VARIANT DEPTH")
print("VD at 97% quantile: " + str(df1.VD.quantile(0.97)))
print("VD at 98% quantile: " + str(df1.VD.quantile(0.98)))
print("VD at 99% quantile: " + str(df1.VD.quantile(0.99)))

print()

print("DATASET 2")
print("DEPTH")
print("DP at 97% quantile: " + str(df2.DP.quantile(0.97)))
print("DP at 98% quantile: " + str(df2.DP.quantile(0.98)))
print("DP at 99% quantile: " + str(df2.DP.quantile(0.99)))
print("VARIANT DEPTH")
print("VD at 97% quantile: " + str(df2.VD.quantile(0.97)))
print("VD at 98% quantile: " + str(df2.VD.quantile(0.98)))
print("VD at 99% quantile: " + str(df2.VD.quantile(0.99)))

DATASET 1
DEPTH
DP at 97% quantile: 37304.0
DP at 98% quantile: 46929.55999999991
DP at 99% quantile: 65636.0
VARIANT DEPTH
VD at 97% quantile: 119.0
VD at 98% quantile: 154.0
VD at 99% quantile: 232.0

DATASET 2
DEPTH
DP at 97% quantile: 13154.0
DP at 98% quantile: 16047.159999999974
DP at 99% quantile: 21929.520000000106
VARIANT DEPTH
VD at 97% quantile: 92.0
VD at 98% quantile: 135.0
VD at 99% quantile: 266.0


In [60]:
df1['VD_naturalLog'] = np.log(df1['VD'])
df2['VD_naturalLog'] = np.log(df2['VD'])
df1

Unnamed: 0,DP,VD,AF,HIAF,IMPACT,SYMBOL,sampleTimePt,gender,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl,REFBIAS_naturalLog,VD_naturalLog
0,7281,26,0.0036,0.0033,0.0,0.0,0.0,0.0,1.0,1.013348,0.857143,34.0,1.182210,0.0,0.013260,3.258097
4,7282,29,0.0040,0.0036,0.0,0.0,0.0,0.0,1.0,1.009151,1.071429,32.7,1.061729,0.0,0.009110,3.367296
8,7282,24,0.0033,0.0033,0.0,0.0,0.0,0.0,1.0,1.005268,1.000000,34.9,1.005270,0.0,0.005254,3.178054
13,1773,6,0.0034,0.0035,0.0,1.0,0.0,0.0,1.0,1.017162,1.000000,37.0,1.017150,0.0,0.017017,1.791759
14,1773,7,0.0039,0.0029,0.0,1.0,0.0,0.0,1.0,1.020619,0.750000,29.6,1.360580,0.0,0.020409,1.945910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536717,711,4,0.0056,0.0057,0.0,28.0,1.0,1.0,1.0,1.008523,1.000000,37.0,1.008510,1.0,0.008487,1.386294
536725,711,2,0.0028,0.0028,0.0,28.0,1.0,1.0,1.0,1.008499,1.000000,31.0,1.008490,1.0,0.008463,0.693147
536733,711,2,0.0028,0.0029,1.0,28.0,1.0,1.0,1.0,1.005666,1.000000,37.0,1.005660,1.0,0.005650,0.693147
536734,711,2,0.0028,0.0029,0.0,28.0,1.0,1.0,1.0,1.008499,1.000000,37.0,1.008490,1.0,0.008463,0.693147


In [61]:
# With dataset 1
#upper
print("Above the median quantile")
print(df1.VD_naturalLog.quantile(0.97))
print(df1.VD_naturalLog.quantile(0.98))
print(df1.VD_naturalLog.quantile(0.99))
print()
print(df1.VD.quantile(0.97))
print(df1.VD.quantile(0.98))
print(df1.VD.quantile(0.99))

print()

#Lower
print("Below the median quantile")
print(df1.VD_naturalLog.quantile(0.1))
print(df1.VD_naturalLog.quantile(0.05))
print(df1.VD_naturalLog.quantile(0.01))
print()
print(df1.VD.quantile(0.1))
print(df1.VD.quantile(0.05))
print(df1.VD.quantile(0.01))

Above the median quantile
4.77912349311153
5.0369526024136295
5.44673737166631

119.0
154.0
232.0

Below the median quantile
0.6931471805599454
0.6931471805599453
0.6931471805599453

2.0
2.0
2.0


In [62]:
#cut off at 4.779?
q = df1["VD_naturalLog"].quantile(0.97)
df1 = df1[df1["VD_naturalLog"] < q]

q = df1["VD_naturalLog"].quantile(0.05)
df1 = df1[df1["VD_naturalLog"] > q]
df1.describe()

Unnamed: 0,DP,VD,AF,HIAF,IMPACT,SYMBOL,sampleTimePt,gender,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl,REFBIAS_naturalLog,VD_naturalLog
count,66512.0,66512.0,66512.0,66512.0,66512.0,66512.0,66512.0,66512.0,66512.0,66512.0,66512.0,66512.0,66512.0,66512.0,66512.0,66512.0
mean,6263.097456,19.477628,0.00431,0.004198,0.037858,12.343547,0.494648,0.482605,0.998436,1.022408,1.027591,34.861045,1.14082,0.436718,0.022109,2.421584
std,8061.98402,22.910292,0.006591,0.006659,0.190854,7.870467,0.499975,0.499701,0.039512,0.010429,0.406904,2.716605,0.461066,0.495983,0.010154,1.009766
min,41.0,3.0,0.0025,0.0015,0.0,0.0,0.0,0.0,0.0,1.003521,0.0,22.5,0.0,0.0,0.003515,1.098612
25%,1163.0,4.0,0.0027,0.0027,0.0,7.0,0.0,0.0,1.0,1.014953,1.0,34.0,1.01678,0.0,0.014843,1.386294
50%,2501.0,8.0,0.0032,0.0031,0.0,10.0,0.0,0.0,1.0,1.020875,1.0,35.8,1.02612,0.0,0.02066,2.079442
75%,8330.0,25.0,0.0044,0.0042,0.0,20.0,1.0,1.0,1.0,1.02805,1.0,37.0,1.066058,1.0,0.027664,3.218876
max,47636.0,118.0,0.6434,0.6434,1.0,38.0,1.0,1.0,1.0,1.06903,18.0,37.0,17.780939,1.0,0.066752,4.770685


In [63]:
# With dataset 2
#upper
print("Above the median quantile")
print(df2.VD_naturalLog.quantile(0.97))
print(df2.VD_naturalLog.quantile(0.98))
print(df2.VD_naturalLog.quantile(0.99))
print()
print(df2.VD.quantile(0.97))
print(df2.VD.quantile(0.98))
print(df2.VD.quantile(0.99))

print()

#Lower
print("Below the median quantile")
print(df2.VD_naturalLog.quantile(0.1))
print(df2.VD_naturalLog.quantile(0.05))
print(df2.VD_naturalLog.quantile(0.01))
print()
print(df2.VD.quantile(0.1))
print(df2.VD.quantile(0.05))
print(df2.VD.quantile(0.01))

Above the median quantile
4.5217885770490405
4.90527477843843
5.583496308781699

92.0
135.0
266.0

Below the median quantile
0.6931471805599453
0.6931471805599453
0.6931471805599453

2.0
2.0
2.0


In [64]:
#cut off at 4.522?
q = df2["VD_naturalLog"].quantile(0.97)
df2 = df2[df2["VD_naturalLog"] < q]

q = df2["VD_naturalLog"].quantile(0.05)
df2 = df2[df2["VD_naturalLog"] > q]
df2.describe()

Unnamed: 0,DP,VD,AF,HIAF,IMPACT,SYMBOL,sampleTimePt,gender,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl,REFBIAS_naturalLog,VD_naturalLog
count,141798.0,141798.0,141798.0,141798.0,141798.0,141798.0,141798.0,141798.0,141798.0,141798.0,141798.0,141798.0,141798.0,141798.0,141798.0,141798.0
mean,2658.359159,10.93715,0.006603,0.00645,0.954872,14.794588,0.527878,0.495479,0.883327,0.927995,1.398141,32.239969,1.85146,0.479189,-0.095913,1.922337
std,3289.263874,13.154869,0.015518,0.016265,0.207585,9.460895,0.499224,0.499981,0.341139,0.159824,2.257818,4.353255,2.950377,0.499568,0.227515,0.923404
min,4.0,2.0,0.0025,0.0015,0.0,0.0,0.0,0.0,0.0,0.226827,0.0,22.5,0.0,0.0,-1.483569,0.693147
25%,720.0,3.0,0.0029,0.0028,1.0,8.0,0.0,0.0,1.0,0.8854,0.666667,28.3,1.022674,0.0,-0.121716,1.098612
50%,1550.0,6.0,0.0037,0.0036,1.0,13.0,1.0,0.0,1.0,0.983506,1.0,33.0,1.107119,0.0,-0.016631,1.791759
75%,3219.0,12.0,0.0056,0.0055,1.0,23.0,1.0,1.0,1.0,1.026688,1.0,37.0,1.6032,1.0,0.026338,2.484907
max,36590.0,91.0,0.8136,0.8727,1.0,40.0,1.0,1.0,2.0,1.142212,86.0,37.0,173.913043,1.0,0.132966,4.51086


In [65]:
df1.drop(['REFBIAS_naturalLog', 'VD_naturalLog', 'AF'], axis = 1)

Unnamed: 0,DP,VD,HIAF,IMPACT,SYMBOL,sampleTimePt,gender,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl
0,7281,26,0.0033,0.0,0.0,0.0,0.0,1.0,1.013348,0.857143,34.0,1.182210,0.0
4,7282,29,0.0036,0.0,0.0,0.0,0.0,1.0,1.009151,1.071429,32.7,1.061729,0.0
8,7282,24,0.0033,0.0,0.0,0.0,0.0,1.0,1.005268,1.000000,34.9,1.005270,0.0
13,1773,6,0.0035,0.0,1.0,0.0,0.0,1.0,1.017162,1.000000,37.0,1.017150,0.0
14,1773,7,0.0029,0.0,1.0,0.0,0.0,1.0,1.020619,0.750000,29.6,1.360580,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
536636,1698,6,0.0036,0.0,28.0,1.0,1.0,1.0,1.026442,1.000000,35.0,1.026430,1.0
536645,710,6,0.0086,0.0,28.0,1.0,1.0,1.0,1.011429,1.000000,37.0,1.011410,1.0
536660,710,4,0.0057,0.0,28.0,1.0,1.0,1.0,1.014286,1.000000,34.0,1.014270,1.0
536693,710,4,0.0043,0.0,28.0,1.0,1.0,1.0,1.014286,1.000000,30.5,1.014270,1.0


In [68]:
df2.drop(['REFBIAS_naturalLog', 'VD_naturalLog', 'AF'], axis = 1)

Unnamed: 0,DP,VD,HIAF,IMPACT,SYMBOL,sampleTimePt,gender,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl
0,2036,6,0.0044,0.0,0.0,0.0,0.0,1.0,0.957230,1.000000,35.0,1.044670,0.0
5,2036,7,0.0035,1.0,0.0,0.0,0.0,1.0,0.988224,0.750000,35.3,1.317450,0.0
9,2040,5,0.0021,1.0,0.0,0.0,0.0,1.0,0.988166,0.666667,31.8,1.481960,0.0
13,2041,6,0.0025,1.0,0.0,0.0,0.0,1.0,0.983415,1.000000,32.7,1.016860,0.0
17,2037,5,0.0028,1.0,0.0,0.0,0.0,1.0,0.894942,0.666667,32.2,1.342210,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
693007,4818,39,0.0062,1.0,31.0,1.0,1.0,1.0,1.027062,3.875000,26.5,3.771735,1.0
693014,4820,16,0.0025,1.0,31.0,1.0,1.0,1.0,1.019459,0.454545,25.0,2.242460,1.0
693021,4805,13,0.0027,1.0,31.0,1.0,1.0,1.0,1.000839,0.857143,35.0,1.167610,1.0
693035,4788,14,0.0030,1.0,31.0,1.0,1.0,1.0,0.998316,1.000000,37.0,1.001683,1.0


## Tuning Hyperparameters

In [66]:
X = df1.drop('chipOrControl', axis = 1) # drop the target variable for the features
y = df1['chipOrControl'] # create a target dataframe
param_grid = {'n_estimators': [40, 50, 60], 'min_samples_split': [40, 50, 60, 70], 'min_samples_leaf': [12, 13, 14, 15, 16, 17], 
              'max_features': ['auto'], 'max_depth': [3, 4, 5, 6], 'criterion': ['gini'], 'bootstrap': [False]}

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state = 0)

knn = KNeighborsClassifier(n_neighbors = 3, weights= 'distance')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print('Accuracy {0}'.format(np.round(accuracy_score(y_test, y_pred),3)))

Accuracy 0.778


In [69]:
X = df2.drop('chipOrControl', axis = 1) # drop the target variable for the features
y = df2['chipOrControl'] # create a target dataframe
param_grid = {'n_estimators': [40, 50, 60], 'min_samples_split': [40, 50, 60, 70], 'min_samples_leaf': [12, 13, 14, 15, 16, 17], 
              'max_features': ['auto'], 'max_depth': [3, 4, 5, 6], 'criterion': ['gini'], 'bootstrap': [False]}

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state = 0)

knn = KNeighborsClassifier(n_neighbors = 3, weights= 'distance')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print('Accuracy {0}'.format(np.round(accuracy_score(y_test, y_pred),3)))

Accuracy 0.64
