In [13]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split # for splitting the data into training and testing sets
from sklearn.linear_model import LinearRegression, Ridge # models we are going to use
from sklearn.metrics import r2_score # for comparing the predicted and test values
import seaborn as sns;

### WITH THE FIRST DATASET

In [19]:
# Read the tab-separated run-1 variant calls and preview the first rows.
df1 = pd.read_csv(
    "chipVariantCalling_run1.tsv",
    delimiter="\t",
)
df1.head()

Unnamed: 0,d.barcode,DP,VD,AF,HIAF,IMPACT,SYMBOL,loci,sampleTimePt,gender,MSID,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl
0,4010289633,7281,26,0.0036,0.0033,MODERATE,GNB1,chr1:1747196_T/C,Baseline,Male,MS2083,2:2,3644:3596,12:14,34.0,1.18221,CHIP
1,4010289633,7281,26,0.0036,0.0033,MODERATE,GNB1,chr1:1747196_T/C,Baseline,Male,MS2083,2:2,3644:3596,12:14,34.0,1.18221,CHIP
2,4010289633,7281,26,0.0036,0.0033,MODERATE,GNB1,chr1:1747196_T/C,Baseline,Male,MS2083,2:2,3644:3596,12:14,34.0,1.18221,CHIP
3,4010289633,7281,26,0.0036,0.0033,MODERATE,GNB1,chr1:1747196_T/C,Baseline,Male,MS2083,2:2,3644:3596,12:14,34.0,1.18221,CHIP
4,4010289633,7282,29,0.004,0.0036,MODERATE,GNB1,chr1:1747250_T/C,Baseline,Male,MS2083,2:2,3639:3606,15:14,32.7,1.061729,CHIP


In [20]:
# Clean the run-1 calls in one pass:
#   1. discard rows whose chipOrControl label is 'Blank' or 'Unknown',
#   2. remove exact duplicate rows,
#   3. discard rows with a missing chipOrControl label,
#   4. drop the d.barcode, IMPACT and gender columns.
df1 = (
    df1[~df1.chipOrControl.isin(['Blank', 'Unknown'])]
    .drop_duplicates()
    .dropna(subset=['chipOrControl'])
    .drop(columns=['d.barcode', 'IMPACT', 'gender'])
)

In [21]:
# Encode the chipOrControl label numerically: 'CHIP' -> 1, 'Control' -> 0.
# Any other value in the column is left as it is.
df1 = df1.replace(
    to_replace={'chipOrControl': {'CHIP': 1, 'Control': 0}},
)

df1.head()

Unnamed: 0,DP,VD,AF,HIAF,SYMBOL,loci,sampleTimePt,MSID,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl
0,7281,26,0.0036,0.0033,GNB1,chr1:1747196_T/C,Baseline,MS2083,2:2,3644:3596,12:14,34.0,1.18221,1
4,7282,29,0.004,0.0036,GNB1,chr1:1747250_T/C,Baseline,MS2083,2:2,3639:3606,15:14,32.7,1.061729,1
8,7282,24,0.0033,0.0033,GNB1,chr1:1747256_T/C,Baseline,MS2083,2:2,3626:3607,12:12,34.9,1.00527,1
12,178,2,0.0112,0.012,NRAS,chr1:115256571_T/C,Baseline,MS2083,2:2,88:88,1:1,37.0,1.0,1
13,1773,6,0.0034,0.0035,NRAS,chr1:115258674_T/C,Baseline,MS2083,2:2,889:874,3:3,37.0,1.01715,1


In [22]:
#converting reference bias and variable bias to float types
def ratio_to_int(string):
    """Convert an ``"a:b"`` string into the quotient ``a / b``.

    Parameters
    ----------
    string : str
        Two integers written as text and separated by a single colon,
        e.g. ``"3644:3596"``.

    Returns
    -------
    float or int
        ``int(a) / int(b)`` (true division, so a float despite the function
        name), or the integer ``0`` when ``b`` is zero.

    Raises
    ------
    ValueError
        If the text does not split into exactly two parts on ``":"``, or if
        a part that gets parsed is not a valid integer.
    """
    numerator, denominator = string.split(":")
    denominator = int(denominator)
    # Guard against division by zero: such ratios are reported as 0.
    if denominator == 0:
        return 0
    return int(numerator) / denominator

# Parse every "a:b" entry of REFBIAS and VARBIAS into a number. Both series
# are built from the original string columns before either is overwritten.
refbias = pd.Series([ratio_to_int(str(ratio)) for ratio in df1.REFBIAS.array])
varbias = pd.Series([ratio_to_int(str(ratio)) for ratio in df1.VARBIAS.array])

# Assign via .values so the numbers are written positionally; the fresh
# 0..n-1 index of the new series does not match df1's index.
df1['REFBIAS'] = refbias.values
df1['VARBIAS'] = varbias.values

In [23]:
# Display df1 after the REFBIAS/VARBIAS conversion (the rendering is truncated to head and tail rows).
df1

Unnamed: 0,DP,VD,AF,HIAF,SYMBOL,loci,sampleTimePt,MSID,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl
0,7281,26,0.0036,0.0033,GNB1,chr1:1747196_T/C,Baseline,MS2083,2:2,1.013348,0.857143,34.0,1.182210,1
4,7282,29,0.0040,0.0036,GNB1,chr1:1747250_T/C,Baseline,MS2083,2:2,1.009151,1.071429,32.7,1.061729,1
8,7282,24,0.0033,0.0033,GNB1,chr1:1747256_T/C,Baseline,MS2083,2:2,1.005268,1.000000,34.9,1.005270,1
12,178,2,0.0112,0.0120,NRAS,chr1:115256571_T/C,Baseline,MS2083,2:2,1.000000,1.000000,37.0,1.000000,1
13,1773,6,0.0034,0.0035,NRAS,chr1:115258674_T/C,Baseline,MS2083,2:2,1.017162,1.000000,37.0,1.017150,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536725,711,2,0.0028,0.0028,BRCC3,chrX:154305513_C/T,Y3,MS6625,2:2,1.008499,1.000000,31.0,1.008490,0
536726,711,2,0.0028,0.0029,BRCC3,chrX:154305518_T/C,Y3,MS6625,2:2,1.005666,1.000000,37.0,1.005660,0
536733,711,2,0.0028,0.0029,BRCC3,chrX:154305518_T/C,Y3,MS6625,2:2,1.005666,1.000000,37.0,1.005660,0
536734,711,2,0.0028,0.0029,BRCC3,chrX:154305533_A/T,Y3,MS6625,2:2,1.008499,1.000000,37.0,1.008490,0


Two different dataframes:

`actual_chip` - contains all the values with `AF`>= 0.02 (by contemporary definition) and are classified as CHIP

`false_control` - contains all the values with `AF`>= 0.02 but are classified as Control

In [24]:
# Keep variants with allele frequency (AF) of at least 0.02, then split them
# by label: chipOrControl == 1 (CHIP) versus chipOrControl == 0 (Control).
allele_fq = df1.loc[df1["AF"].ge(0.02)]
actual_chip = allele_fq.loc[allele_fq["chipOrControl"].eq(1)]
false_control = allele_fq.loc[allele_fq["chipOrControl"].eq(0)]
actual_chip

Unnamed: 0,DP,VD,AF,HIAF,SYMBOL,loci,sampleTimePt,MSID,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl
276,80,2,0.0250,0.0260,DNMT3A,chr2:25467154_G/A,Baseline,MS2083,2:2,1.166667,1.000000,37.0,1.164410,1
716,4201,2113,0.5030,0.5025,TET2,chr4:106155620_C/A,Baseline,MS2083,2:2,1.010628,1.018147,35.8,1.007435,1
1332,2,2,1.0000,1.0000,TET2,chr4:106157527_C/T,Baseline,MS2083,0:2,0.000000,1.000000,31.0,0.000000,1
1444,50,3,0.0600,0.0435,TET2,chr4:106158252_G/T,Baseline,MS2083,2:2,1.043478,0.500000,28.3,2.057400,1
1452,51,2,0.0392,0.0417,TET2,chr4:106158331_A/G,Baseline,MS2083,2:2,0.960000,1.000000,37.0,1.040843,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526423,96274,47654,0.4950,0.4966,SNX18,chr5:53815495_A/C,Y3,MS14472,2:2,1.015994,1.012586,36.2,1.003370,1
526738,1681,1668,0.9923,0.9958,TP53,chr17:7579472_G/C,Y3,MS14472,2:2,1.000000,1.016929,36.3,1.016922,1
526776,53,2,0.0377,0.0385,SRSF2,chr17:74733034_T/C,Y3,MS14472,2:2,1.040000,1.000000,37.0,1.039240,1
526835,25218,24992,0.9910,0.9959,ASXL1,chr20:31022959_T/C,Y3,MS14472,2:2,1.000000,1.015809,35.8,1.015806,1


In [25]:
# Display the AF >= 0.02 variants whose chipOrControl label is 0 (Control).
false_control

Unnamed: 0,DP,VD,AF,HIAF,SYMBOL,loci,sampleTimePt,MSID,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl
11953,18320,516,0.0282,0.0286,DNMT3A,chr2:25457256_GACGTC/G,Baseline,MS14128,2:2,1.012819,1.015625,36.9,1.002768,0
12213,144,4,0.0278,0.0280,DNMT3A,chr2:25464507_G/A,Baseline,MS14128,2:2,1.028986,1.000000,37.0,1.028790,0
12303,16,2,0.1250,0.1250,DNMT3A,chr2:25468147_A/G,Baseline,MS14128,2:2,1.000000,1.000000,31.0,1.000000,0
12775,408,215,0.5270,0.5179,TET2,chr4:106155751_G/A,Baseline,MS14128,2:2,1.010526,1.067308,34.6,1.056022,0
13759,163,4,0.0245,0.0191,TET2,chr4:106157812_G/A,Baseline,MS14128,2:2,0.987342,1.000000,27.5,1.012740,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536378,179,4,0.0223,0.0227,ASXL1,chr20:31022811_T/C,Y3,MS6625,2:2,1.011494,1.000000,37.0,1.011430,0
536382,6820,6749,0.9896,0.9962,ASXL1,chr20:31022959_T/C,Y3,MS6625,2:2,1.000000,1.020054,35.7,1.020054,0
536400,178,4,0.0225,0.0225,ASXL1,chr20:31023305_G/A,Y3,MS6625,2:2,1.000000,1.000000,37.0,1.000000,0
536482,53257,26575,0.4990,0.4976,LIPI,chr21:15481365_G/T,Y3,MS6625,2:2,1.011299,1.011277,35.5,1.000000,0


In [26]:
# Print summary statistics for both subsets as plain text, each preceded by
# a heading and a spacer line.
print(
    "ACTUAL_CHIP DESCRIPTION",
    " ",
    actual_chip.describe(),
    " ",
    "FALSE_CONTROL DESCRIPTION",
    " ",
    false_control.describe(),
    sep="\n",
)

ACTUAL_CHIP DESCRIPTION
 
                  DP             VD           AF         HIAF      REFBIAS  \
count    1941.000000    1941.000000  1941.000000  1941.000000  1941.000000   
mean     5730.137043    3782.135497     0.197759     0.199533     1.098606   
std     22055.169437   16979.919135     0.312914     0.314402     3.740221   
min         2.000000       2.000000     0.020000     0.013900     0.000000   
25%        50.000000       2.000000     0.025000     0.025300     1.000000   
50%        89.000000       2.000000     0.040800     0.041700     1.000000   
75%       250.000000       9.000000     0.142900     0.150200     1.030303   
max    324189.000000  319533.000000     1.000000     1.000000   104.666667   

           VARBIAS         QUAL      ODDRATIO  chipOrControl  
count  1941.000000  1941.000000   1941.000000         1941.0  
mean      1.386735    35.952550      8.543060            1.0  
std      16.476665     2.157903    324.243488            0.0  
min       0.000000 

### WITH THE SECOND DATASET

In [31]:
# Read the tab-separated run-2 variant calls and preview the first rows.
# low_memory=False turns off pandas' chunked parsing, which can otherwise
# infer mixed types within a column.
df2 = pd.read_csv(
    "chipVariantCalling_run2.tsv",
    delimiter="\t",
    low_memory=False,
)
df2.head()

Unnamed: 0,d.barcode,DP,VD,AF,HIAF,IMPACT,SYMBOL,loci,sampleTimePt,gender,MSID,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl
0,4010289633,2036,6,0.0029,0.0044,HIGH,GNB1,chr1:1747193_A/G,Baseline,Male,MS2083,2:2,940:982,3:3,35.0,1.04467,CHIP
1,4010289633,2036,6,0.0029,0.0044,HIGH,GNB1,chr1:1747193_A/G,Baseline,Male,MS2083,2:2,940:982,3:3,35.0,1.04467,CHIP
2,4010289633,2036,6,0.0029,0.0044,HIGH,GNB1,chr1:1747193_A/G,Baseline,Male,MS2083,2:2,940:982,3:3,35.0,1.04467,CHIP
3,4010289633,2036,6,0.0029,0.0044,HIGH,GNB1,chr1:1747193_A/G,Baseline,Male,MS2083,2:2,940:982,3:3,35.0,1.04467,CHIP
4,4010289633,2036,6,0.0029,0.0044,HIGH,GNB1,chr1:1747193_A/G,Baseline,Male,MS2083,2:2,940:982,3:3,35.0,1.04467,CHIP


In [32]:
# Clean the run-2 calls in one pass:
#   1. discard rows whose chipOrControl label is 'Blank' or 'Unknown',
#   2. remove exact duplicate rows,
#   3. discard rows with a missing chipOrControl label,
#   4. drop the d.barcode, IMPACT and gender columns.
df2 = (
    df2[~df2.chipOrControl.isin(['Blank', 'Unknown'])]
    .drop_duplicates()
    .dropna(subset=['chipOrControl'])
    .drop(columns=['d.barcode', 'IMPACT', 'gender'])
)

In [33]:
# Encode the chipOrControl label numerically: 'CHIP' -> 1, 'Control' -> 0.
# Any other value in the column is left as it is.
df2 = df2.replace(
    to_replace={'chipOrControl': {'CHIP': 1, 'Control': 0}},
)

df2.head()

Unnamed: 0,DP,VD,AF,HIAF,SYMBOL,loci,sampleTimePt,MSID,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl
0,2036,6,0.0029,0.0044,GNB1,chr1:1747193_A/G,Baseline,MS2083,2:2,940:982,3:3,35.0,1.04467,1
5,2036,7,0.0034,0.0035,GNB1,chr1:1747198_G/A,Baseline,MS2083,2:2,1007:1019,3:4,35.3,1.31745,1
9,2040,5,0.0025,0.0021,GNB1,chr1:1747220_C/T,Baseline,MS2083,2:2,1002:1014,2:3,31.8,1.48196,1
13,2041,6,0.0029,0.0025,GNB1,chr1:1747238_G/A,Baseline,MS2083,2:2,1008:1025,3:3,32.7,1.01686,1
17,2037,5,0.0025,0.0028,GNB1,chr1:1747256_T/C,Baseline,MS2083,2:2,920:1028,2:3,32.2,1.34221,1


In [34]:
#converting reference bias and variable bias to float types
def ratio_to_int(string):
    """Convert an ``"a:b"`` string into the quotient ``a / b``.

    This repeats the helper of the same name defined earlier in the
    notebook for the first dataset; the behaviour is the same.

    Parameters
    ----------
    string : str
        Two integers written as text and separated by a single colon,
        e.g. ``"940:982"``.

    Returns
    -------
    float or int
        ``int(a) / int(b)`` (true division, so a float despite the function
        name), or the integer ``0`` when ``b`` is zero.

    Raises
    ------
    ValueError
        If the text does not split into exactly two parts on ``":"``, or if
        a part that gets parsed is not a valid integer.
    """
    numerator, denominator = string.split(":")
    denominator = int(denominator)
    # Guard against division by zero: such ratios are reported as 0.
    if denominator == 0:
        return 0
    return int(numerator) / denominator

# Parse every "a:b" entry of REFBIAS and VARBIAS into a number. Both series
# are built from the original string columns before either is overwritten.
refbias = pd.Series([ratio_to_int(str(ratio)) for ratio in df2.REFBIAS.array])
varbias = pd.Series([ratio_to_int(str(ratio)) for ratio in df2.VARBIAS.array])

# Assign via .values so the numbers are written positionally; the fresh
# 0..n-1 index of the new series does not match df2's index.
df2['REFBIAS'] = refbias.values
df2['VARBIAS'] = varbias.values

In [35]:
# Display df2 after the REFBIAS/VARBIAS conversion (the rendering is truncated to head and tail rows).
df2

Unnamed: 0,DP,VD,AF,HIAF,SYMBOL,loci,sampleTimePt,MSID,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl
0,2036,6,0.0029,0.0044,GNB1,chr1:1747193_A/G,Baseline,MS2083,2:2,0.957230,1.000000,35.0,1.044670,1
5,2036,7,0.0034,0.0035,GNB1,chr1:1747198_G/A,Baseline,MS2083,2:2,0.988224,0.750000,35.3,1.317450,1
9,2040,5,0.0025,0.0021,GNB1,chr1:1747220_C/T,Baseline,MS2083,2:2,0.988166,0.666667,31.8,1.481960,1
13,2041,6,0.0029,0.0025,GNB1,chr1:1747238_G/A,Baseline,MS2083,2:2,0.983415,1.000000,32.7,1.016860,1
17,2037,5,0.0025,0.0028,GNB1,chr1:1747256_T/C,Baseline,MS2083,2:2,0.894942,0.666667,32.2,1.342210,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693014,4820,16,0.0033,0.0025,BRCC3,chrX:154306961_T/C,Y3,MS6625,2:2,1.019459,0.454545,25.0,2.242460,0
693021,4805,13,0.0027,0.0027,BRCC3,chrX:154306969_T/C,Y3,MS6625,2:2,1.000839,0.857143,35.0,1.167610,0
693028,4788,133,0.0278,0.0199,BRCC3,chrX:154306974_T/G,Y3,MS6625,2:2,0.949095,10.083333,27.2,10.620221,0
693035,4788,14,0.0029,0.0030,BRCC3,chrX:154306976_T/C,Y3,MS6625,2:2,0.998316,1.000000,37.0,1.001683,0


Two different dataframes:

`actual_chip2` - contains all the variants with `AF` < 0.02 (i.e. below the contemporary CHIP threshold) that are classified as CHIP

`false_control2` - contains all the variants with `AF` < 0.02 that are classified as Control

In [36]:
# NOTE: unlike the first dataset (which keeps AF >= 0.02), this selects the
# variants with AF strictly below 0.02, then splits them by label:
# chipOrControl == 1 (CHIP) versus chipOrControl == 0 (Control).
allele_fq_2 = df2.loc[df2["AF"].lt(0.02)]
actual_chip2 = allele_fq_2.loc[allele_fq_2["chipOrControl"].eq(1)]
false_control2 = allele_fq_2.loc[allele_fq_2["chipOrControl"].eq(0)]
actual_chip2

Unnamed: 0,DP,VD,AF,HIAF,SYMBOL,loci,sampleTimePt,MSID,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl
0,2036,6,0.0029,0.0044,GNB1,chr1:1747193_A/G,Baseline,MS2083,2:2,0.957230,1.000000,35.0,1.044670,1
5,2036,7,0.0034,0.0035,GNB1,chr1:1747198_G/A,Baseline,MS2083,2:2,0.988224,0.750000,35.3,1.317450,1
9,2040,5,0.0025,0.0021,GNB1,chr1:1747220_C/T,Baseline,MS2083,2:2,0.988166,0.666667,31.8,1.481960,1
13,2041,6,0.0029,0.0025,GNB1,chr1:1747238_G/A,Baseline,MS2083,2:2,0.983415,1.000000,32.7,1.016860,1
17,2037,5,0.0025,0.0028,GNB1,chr1:1747256_T/C,Baseline,MS2083,2:2,0.894942,0.666667,32.2,1.342210,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
679619,5716,16,0.0028,0.0028,BRCC3,chrX:154305536_AGCT/CGCG,Y3,MS14472,2:1,0.838986,0.000000,36.8,0.000000,1
679626,5694,23,0.0040,0.0042,BRCC3,chrX:154305539_TGT/GGG,Y3,MS14472,2:1,0.730211,0.000000,36.8,0.000000,1
679633,32479,85,0.0026,0.0017,BRCC3,chrX:154306943_C/A,Y3,MS14472,2:2,1.020416,2.695652,24.2,2.641799,1
679647,32072,330,0.0103,0.0073,BRCC3,chrX:154306985_A/C,Y3,MS14472,2:2,0.967013,18.411765,27.0,19.040366,1


In [37]:
# Display the AF < 0.02 variants whose chipOrControl label is 0 (Control).
false_control2

Unnamed: 0,DP,VD,AF,HIAF,SYMBOL,loci,sampleTimePt,MSID,BIAS,REFBIAS,VARBIAS,QUAL,ODDRATIO,chipOrControl
14554,5989,15,0.0025,0.0019,GNB1,chr1:1747228_T/C,Baseline,MS14128,2:2,0.971268,0.875000,30.1,1.109990,0
14558,5985,19,0.0032,0.0054,GNB1,chr1:1747246_AGT/CGG,Baseline,MS14128,2:1,0.462071,0.000000,34.3,0.000000,0
14562,5979,16,0.0027,0.0027,GNB1,chr1:1747256_T/C,Baseline,MS14128,2:2,0.882217,1.000000,32.2,1.133478,0
14566,5962,15,0.0025,0.0026,GNB1,chr1:1747279_A/G,Baseline,MS14128,2:2,0.951736,1.142857,34.5,1.200768,0
14570,661,3,0.0045,0.0032,NRAS,chr1:115256491_T/C,Baseline,MS14128,2:2,0.969970,0.500000,28.3,1.938060,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693007,4818,39,0.0081,0.0062,BRCC3,chrX:154306957_A/T,Y3,MS6625,2:2,1.027062,3.875000,26.5,3.771735,0
693014,4820,16,0.0033,0.0025,BRCC3,chrX:154306961_T/C,Y3,MS6625,2:2,1.019459,0.454545,25.0,2.242460,0
693021,4805,13,0.0027,0.0027,BRCC3,chrX:154306969_T/C,Y3,MS6625,2:2,1.000839,0.857143,35.0,1.167610,0
693035,4788,14,0.0029,0.0030,BRCC3,chrX:154306976_T/C,Y3,MS6625,2:2,0.998316,1.000000,37.0,1.001683,0


In [38]:
# Print summary statistics for both run-2 subsets as plain text, each
# preceded by a heading and a spacer line.
print(
    "ACTUAL_CHIP DESCRIPTION",
    " ",
    actual_chip2.describe(),
    " ",
    "FALSE_CONTROL DESCRIPTION",
    " ",
    false_control2.describe(),
    sep="\n",
)

ACTUAL_CHIP DESCRIPTION
 
                  DP            VD            AF          HIAF       REFBIAS  \
count   77396.000000  77396.000000  77396.000000  77396.000000  77396.000000   
mean     3151.968073     13.425087      0.004880      0.004696     10.262888   
std      5395.925810     28.312495      0.003185      0.003188    169.152672   
min       101.000000      2.000000      0.002500      0.001500      0.000000   
25%       756.000000      3.000000      0.002900      0.002800      0.869123   
50%      1641.000000      6.000000      0.003600      0.003500      0.982071   
75%      3545.250000     13.000000      0.005400      0.005200      1.027848   
max    216036.000000   1582.000000      0.019900      0.035700  13870.000000   

            VARBIAS          QUAL      ODDRATIO  chipOrControl  
count  77396.000000  77396.000000  77396.000000        77396.0  
mean       1.390874     32.327237      1.988175            1.0  
std        2.334504      4.323360      5.406166           