In [3]:
#Import Libraries
import subprocess
import re
import pandas as pd
import os
import collections
import scipy.stats
import numpy as np
import scipy
import csv
import math
import seaborn as sns
import statsmodels.api as sm

from functools import reduce
from tqdm import tqdm
from matplotlib import pyplot as plt
%matplotlib inline
from pandas.tools.plotting import table
from scipy import interp
from sklearn import svm
from sklearn.svm import SVC
from sklearn.datasets import make_blobs, make_classification


from sklearn.feature_selection import SelectKBest, chi2, VarianceThreshold
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, GridSearchCV, LeaveOneOut, train_test_split, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, auc, roc_curve, precision_recall_curve, f1_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.multiclass import unique_labels
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D

## Creazione dataset

In [2]:
string_ss="som_mut/ss_6_32/files/"
string_sl="som_mut/sl_32/files/"
string_res="som_mut/res/files/"

In [3]:
path="clintab_GMQL/"+string_res
names=[]
barcodes_res=[]
filelist=os.listdir(path)
filelist.sort()
for file in filelist:

    if file.endswith(".meta"):
        tmp=pd.read_csv(path+file, sep='\t', header=None, index_col=0)
        tmp=tmp.T
        if tmp['biospecimen__shared__bcr_patient_barcode'].values[0] in barcodes_res:
            print(file)
        else:
            names.append(file[:11])
            barcodes_res.append(tmp['biospecimen__shared__bcr_patient_barcode'].values[0])

In [4]:
path="clintab_GMQL/"+string_res

col=['patient', 'chrom', 'start', 'stop', 'gene_symbol', 'entrez_id', 'variant_class', 'ref_allele']
r=[]
for i in range(len(names)):
    tmp=pd.read_csv(path+names[i], sep='\t', header=None)
    tmp=tmp.values
    p='R_00'
    if i < 10:
        p=p+'00'+str(i)
    elif i< 100:
        p=p+'0'+str(i)
    else:
            p=p+str(i)
    for j in range(tmp.shape[0]):
        chrom=tmp[j][0].replace("chr", "")
        r.append([p, chrom, tmp[j][1]+1, tmp[j][2], tmp[j][4], tmp[j][5], tmp[j][6], tmp[j][8]])

        
data_r=pd.DataFrame(r)
data_r.to_csv(path_or_buf ='clintab_GMQL/som_mut/res_sm.csv', sep='\t', header=col, index=False)
data_r.head() #ci sono dei nan

Unnamed: 0,0,1,2,3,4,5,6,7
0,R_00000,1,16648965,16648965,MST1P2,11209,RNA,C
1,R_00000,1,35736524,35736524,CLSPN,63967,Missense_Mutation,C
2,R_00000,1,88832840,88832840,PKN2,5586,Missense_Mutation,A
3,R_00000,1,241872095,241872095,EXO1,9156,Missense_Mutation,A
4,R_00000,2,201880094,201880094,CDK15,65061,Silent,C


In [5]:
data_r.columns=col
data_r.head()

Unnamed: 0,patient,chrom,start,stop,gene_symbol,entrez_id,variant_class,ref_allele
0,R_00000,1,16648965,16648965,MST1P2,11209,RNA,C
1,R_00000,1,35736524,35736524,CLSPN,63967,Missense_Mutation,C
2,R_00000,1,88832840,88832840,PKN2,5586,Missense_Mutation,A
3,R_00000,1,241872095,241872095,EXO1,9156,Missense_Mutation,A
4,R_00000,2,201880094,201880094,CDK15,65061,Silent,C


In [6]:
path="clintab_GMQL/"+string_sl
names=[]
barcodes_sl=[]
filelist=os.listdir(path)
filelist.sort()
for file in filelist:

    if file.endswith(".meta"):
        tmp=pd.read_csv(path+file, sep='\t', header=None, index_col=0)
        tmp=tmp.T
        if tmp['biospecimen__shared__bcr_patient_barcode'].values[0] in barcodes_sl:
            print(file)
        else:
            names.append(file[:11])
            barcodes_sl.append(tmp['biospecimen__shared__bcr_patient_barcode'].values[0])

In [7]:
path="clintab_GMQL/"+string_sl

col=['patient', 'chrom', 'start', 'stop', 'gene_symbol', 'entrez_id', 'variant_class', 'ref_allele']
sl=[]
for i in range(len(names)):
    tmp=pd.read_csv(path+names[i], sep='\t', header=None)
    tmp=tmp.values
    p='SL_00'
    if i < 10:
        p=p+'00'+str(i)
    elif i< 100:
        p=p+'0'+str(i)
    else:
            p=p+str(i)
    for j in range(tmp.shape[0]):
        chrom=tmp[j][0].replace("chr", "")
        sl.append([p, chrom, tmp[j][1]+1, tmp[j][2], tmp[j][4], tmp[j][5], tmp[j][6], tmp[j][8]])

        
data_sl=pd.DataFrame(sl)
data_sl.to_csv(path_or_buf ='clintab_GMQL/som_mut/sl_sm.csv', sep='\t', header=col, index=False)
data_sl.head() #ci sono dei nan

Unnamed: 0,0,1,2,3,4,5,6,7
0,SL_00000,1,12860019,12860019,PRAMEF2,65122,Missense_Mutation,A
1,SL_00000,1,13306742,13306742,RP11-219C24.6,0,RNA,A
2,SL_00000,1,17354542,17354542,PADI4,23569,Missense_Mutation,T
3,SL_00000,1,20483607,20483607,CAMK2N1,55450,3'UTR,G
4,SL_00000,1,23974977,23974977,SRSF10,10772,Missense_Mutation,T


In [8]:
data_sl.columns=col
data_sl.head()

Unnamed: 0,patient,chrom,start,stop,gene_symbol,entrez_id,variant_class,ref_allele
0,SL_00000,1,12860019,12860019,PRAMEF2,65122,Missense_Mutation,A
1,SL_00000,1,13306742,13306742,RP11-219C24.6,0,RNA,A
2,SL_00000,1,17354542,17354542,PADI4,23569,Missense_Mutation,T
3,SL_00000,1,20483607,20483607,CAMK2N1,55450,3'UTR,G
4,SL_00000,1,23974977,23974977,SRSF10,10772,Missense_Mutation,T


In [9]:
path="clintab_GMQL/"+string_ss
names=[]
barcodes_ss=[]
filelist=os.listdir(path)
filelist.sort()
for file in filelist:

    if file.endswith(".meta"):
        tmp=pd.read_csv(path+file, sep='\t', header=None, index_col=0)
        tmp=tmp.T
        if tmp['biospecimen__shared__bcr_patient_barcode'].values[0] in barcodes_ss:
            print(file)
        else:
            names.append(file[:11])
            barcodes_ss.append(tmp['biospecimen__shared__bcr_patient_barcode'].values[0])

In [10]:
path="clintab_GMQL/"+string_ss

col=['patient', 'chrom', 'start', 'stop', 'gene_symbol', 'entrez_id', 'variant_class', 'ref_allele']
ss=[]
for i in range(len(names)):
    tmp=pd.read_csv(path+names[i], sep='\t', header=None)
    tmp=tmp.values
    p='SS_00'
    if i < 10:
        p=p+'00'+str(i)
    elif i< 100:
        p=p+'0'+str(i)
    else:
            p=p+str(i)
    for j in range(tmp.shape[0]):
        chrom=tmp[j][0].replace("chr", "")
        ss.append([p, chrom, tmp[j][1]+1, tmp[j][2], tmp[j][4], tmp[j][5], tmp[j][6], tmp[j][8]])

        
data_ss=pd.DataFrame(ss)
data_ss.to_csv(path_or_buf ='clintab_GMQL/som_mut/ss_sm.csv', sep='\t', header=col, index=False)
data_ss.head() #ci sono dei nan

Unnamed: 0,0,1,2,3,4,5,6,7
0,SS_00000,1,9601338,9601338,TMEM201,199953,Silent,G
1,SS_00000,1,13115682,13115682,HNRNPCL2,440563,Missense_Mutation,G
2,SS_00000,1,21842274,21842274,HSPG2,3339,Missense_Mutation,G
3,SS_00000,1,26029228,26029228,EXTL1,2134,Missense_Mutation,T
4,SS_00000,1,35841739,35841739,AGO4,192670,Missense_Mutation,A


## Selezione feature

In [4]:
dset_res=pd.read_csv("clintab_GMQL/som_mut/res_sm.csv", delimiter='\t')
dset_senl=pd.read_csv("clintab_GMQL/som_mut/sl_sm.csv", delimiter='\t')
dset_sens=pd.read_csv("clintab_GMQL/som_mut/ss_sm.csv", delimiter='\t')

In [4]:
dset_sens

Unnamed: 0,patient,chrom,start,stop,gene_symbol,entrez_id,variant_class,ref_allele
0,SS_00000,1,9601338,9601338,TMEM201,199953,Silent,G
1,SS_00000,1,13115682,13115682,HNRNPCL2,440563,Missense_Mutation,G
2,SS_00000,1,21842274,21842274,HSPG2,3339,Missense_Mutation,G
3,SS_00000,1,26029228,26029228,EXTL1,2134,Missense_Mutation,T
4,SS_00000,1,35841739,35841739,AGO4,192670,Missense_Mutation,A
5,SS_00000,1,36093176,36093176,ADPRHL2,54936,Missense_Mutation,C
6,SS_00000,1,52911823,52911823,ECHDC2,55268,Intron,G
7,SS_00000,1,60040032,60040032,C1orf87,127795,Missense_Mutation,C
8,SS_00000,1,113981786,113981786,OLFML3,56944,3'UTR,T
9,SS_00000,1,150471326,150471326,RPRD2,23248,Missense_Mutation,G


In [19]:
#modifica!
sl_RNA=dset_senl[dset_senl['variant_class']=='RNA']
sl_silent=dset_senl[dset_senl['variant_class']=='Silent']
sl_MM=dset_senl[dset_senl['variant_class']=='Missense_Mutation']
sl_intron=dset_senl[dset_senl['variant_class']=='Intron']
sl_FSD=dset_senl[dset_senl['variant_class']=='Frame_Shift_Del']
sl_U=dset_senl[dset_senl['variant_class']=='3\'UTR']

In [20]:
ss_RNA=dset_sens[dset_sens['variant_class']=='RNA']
ss_silent=dset_sens[dset_sens['variant_class']=='Silent']
ss_MM=dset_sens[dset_sens['variant_class']=='Missense_Mutation']
ss_intron=dset_sens[dset_sens['variant_class']=='Intron']
ss_FSD=dset_sens[dset_sens['variant_class']=='Frame_Shift_Del']
ss_FSI=dset_sens[dset_sens['variant_class']=='Frame_Shift_Ins']
ss_U=dset_sens[dset_sens['variant_class']=='3\'UTR']

In [15]:
r_RNA=dset_res[dset_res['variant_class']=='RNA']
r_silent=dset_res[dset_res['variant_class']=='Silent']
r_MM=dset_res[dset_res['variant_class']=='Missense_Mutation']
r_intron=dset_res[dset_res['variant_class']=='Intron']
r_FSD=dset_res[dset_res['variant_class']=='Frame_Shift_Del']
r_FSI=dset_res[dset_res['variant_class']=='Frame_Shift_Ins']
r_U=dset_res[dset_res['variant_class']=='3\'UTR']

In [22]:
sl_U.shape

(295, 8)

## Intersezione 

In [62]:
ge1=pd.read_csv("new_data/gene_exprs/r_sl_bonf_ge.csv", delimiter=',')
ge2=pd.read_csv("new_data/gene_exprs/r_ss_bonf_ge.csv", delimiter=',')
ge3=pd.read_csv("new_data/gene_exprs/sl_ss_bonf_ge.csv", delimiter=',')

In [64]:
r_ge=ge1.append(ge2)
sl_ge=ge1.append(ge3)
ss_ge=ge2.append(ge3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


In [82]:
lista1=set(dset_res['gene_symbol'])
lista2=set(r_ge['gene_symbol'])
lista3=set(dset_sens['gene_symbol'])
lista4=set(ss_ge['gene_symbol'])
lista5=set(dset_senl['gene_symbol'])
lista6=set(sl_ge['gene_symbol'])

In [83]:
x=lista1.intersection(lista2)
y=lista3.intersection(lista4)
z=lista5.intersection(lista6)

In [84]:
print(x, y, z)

{'SPTA1', 'SYNM', 'AARS', 'DDR1', 'HYOU1', 'MYL4', 'HES1', 'MAGI3', 'RPS24', 'HIRA', 'NUGGC', 'ZNF366', 'HSPA5', 'PDE3A', 'APLP2', 'XXbac-BPG246D15.9', 'RPL32', 'TPMT', 'TNFAIP8L3', 'KDR', 'KCTD1', 'TMEM164', 'RPL11', 'EPAS1', 'VCP', 'ADCK2'} {'SPTA1', 'SYNM', 'AARS', 'PDIA4', 'HYOU1', 'DDR1', 'SYT12', 'CBX7', 'BCL2L13', 'MYL4', 'RNF121', 'MRGPRX2', 'HIRA', 'NUGGC', 'ZNF366', 'HSPA5', 'APLP2', 'HS3ST5', 'RPL32', 'FGFBP1', 'TNFAIP8L3', 'OR5T1', 'KDR', 'C4A', 'RGP1', 'TIRAP', 'RPL11', 'LACRT', 'EPAS1', 'EXOSC1', 'ADCK2', 'UHRF1BP1', 'FZD3'} {'FZD5', 'PDE3A', 'HES1', 'PDIA4', 'OR5T1', 'MECOM', 'C4A', 'KCTD11'}


In [85]:
mi1=pd.read_csv("new_data/miRNA/r_sl_bonf_miRNA.csv", delimiter=',')
mi2=pd.read_csv("new_data/miRNA/sl_ss_bonf_miRNA.csv", delimiter=',')

In [87]:
sl_mi=mi1.append(mi2)
sl_mi.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


Unnamed: 0,ensemble_id,gene,gene_symbol,mannwhiteney_pvalue,p_value_corr,resistant_median,sensitive_long_median,sensitive_short_median
0,hsa,hsa-mir-1301,mir-1301,2.2e-05,0.002046,67.507454,126.137998,
0,hsa,hsa-mir-4761,mir-4761,0.000293,0.041252,,0.0,0.0
1,hsa,hsa-mir-1301,mir-1301,0.000303,0.042682,,126.137998,76.205448
2,hsa,hsa-mir-1180,mir-1180,0.000343,0.048305,,225.496917,153.256471


In [88]:
lista7=set(dset_senl['gene_symbol'])
lista8=set(sl_mi['gene'])

In [89]:
w=lista7.intersection(lista8)

In [91]:
met1=pd.read_csv("new_data/methylation/r_sl_bonf_meth.csv", delimiter=',')
met2=pd.read_csv("new_data/methylation/r_ss_bonf_meth.csv", delimiter=',')
met3=pd.read_csv("new_data/methylation/sl_ss_bonf_meth.csv", delimiter=',')

In [94]:
r_met=met1.append(met2)
sl_met=met1.append(met3)
ss_met=met2.append(met3)

ss_met.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


Unnamed: 0,gene,mannwhiteney_pvalue,p_value_corr,resistant_median,sensitive_long_median,sensitive_short_median
0,RP11-886P16.10,0.000221,0.036857,0.032653,,0.025334
0,RILP,6e-06,0.001576,,0.335616,0.289588
1,SKIV2L,9e-06,0.003747,,0.931839,0.894482
2,PLEKHB1,3.3e-05,0.009131,,0.568565,0.274356
3,VPS37B,6.2e-05,0.017429,,0.429182,0.219542


In [96]:
lista_=set(r_met['gene'])
lista_1=set(ss_met['gene'])
lista_2=set(sl_met['gene'])

In [97]:
c=lista1.intersection(lista_)
d=lista3.intersection(lista_1)
e=lista5.intersection(lista_2)

In [98]:
print(c, d, e)

{'SKIV2L', 'AGO1'} {'HSPB2', 'SKIV2L', 'SEMA3G', 'ZNF560'} {'HSPB2', 'PLCE1', 'ALPK3', 'SEMA3G', 'TGM1'}


In [107]:
res=c.intersection(x)
ss=d.intersection(y)
sl=e.intersection(z & w)

In [108]:
print(res, ss, sl)

set() set() set()
