In [1]:
import pandas, numpy, math, seaborn
import scipy, scipy.stats
import matplotlib, matplotlib.pyplot as plt
# Notebook-wide plotting defaults: font family and sizes, tick label sizes, figure size
matplotlib.rcParams.update({
    'font.size': 20,
    'font.family': 'FreeSans',
    'xtick.labelsize': 20,
    'ytick.labelsize': 10,
    'figure.figsize': (12, 8),
})

In [2]:
# Root folder of the project. Both data folders are derived from it so the two
# paths cannot drift apart (the input path previously carried a stray "//").
# NOTE(review): this is an absolute, machine-specific path; adjust it here when
# running the notebook on another computer.
project_directory = '/Users/kja11/OneDrive - Háskóli Íslands/PhD ATG7/0 in_silico/Python/'

# Folder the input tables are read from (keeps its trailing "/")
input_file_directory = project_directory + '1)data_input/'
# Folder the generated tables are written to (keeps its trailing "/")
output_file_directory = project_directory + '3)output/'

## functions & options


In [3]:
def split(L):
    """Return the part of ``L`` that precedes its first ".".

    When ``L`` contains no dot it is returned unchanged. The notebook uses this
    to drop the ".N" suffix of identifiers such as "ENSG00000242268.2".
    """
    before_dot, _dot, _after_dot = L.partition(".")
    return before_dot

# II] Dataframe settings

In [4]:
%%time
# Read the tab-separated table of protein-coding genes
path = f"{input_file_directory}protein_coding_list.txt"
protein_coding_list = pandas.read_table(path)
print(protein_coding_list.shape)
protein_coding_list.head()

(22796, 4)
Wall time: 30.9 ms


Unnamed: 0,ensembl_gene_id,hgnc_symbol,entrezgene_id,transcript_biotype
1,ENSG00000198888,MT-ND1,4535.0,protein_coding
2,ENSG00000198763,MT-ND2,4536.0,protein_coding
3,ENSG00000198804,MT-CO1,4512.0,protein_coding
4,ENSG00000198712,MT-CO2,4513.0,protein_coding
5,ENSG00000228253,MT-ATP8,4509.0,protein_coding


## A) tissue_specific samples

In [5]:
%%time
# Read the annotation table of the blood samples (sample ID, primary site, sample type).
# This cell loads only the sample annotation, not the expression values.
path = f"{input_file_directory}/samples_IDs/Organ_specific_samples/"

# NB: despite its name, `directory` holds the full path of the file
directory = f"{path}samples_blood.tsv"
blood_samples_IDs = pandas.read_table(directory)
print(blood_samples_IDs.shape)
blood_samples_IDs.head()

(1419, 4)
Wall time: 5.98 ms


Unnamed: 0,sample,samples,_primary_site,_sample_type
0,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0526-SM-5EGHJ,Blood Vessel,Normal Tissue
1,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0626-SM-5N9CS,Blood Vessel,Normal Tissue
2,GTEX-111FC-0426-SM-5N9CV,GTEX-111FC-0426-SM-5N9CV,Blood Vessel,Normal Tissue
3,GTEX-111YS-2226-SM-5987P,GTEX-111YS-2226-SM-5987P,Blood Vessel,Normal Tissue
4,GTEX-1122O-0426-SM-5H12G,GTEX-1122O-0426-SM-5H12G,Blood Vessel,Normal Tissue


In [6]:
# Split the annotation table into normal-tissue and primary-tumour samples.
#
# The masks are built separately and combined explicitly. In the previous form,
# `site == 'Blood Vessel' | site == 'Blood' & type != 'Cell Line'`, the `&` was
# evaluated before the `|`, so the cell-line filter never applied to
# 'Blood Vessel' samples.
is_blood_site = blood_samples_IDs['_primary_site'].isin(['Blood Vessel', 'Blood'])
is_not_cell_line = blood_samples_IDs['_sample_type'].ne('Cell Line')

# Exact comparison against the two tumour labels. isin() yields False for
# missing values, whereas str.match() yields NaN and breaks boolean indexing.
is_blood_cancer = blood_samples_IDs['_sample_type'].isin([
    'Primary Blood Derived Cancer - Bone Marrow',
    'Primary Blood Derived Cancer - Peripheral Blood',
])

blood_normal_tissue_IDs = blood_samples_IDs[is_blood_site & is_not_cell_line]
blood_primary_tumor_IDs = blood_samples_IDs[is_blood_cancer]

print('normal →', len(blood_normal_tissue_IDs))
print('primary →', len(blood_primary_tumor_IDs))

normal → 943
primary → 476


In [12]:
# Composition of the two groups: primary sites of the normal samples,
# sample types of the tumour samples (separated by one blank line)
print("__Normal blood:\n", blood_normal_tissue_IDs['_primary_site'].value_counts(), end="\n\n")
print("__Primary tumor blood:\n", blood_primary_tumor_IDs['_sample_type'].value_counts())

__Normal blood:
 Blood Vessel    606
Blood           337
Name: _primary_site, dtype: int64

__Primary tumor blood:
 Primary Blood Derived Cancer - Peripheral Blood    239
Primary Blood Derived Cancer - Bone Marrow         237
Name: _sample_type, dtype: int64


In [8]:
# Sample IDs of each group as plain Python lists; they are used further down
# to select the matching columns of the expression table
normal_samples_labels = blood_normal_tissue_IDs['sample'].tolist()
primary_samples_labels = blood_primary_tumor_IDs['sample'].tolist()

In [9]:
%%time
# Read the complete expression matrix (tab-separated), using the `sample` column as
# the row index. This is the slow step of the notebook (about 13 min in the recorded run).
path = f"{input_file_directory}/xenabrowser_brut_data/TcgaTargetGtex_rsem_gene_tpm.tsv"
df_gexpr_all = pandas.read_table(path, index_col='sample')

print(df_gexpr_all.shape)
df_gexpr_all.head(2)

(60498, 19131)
Wall time: 13min 27s


Unnamed: 0_level_0,GTEX-S4Q7-0003-SM-3NM8M,TCGA-19-1787-01,TCGA-S9-A7J2-01,GTEX-QV31-1626-SM-2S1QC,TCGA-G3-A3CH-11,TCGA-B5-A5OE-01,GTEX-13QIC-0011-R1a-SM-5O9CJ,TCGA-B2-5641-11,GTEX-ZPCL-0126-SM-4WWC8,TARGET-20-PANGDN-09,...,TCGA-FI-A2EY-01,TCGA-55-6985-11,TCGA-EJ-5527-01,TCGA-G3-A25X-01,TCGA-24-2254-01,GTEX-11ZTS-3326-SM-5LU9Y,GTEX-VJYA-0726-SM-4KL1T,GTEX-ZA64-2126-SM-5Q5A8,GTEX-Q2AG-2826-SM-2HMJQ,GTEX-XV7Q-0426-SM-4BRVN
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000242268.2,-3.458,-9.9658,0.2998,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,...,-4.035,-2.0529,-9.9658,-9.9658,-1.9379,1.5165,-9.9658,-2.3884,0.044,-3.3076
ENSG00000259041.1,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,...,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658


In [13]:
%%time
# Strip the version suffix from the gene IDs (e.g. "ENSG00000242268.2" ->
# "ENSG00000242268", using split() defined at the top of the notebook) and keep
# only the genes present in the protein-coding list.
#
# The reshaping is guarded so the cell can be re-run safely: without the guard a
# second run would call reset_index() again (adding a stray 'index' column) and
# then fail because the 'sample' column had already been renamed.
if 'ensembl_gene_id' not in df_gexpr_all.columns:
    # gene IDs are still the row index (named 'sample' when the file was loaded)
    df_gexpr_all = df_gexpr_all.reset_index()
    # in place on the frame created just above, so the large matrix is not copied again
    df_gexpr_all.rename(columns={'sample': 'ensembl_gene_id'}, inplace=True)

# split() leaves IDs without a "." unchanged, so repeating this step is harmless
df_gexpr_all['ensembl_gene_id'] = df_gexpr_all['ensembl_gene_id'].apply(split)

# check for duplicated gene IDs once the version suffix is gone
print('number of duplicated genes:', df_gexpr_all.duplicated('ensembl_gene_id').sum())

# Inner join on the gene ID: only genes found in both tables are kept.
# Naming the key explicitly prevents any other shared column from joining the key.
dfmerged = protein_coding_list.merge(df_gexpr_all, on='ensembl_gene_id', how='inner')
print('number of duplicated genes after merge with codlist:', dfmerged.duplicated('ensembl_gene_id').sum())
print(dfmerged.shape)
dfmerged.head(2)

number of duplicated genes: 0
number of duplicated genes after merge with codlist: 154
(19536, 19136)
Wall time: 1min 36s


Unnamed: 0,ensembl_gene_id,hgnc_symbol,entrezgene_id,transcript_biotype,index,GTEX-S4Q7-0003-SM-3NM8M,TCGA-19-1787-01,TCGA-S9-A7J2-01,GTEX-QV31-1626-SM-2S1QC,TCGA-G3-A3CH-11,...,TCGA-FI-A2EY-01,TCGA-55-6985-11,TCGA-EJ-5527-01,TCGA-G3-A25X-01,TCGA-24-2254-01,GTEX-11ZTS-3326-SM-5LU9Y,GTEX-VJYA-0726-SM-4KL1T,GTEX-ZA64-2126-SM-5Q5A8,GTEX-Q2AG-2826-SM-2HMJQ,GTEX-XV7Q-0426-SM-4BRVN
0,ENSG00000198888,MT-ND1,4535.0,protein_coding,438,11.939,13.1286,12.9213,12.3347,13.3366,...,13.9398,12.9477,11.8238,9.8547,13.2571,13.0066,12.2818,13.3569,14.3486,11.6654
1,ENSG00000198763,MT-ND2,4536.0,protein_coding,22572,11.7935,13.0793,12.7126,12.9118,13.4186,...,14.3906,13.0265,11.7369,9.3562,13.0466,13.5249,12.6554,13.9404,14.7075,12.8655


In [14]:
# Sanity check of the merge: the value of one gene in one sample is printed first
# from the table before merging, then from the merged table; both must be identical
control_gene, control_sample = 'ENSG00000198712', 'GTEX-QV31-1626-SM-2S1QC'

for table in (df_gexpr_all, dfmerged):
    print(table.loc[table['ensembl_gene_id'] == control_gene, control_sample])

28551    13.4472
Name: GTEX-QV31-1626-SM-2S1QC, dtype: float64
3    13.4472
Name: GTEX-QV31-1626-SM-2S1QC, dtype: float64


In [15]:
%%time
# Build one table per group (normal tissue / primary tumour):
# the four gene annotation columns followed by the expression columns of that group
df_part1 = dfmerged.iloc[:, :4]

# everything after the annotation columns; the sample columns are then picked by label
expression_columns = dfmerged.iloc[:, 4:]

normal_protcoding = pandas.concat(
    [df_part1, expression_columns[normal_samples_labels]], axis=1
)
primary_protcoding = pandas.concat(
    [df_part1, expression_columns[primary_samples_labels]], axis=1
)

print(normal_protcoding.shape)
print(primary_protcoding.shape)
normal_protcoding.head(2)

(19536, 947)
(19536, 480)
Wall time: 1.77 s


Unnamed: 0,ensembl_gene_id,hgnc_symbol,entrezgene_id,transcript_biotype,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-111FC-0426-SM-5N9CV,GTEX-111YS-2226-SM-5987P,GTEX-1122O-0426-SM-5H12G,GTEX-1122O-1126-SM-5NQ8X,...,GTEX-ZV68-0006-SM-4YCEJ,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
0,ENSG00000198888,MT-ND1,4535.0,protein_coding,12.8924,12.6263,11.1811,12.4851,12.8337,12.3666,...,9.2509,7.442,10.0559,9.6723,11.0944,10.3162,11.743,10.3978,9.7706,10.6086
1,ENSG00000198763,MT-ND2,4536.0,protein_coding,12.6612,13.0833,11.3615,12.9074,13.1778,12.8186,...,9.8035,7.762,9.9784,9.7626,10.8643,10.3212,11.1922,10.9157,9.5933,10.8215


In [16]:
# Spot check after subsetting and concatenating: the same cell (row label 12, one
# sample column) must hold the same value in the merged table and in the normal table
check_row, check_sample = 12, 'GTEX-ZV68-0006-SM-4YCEJ'
print(dfmerged.at[check_row, check_sample])
print(normal_protcoding.at[check_row, check_sample])

9.4965
9.4965


In [17]:
# Quality report for both tables: number of missing values and of duplicated
# entries in the ENSEMBL ID column and in the gene symbol column
report_tables = (("Normal tissue", normal_protcoding), ("Primary Tumor", primary_protcoding))
report_columns = (("is nan in ensembl:", 'ensembl_gene_id'), ("is nan in symbol:", 'hgnc_symbol'))

for position, (title, table) in enumerate(report_tables):
    if position > 0:
        print()  # blank line between the two reports
    print(title)
    for nan_label, column in report_columns:
        print(nan_label, table[column].isna().sum())
        print(' number of duplicate:', table.duplicated(column).sum())

Normal tissue
is nan in ensembl: 0
 number of duplicate: 154
is nan in symbol: 193
 number of duplicate: 347

Primary Tumor
is nan in ensembl: 0
 number of duplicate: 154
is nan in symbol: 193
 number of duplicate: 347


## B) protcoding_expr file ENSEMBL

In [20]:
def _print_duplicate_report(label, table):
    """Print the shape of ``table`` and its number of duplicated ENSEMBL IDs, then a blank line."""
    print(label, table.shape)
    print('number of duplicate:', table.duplicated('ensembl_gene_id').sum())
    print()


# the report describes the tables as they are BEFORE the duplicates are removed below
_print_duplicate_report('normal shape:', normal_protcoding)
_print_duplicate_report('primary shape is:', primary_protcoding)

# keep only the first row of every ENSEMBL gene ID
normal_protcoding = normal_protcoding.drop_duplicates(subset="ensembl_gene_id")
primary_protcoding = primary_protcoding.drop_duplicates(subset="ensembl_gene_id")

normal shape: (19382, 947)
number of duplicate: 0

primary shape is: (19382, 480)
number of duplicate: 0



In [21]:
%%time
# Reshape both tables to "one row per sample, one column per gene",
# with the ENSEMBL gene IDs as column names
data = [normal_protcoding, primary_protcoding]
protcoding_final = []

for genes_in_rows in data:
    samples_in_rows = genes_in_rows.T

    # after transposing, row 0 holds the ENSEMBL IDs: use them as plain, unnamed header
    samples_in_rows.columns = samples_in_rows.iloc[0].to_list()

    samples_in_rows = (
        samples_in_rows
        .reset_index()                          # former column labels -> 'index' column
        .iloc[4:]                               # skip the four gene annotation rows
        .rename(columns={'index': 'sample'})    # name the ID column 'sample'
    )
    protcoding_final.append(samples_in_rows)

ensembl_normal_protcoding_expr, ensembl_primary_protcoding_expr = protcoding_final

print('normal shape =', ensembl_normal_protcoding_expr.shape)
print('primary shape =', ensembl_primary_protcoding_expr.shape)
ensembl_primary_protcoding_expr.head()

normal shape = (943, 19383)
primary shape = (476, 19383)
Wall time: 5.98 s


Unnamed: 0,sample,ENSG00000198888,ENSG00000198763,ENSG00000198804,ENSG00000198712,ENSG00000228253,ENSG00000198899,ENSG00000198938,ENSG00000198840,ENSG00000212907,...,ENSG00000160678,ENSG00000160679,ENSG00000143553,ENSG00000214193,ENSG00000196182,ENSG00000181817,ENSG00000116885,ENSG00000116898,ENSG00000119535,ENSG00000142694
4,TARGET-10-PAMXHJ-09,13.4466,14.1354,14.8637,14.67,15.2483,14.1204,14.5576,11.628,13.5521,...,5.4357,5.2028,5.1461,-2.2447,4.2706,4.0799,0.346,5.2631,6.2163,1.6093
5,TARGET-10-PAMXSP-09,14.3048,14.88,14.6638,14.581,15.6861,14.5357,15.2548,12.5331,13.513,...,7.3205,4.8885,2.6783,-1.3548,4.2699,2.2901,0.058,5.1306,7.0387,2.6464
6,TARGET-10-PANEUH-09,13.6398,13.5026,15.1044,14.5125,15.4857,14.6477,14.9213,11.7749,13.6412,...,4.2563,6.3537,2.77,-2.1779,4.2017,4.7453,0.3907,4.846,6.0204,1.7786
7,TARGET-10-PANJPG-09,13.5907,13.4952,13.8432,12.9796,14.3668,13.7255,14.1158,11.6554,12.8893,...,5.3625,4.9117,4.2142,1.0363,4.4033,3.0305,-1.3548,5.1069,4.4169,3.8411
8,TARGET-10-PANJWJ-09,14.8504,14.7786,14.9714,14.9369,16.163,15.1122,15.5269,13.7731,14.5301,...,2.4225,4.9369,3.2034,-2.114,4.0567,2.872,-0.8084,5.1579,6.3799,2.6232


In [None]:
# Write both ENSEMBL-keyed tables as tab-separated files, without the row index
path = 'dataframes_for_input/blood/'
ensembl_tables = (
    ('ensembl_normal_blood_protcoding_expr.tsv', ensembl_normal_protcoding_expr),
    ('ensembl_primary_blood_protcoding_expr.tsv', ensembl_primary_protcoding_expr),
)
for file_name, table in ensembl_tables:
    table.to_csv(output_file_directory + path + file_name, sep="\t", index=False)

## C) protcoding_expr file SYMBOL

In [22]:
%%time
# Reshape both tables to "one row per sample, one column per gene",
# this time with the HGNC gene symbols as column names
data = [normal_protcoding, primary_protcoding]
protcoding_final = []

for genes_in_rows in data:
    # genes without a symbol cannot be used as a column name: drop them first
    samples_in_rows = genes_in_rows.dropna(subset=['hgnc_symbol']).T

    # after transposing, row 1 holds the gene symbols: use them as plain, unnamed header
    # NOTE(review): only ENSEMBL IDs were de-duplicated earlier, so symbols are not
    # guaranteed to be unique here; confirm the column names are unique
    samples_in_rows.columns = samples_in_rows.iloc[1].to_list()

    samples_in_rows = (
        samples_in_rows
        .reset_index()                          # former column labels -> 'index' column
        .iloc[4:]                               # skip the four gene annotation rows
        .rename(columns={'index': 'sample'})    # name the ID column 'sample'
    )
    protcoding_final.append(samples_in_rows)

symbol_normal_protcoding, symbol_primary_protcoding = protcoding_final

print('normal shape =', symbol_normal_protcoding.shape)
print('primary shape =', symbol_primary_protcoding.shape)
symbol_normal_protcoding.head()

normal shape = (943, 19190)
primary shape = (476, 19190)
Wall time: 5.93 s


Unnamed: 0,sample,MT-ND1,MT-ND2,MT-CO1,MT-CO2,MT-ATP8,MT-ATP6,MT-CO3,MT-ND3,MT-ND4L,...,S100A1,CHTOP,SNAPIN,SH3D21,STK40,LSM10,OSCP1,MRPS15,CSF3R,EVA1B
4,GTEX-1117F-0526-SM-5EGHJ,12.8924,12.6612,12.5008,12.8456,13.9252,13.3517,13.4492,11.6795,12.2196,...,0.5568,5.1692,4.701,2.8522,5.6065,4.9801,2.1411,4.9824,2.4675,5.55
5,GTEX-1117F-0626-SM-5N9CS,12.6263,13.0833,11.3334,12.6928,14.1037,13.4508,13.2862,12.9488,12.1293,...,2.7292,5.9507,5.6177,1.4441,5.1392,5.2791,2.5707,5.7211,4.178,6.3669
6,GTEX-111FC-0426-SM-5N9CV,11.1811,11.3615,13.1335,12.6413,13.1085,12.5584,13.017,13.2811,11.2169,...,2.6487,5.288,5.5255,1.4859,5.5916,5.4595,2.5011,5.7334,1.5998,5.9912
7,GTEX-111YS-2226-SM-5987P,12.4851,12.9074,13.3488,13.4951,14.3929,13.7707,13.49,13.2595,11.9275,...,1.4911,5.6177,5.478,0.688,5.3899,4.6747,2.296,5.76,3.0463,6.1904
8,GTEX-1122O-0426-SM-5H12G,12.8337,13.1778,13.6406,13.9173,14.4532,13.8407,13.7612,13.7707,11.4993,...,1.8484,5.3431,5.6241,0.3115,4.9946,5.1575,2.5804,5.7198,2.3077,5.0023


In [None]:
# Write both symbol-keyed tables as tab-separated files, without the row index
path = 'dataframes_for_input/blood/'
symbol_tables = (
    ('symbol_normal_blood_protcoding_expr.tsv', symbol_normal_protcoding),
    ('symbol_primary_blood_protcoding_expr.tsv', symbol_primary_protcoding),
)
for file_name, table in symbol_tables:
    table.to_csv(output_file_directory + path + file_name, sep="\t", index=False)