In [40]:
import pandas, numpy, math, seaborn
import scipy, scipy.stats
import matplotlib, matplotlib.pyplot as plt
# Notebook-wide matplotlib defaults: larger base font, a fixed font family,
# tick label sizes and the default figure size (width, height in inches).
# NOTE(review): y tick labels (10) are half the size of x tick labels (20) — confirm this is intended.
figure_style = {
    'font.size': 20,
    'font.family': 'FreeSans',
    'xtick.labelsize': 20,
    'ytick.labelsize': 10,
    'figure.figsize': (12, 8),
}
matplotlib.rcParams.update(figure_style)

In [41]:
# Both data locations live under the same project folder.
project_root = '/Users/kja11/OneDrive - Háskóli Íslands/PhD ATG7/'

# The leading '/' below reproduces the doubled slash ('ATG7//0 in_silico') that the
# input path has always contained, so the resulting string is unchanged.
input_file_directory = project_root + '/0 in_silico/Python/1)data_input/'
output_file_directory = project_root + '0 in_silico/Python/3)output/'

## functions & options


# II] Dataframe settings

## A) tissue_specific samples

In [45]:
%%time
# DL liver samples IDs for normal and primary tumors
path = input_file_directory +'samples_IDs/Organ_specific_samples/'
directory = path + "samples_liver.tsv"
liver_samples_IDs= pandas.read_csv(directory, sep = "\t")
print(liver_samples_IDs.shape)
liver_samples_IDs.head()

(479, 4)
Wall time: 3.99 ms


Unnamed: 0,sample,samples,_sample_type,_primary_site
0,GTEX-1192X-1026-SM-5H12P,GTEX-1192X-1026-SM-5H12P,Normal Tissue,Liver
1,GTEX-11DXY-0526-SM-5EGGQ,GTEX-11DXY-0526-SM-5EGGQ,Normal Tissue,Liver
2,GTEX-11DXZ-0126-SM-5EGGY,GTEX-11DXZ-0126-SM-5EGGY,Normal Tissue,Liver
3,GTEX-11EQ9-0526-SM-5A5JZ,GTEX-11EQ9-0526-SM-5A5JZ,Normal Tissue,Liver
4,GTEX-11NUK-1226-SM-5P9GM,GTEX-11NUK-1226-SM-5P9GM,Normal Tissue,Liver


In [13]:
# Count how many liver samples carry each `_sample_type` label
sample_type_counts = liver_samples_IDs['_sample_type'].value_counts()
print(sample_type_counts)

# Row masks for the two sample types of interest
is_normal_tissue = liver_samples_IDs['_sample_type'] == "Normal Tissue"
is_primary_tumor = liver_samples_IDs['_sample_type'] == "Primary Tumor"

# Keep the full rows for each group (all columns retained at this stage)
liver_normal_tissue_IDs = liver_samples_IDs.loc[is_normal_tissue]
liver_primary_tumor_IDs = liver_samples_IDs.loc[is_primary_tumor]

# Print the size of each group
for label, group in (('normal →', liver_normal_tissue_IDs),
                     ('primary →', liver_primary_tumor_IDs)):
    print(label, len(group))

Primary Tumor    369
Normal Tissue    110
Name: _sample_type, dtype: int64
normal → 110
primary → 369


In [14]:
# Build plain Python lists of sample IDs for each sample type.
# The lists are derived directly from `liver_samples_IDs` rather than from the
# current value of `liver_normal_tissue_IDs` / `liver_primary_tumor_IDs`: those
# names are rebound to lists here, so indexing them with ['sample'] on a second
# run of this cell would raise TypeError. Reading from the source table keeps
# the cell re-runnable and yields the same lists.
liver_normal_tissue_IDs = liver_samples_IDs.loc[
    liver_samples_IDs['_sample_type'] == "Normal Tissue", 'sample'
].to_list()
liver_primary_tumor_IDs = liver_samples_IDs.loc[
    liver_samples_IDs['_sample_type'] == "Primary Tumor", 'sample'
].to_list()

## B) Protein-coding expression files (ENSEMBL gene IDs)

In [25]:
%%time
#Download all the ensembl_normal_protcoding_expr file

path = output_file_directory + "dataframes_for_input/"
ensembl_normal_protcoding_expr = pandas.read_csv(path+"ensembl_normal_protcoding_expr.tsv", sep = "\t")
print(ensembl_normal_protcoding_expr.shape)

ensembl_primary_protcoding_expr = pandas.read_csv(path+"ensembl_primary_protcoding_expr.tsv", sep = "\t")
print(ensembl_primary_protcoding_expr.shape)
ensembl_primary_protcoding_expr.head(2)

(7429, 19383)
(9185, 19383)
Wall time: 3min 41s


Unnamed: 0,sample,ENSG00000198888,ENSG00000198763,ENSG00000198804,ENSG00000198712,ENSG00000228253,ENSG00000198899,ENSG00000198938,ENSG00000198840,ENSG00000212907,...,ENSG00000160678,ENSG00000160679,ENSG00000143553,ENSG00000214193,ENSG00000196182,ENSG00000181817,ENSG00000116885,ENSG00000116898,ENSG00000119535,ENSG00000142694
0,TCGA-02-0047-01,12.9202,13.122,14.0136,13.5251,13.8439,13.7777,13.4037,12.8123,13.4187,...,6.4741,5.8017,6.4975,2.1509,5.2445,6.0507,3.8915,5.5389,4.2936,4.6685
1,TCGA-02-0055-01,10.3972,10.7774,12.3945,12.3893,11.728,11.9408,12.2022,11.0142,11.0221,...,5.9969,5.6888,6.0633,1.9822,4.3299,6.4992,3.7582,6.7092,4.5964,5.5145


In [35]:
%%time
#put sample label to index to use .loc
dfs = ensembl_normal_protcoding_expr, ensembl_primary_protcoding_expr
for df in dfs:
    df.set_index("sample", inplace = True)
    
#Do subset
ensembl_normal_liver_protcoding_expr = ensembl_normal_protcoding_expr.loc[liver_normal_tissue_IDs]
ensembl_primary_liver_protcoding_expr = ensembl_primary_protcoding_expr.loc[liver_primary_tumor_IDs]

ensembl_normal_liver_protcoding_expr.reset_index(inplace=True)
ensembl_primary_liver_protcoding_expr.reset_index(inplace=True)

print('normal →', ensembl_normal_liver_protcoding_expr.shape)
print('primary →', ensembl_primary_liver_protcoding_expr.shape)
ensembl_normal_liver_protcoding_expr.head(2)

normal → (110, 19383)
primary → (369, 19383)
Wall time: 146 ms


Unnamed: 0,sample,ENSG00000198888,ENSG00000198763,ENSG00000198804,ENSG00000198712,ENSG00000228253,ENSG00000198899,ENSG00000198938,ENSG00000198840,ENSG00000212907,...,ENSG00000160678,ENSG00000160679,ENSG00000143553,ENSG00000214193,ENSG00000196182,ENSG00000181817,ENSG00000116885,ENSG00000116898,ENSG00000119535,ENSG00000142694
0,GTEX-1192X-1026-SM-5H12P,8.9743,11.2312,10.6905,12.2497,12.0025,11.8226,11.4992,10.9613,10.4287,...,1.7229,5.9892,5.4738,3.055,4.8929,4.9031,3.2766,6.3815,3.3278,6.2589
1,GTEX-11DXY-0526-SM-5EGGQ,14.0574,14.9857,15.451,15.6933,15.7066,15.3651,15.2979,14.9319,15.1432,...,1.674,4.2033,3.7302,-1.685,3.6939,3.9892,-0.5332,5.4047,1.1117,1.2934


In [39]:
# Write both liver subsets as tab-separated files, leaving out the row index
path = 'dataframes_for_input/liver/'
liver_output_directory = output_file_directory + path

# Output file name -> table to write (written in this order)
liver_tables = {
    'ensembl_normal_liver_protcoding_expr.tsv': ensembl_normal_liver_protcoding_expr,
    'ensembl_primary_liver_protcoding_expr.tsv': ensembl_primary_liver_protcoding_expr,
}
for file_name, table in liver_tables.items():
    table.to_csv(liver_output_directory + file_name, sep = "\t", index=False)

## C) Protein-coding expression files (gene symbols)

In [None]:
# #Load the symbol_normal_protcoding_expr and symbol_primary_protcoding_expr files
# %%time
# symbol_normal_protcoding_expr = pandas.read_csv("symbol_normal_protcoding_expr.tsv", sep = "\t")
# print(symbol_normal_protcoding_expr.shape)

# symbol_primary_protcoding_expr = pandas.read_csv("symbol_primary_protcoding_expr.tsv", sep = "\t")
# print(symbol_primary_protcoding_expr.shape)
# symbol_primary_protcoding_expr.head(2)

In [None]:
# %%time
# #Do subset
# symbol_normal_liver_protcoding_expr = symbol_normal_protcoding_expr[liver_symbol_normal_tissue_IDs]
# symbol_primary_liver_protcoding_expr = symbol_primary_protcoding_expr[liver_symbol_primary_tumor_IDs]

# symbol_normal_liver_protcoding_expr.reset_index(inplace=True)
# symbol_primary_liver_protcoding_expr.reset_index(inplace=True)

# print('normal →', symbol_normal_liver_protcoding_expr.shape)
# print('primary →', symbol_primary_liver_protcoding_expr.shape)
# symbol_normal_liver_protcoding_expr.head(2)

In [None]:
# #save to csv
# symbol_normal_liver_protcoding_expr.to_csv('symbol_normal_liver_protcoding_expr.tsv', sep = "\t", index=False)
# symbol_primary_liver_protcoding_expr.to_csv('symbol_primary_liver_protcoding_expr.tsv', sep = "\t", index=False)