In [3]:
import pandas, numpy, math, seaborn
import scipy, scipy.stats
import matplotlib, matplotlib.pyplot as plt
# Notebook-wide matplotlib defaults: larger fonts and a bigger default canvas
# so that figures look nicer.
matplotlib.rcParams.update({
    'font.size': 20,
    'font.family': 'FreeSans',
    'xtick.labelsize': 20,
    'ytick.labelsize': 10,
    'figure.figsize': (12, 8),
})

In [4]:
# Root folder of the project; the input and output folders are both built from
# it so the two paths cannot drift apart (the input path previously contained a
# stray doubled separator, "PhD ATG7//0 in_silico", that the output path did not).
project_directory = '/Users/kja11/OneDrive - Háskóli Íslands/PhD ATG7/0 in_silico/Python/'

# Folder the raw input tables are read from (trailing slash is required because
# later cells build file paths by plain string concatenation).
input_file_directory = project_directory + '1)data_input/'
# Folder the derived tables are read from / written to (same trailing-slash rule).
output_file_directory = project_directory + '3)output/'

## functions & options


# II] Dataframe settings

## A) tissue_specific samples

In [5]:
%%time
# Read the tab-separated table of pancreas sample IDs; the `_sample_type`
# column is used further down to separate normal tissue from primary tumors.
path = input_file_directory + 'samples_IDs/Organ_specific_samples/'
directory = path + "samples_pancreas.tsv"  # NOTE: despite its name, this is the full file path

pancreas_samples_IDs = pandas.read_csv(
    directory,
    sep="\t",
)

# Quick sanity check: table dimensions, then a preview of the first rows.
print(pancreas_samples_IDs.shape)
pancreas_samples_IDs.head()

(345, 4)
Wall time: 14.9 ms


Unnamed: 0,sample,samples,_sample_type,_primary_site
0,GTEX-111CU-0526-SM-5EGHK,GTEX-111CU-0526-SM-5EGHK,Normal Tissue,Pancreas
1,GTEX-111YS-1226-SM-5EGGJ,GTEX-111YS-1226-SM-5EGGJ,Normal Tissue,Pancreas
2,GTEX-1122O-0726-SM-5GIEV,GTEX-1122O-0726-SM-5GIEV,Normal Tissue,Pancreas
3,GTEX-1128S-0826-SM-5GZZI,GTEX-1128S-0826-SM-5GZZI,Normal Tissue,Pancreas
4,GTEX-117YX-0226-SM-5EGH6,GTEX-117YX-0226-SM-5EGH6,Normal Tissue,Pancreas


In [6]:
# How many samples are there of each sample type?
aaa = pancreas_samples_IDs['_sample_type'].value_counts()
print(aaa)

# Keep the rows of each group of interest: normal tissue and primary tumor.
pancreas_normal_tissue_IDs = pancreas_samples_IDs[
    pancreas_samples_IDs['_sample_type'].eq("Normal Tissue")
]
pancreas_primary_tumor_IDs = pancreas_samples_IDs[
    pancreas_samples_IDs['_sample_type'].eq("Primary Tumor")
]

# Report the size of each group.
print('normal →', pancreas_normal_tissue_IDs.shape[0])
print('primary →', pancreas_primary_tumor_IDs.shape[0])

Primary Tumor    178
Normal Tissue    167
Name: _sample_type, dtype: int64
normal → 167
primary → 178


In [7]:
# Build plain Python lists of sample IDs for each group.
# The lists are derived directly from `pancreas_samples_IDs` instead of from the
# variables being assigned: the previous form (`x = x['sample'].to_list()`)
# turned a DataFrame into a list under the same name, so running the cell a
# second time failed with a TypeError. This form gives the same lists and can
# be re-run any number of times.
pancreas_normal_tissue_IDs = pancreas_samples_IDs.loc[
    pancreas_samples_IDs['_sample_type'] == "Normal Tissue", 'sample'
].to_list()
pancreas_primary_tumor_IDs = pancreas_samples_IDs.loc[
    pancreas_samples_IDs['_sample_type'] == "Primary Tumor", 'sample'
].to_list()

## B) protcoding_expr file ENSEMBL

In [8]:
%%time
# Load the two full protein-coding expression tables (ENSEMBL gene IDs as
# columns, one row per sample). This is the slow step of the notebook.
path = output_file_directory + "dataframes_for_input/"

ensembl_normal_protcoding_expr = pandas.read_csv(
    path + "ensembl_normal_protcoding_expr.tsv",
    sep="\t",
)
print(ensembl_normal_protcoding_expr.shape)

ensembl_primary_protcoding_expr = pandas.read_csv(
    path + "ensembl_primary_protcoding_expr.tsv",
    sep="\t",
)
print(ensembl_primary_protcoding_expr.shape)

# Preview the first two rows of the primary-tumor table.
ensembl_primary_protcoding_expr.head(2)

(7429, 19383)
(9185, 19383)
Wall time: 3min 33s


Unnamed: 0,sample,ENSG00000198888,ENSG00000198763,ENSG00000198804,ENSG00000198712,ENSG00000228253,ENSG00000198899,ENSG00000198938,ENSG00000198840,ENSG00000212907,...,ENSG00000160678,ENSG00000160679,ENSG00000143553,ENSG00000214193,ENSG00000196182,ENSG00000181817,ENSG00000116885,ENSG00000116898,ENSG00000119535,ENSG00000142694
0,TCGA-02-0047-01,12.9202,13.122,14.0136,13.5251,13.8439,13.7777,13.4037,12.8123,13.4187,...,6.4741,5.8017,6.4975,2.1509,5.2445,6.0507,3.8915,5.5389,4.2936,4.6685
1,TCGA-02-0055-01,10.3972,10.7774,12.3945,12.3893,11.728,11.9408,12.2022,11.0142,11.0221,...,5.9969,5.6888,6.0633,1.9822,4.3299,6.4992,3.7582,6.7092,4.5964,5.5145


In [9]:
%%time
# Move the sample label into the index of both full tables so rows can be
# selected by sample ID with .loc.
# The `in df.columns` guard keeps this cell re-runnable: after the first run
# "sample" is the index and no longer a column, so an unguarded set_index would
# raise KeyError and force the slow tables to be reloaded.
dfs = ensembl_normal_protcoding_expr, ensembl_primary_protcoding_expr
for df in dfs:
    if "sample" in df.columns:
        df.set_index("sample", inplace=True)

# Select the pancreas rows of each table (in the order of the ID lists), then
# turn "sample" back into a regular first column for the saved output.
ensembl_normal_pancreas_protcoding_expr = (
    ensembl_normal_protcoding_expr.loc[pancreas_normal_tissue_IDs].reset_index()
)
ensembl_primary_pancreas_protcoding_expr = (
    ensembl_primary_protcoding_expr.loc[pancreas_primary_tumor_IDs].reset_index()
)

# Sanity check: row counts should match the lengths of the two ID lists.
print('normal →', ensembl_normal_pancreas_protcoding_expr.shape)
print('primary →', ensembl_primary_pancreas_protcoding_expr.shape)
ensembl_normal_pancreas_protcoding_expr.head(2)

normal → (167, 19383)
primary → (178, 19383)
Wall time: 132 ms


Unnamed: 0,sample,ENSG00000198888,ENSG00000198763,ENSG00000198804,ENSG00000198712,ENSG00000228253,ENSG00000198899,ENSG00000198938,ENSG00000198840,ENSG00000212907,...,ENSG00000160678,ENSG00000160679,ENSG00000143553,ENSG00000214193,ENSG00000196182,ENSG00000181817,ENSG00000116885,ENSG00000116898,ENSG00000119535,ENSG00000142694
0,GTEX-111CU-0526-SM-5EGHK,12.3812,12.9244,12.4357,13.0035,14.1,13.3847,12.6538,12.7262,12.3425,...,2.3508,4.7138,2.8117,-1.0262,3.9393,3.3661,1.2992,4.554,0.2154,1.6558
1,GTEX-111YS-1226-SM-5EGGJ,12.7144,13.209,12.8179,13.1506,14.0744,13.4626,12.7196,13.0001,12.6243,...,0.547,4.3413,2.5263,-1.2828,3.3717,3.1212,1.6093,4.3758,-1.4305,1.3901


In [11]:
# Write both pancreas subsets as tab-separated files, without the row index,
# into the pancreas sub-folder of the output directory.
path = 'dataframes_for_input/pancreas/'

ensembl_normal_pancreas_protcoding_expr.to_csv(
    output_file_directory + path + 'ensembl_normal_pancreas_protcoding_expr.tsv',
    index=False,
    sep="\t",
)
ensembl_primary_pancreas_protcoding_expr.to_csv(
    output_file_directory + path + 'ensembl_primary_pancreas_protcoding_expr.tsv',
    index=False,
    sep="\t",
)

## C) protcoding_expr file SYMBOL

In [None]:
# #Download all the symbol_normal_protcoding_expr file
# %%time
# symbol_normal_protcoding_expr = pandas.read_csv("symbol_normal_protcoding_expr.tsv", sep = "\t")
# print(symbol_normal_protcoding_expr.shape)

# symbol_primary_protcoding_expr = pandas.read_csv("symbol_primary_protcoding_expr.tsv", sep = "\t")
# print(symbol_primary_protcoding_expr.shape)
# symbol_primary_protcoding_expr.head(2)

In [None]:
# %%time
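# #Do subset
# # TODO: pancreas_symbol_normal_tissue_IDs / pancreas_symbol_primary_tumor_IDs are not
# # defined in the visible cells; define them (or reuse the ID lists above) before enabling this cell.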
# symbol_normal_pancreas_protcoding_expr = symbol_normal_protcoding_expr[pancreas_symbol_normal_tissue_IDs]
# symbol_primary_pancreas_protcoding_expr = symbol_primary_protcoding_expr[pancreas_symbol_primary_tumor_IDs]

# symbol_normal_pancreas_protcoding_expr.reset_index(inplace=True)
# symbol_primary_pancreas_protcoding_expr.reset_index(inplace=True)

# print('normal →', symbol_normal_pancreas_protcoding_expr.shape)
# print('primary →', symbol_primary_pancreas_protcoding_expr.shape)
# symbol_normal_pancreas_protcoding_expr.head(2)

In [None]:
# #save to csv
# symbol_normal_pancreas_protcoding_expr.to_csv('symbol_normal_pancreas_protcoding_expr.tsv', sep = "\t", index=False)
# symbol_primary_pancreas_protcoding_expr.to_csv('symbol_primary_pancreas_protcoding_expr.tsv', sep = "\t", index=False)