In [None]:
import pandas, numpy, math, seaborn
import scipy, scipy.stats
import matplotlib, matplotlib.pyplot as plt
# Notebook-wide matplotlib defaults (font, tick label sizes, figure size)
# so that every figure produced below shares the same look.
figure_defaults = {
    'font.size': 20,
    'font.family': 'FreeSans',
    'xtick.labelsize': 20,
    'ytick.labelsize': 10,
    'figure.figsize': (12, 8),
}
matplotlib.rcParams.update(figure_defaults)

In [None]:
# Absolute locations of the input data and of the generated outputs. Both strings
# end with "/" so that relative file names can be appended with "+".
# NOTE(review): the input path contains a doubled "/" ("PhD ATG7//0 in_silico")
# that the output path does not have -- presumably a typo; confirm before changing.
input_file_directory = '/Users/kja11/OneDrive - Háskóli Íslands/PhD ATG7//0 in_silico/Python/1)data_input/'
output_file_directory = '/Users/kja11/OneDrive - Háskóli Íslands/PhD ATG7/0 in_silico/Python/3)output/'

## functions & options


In [None]:
def split(L):
    """Return the part of ``L`` that precedes its first ".".

    When ``L`` contains no "." it is returned unchanged. Used further down
    to strip the ".<version>" suffix from versioned ENSEMBL gene ids.
    """
    before_dot, _dot, _after_dot = L.partition(".")
    return before_dot

# II] Dataframe settings

In [None]:
%%time
# Read the tab-separated protein-coding gene list, then show its size and first rows
path = f"{input_file_directory}protein_coding_list.txt"
protein_coding_list = pandas.read_csv(path, sep="\t")

print(protein_coding_list.shape)
protein_coding_list.head()

## A) tissue_specific samples

In [None]:
%%time
# Read the tab-separated annotation table of the blood samples
# (samples_blood.tsv); it is used below to pick the normal-tissue and
# primary-tumor sample IDs.
path = f"{input_file_directory}/samples_IDs/Organ_specific_samples/"
directory = f"{path}samples_blood.tsv"

blood_samples_IDs = pandas.read_csv(directory, sep="\t")

print(blood_samples_IDs.shape)
blood_samples_IDs.head()

In [None]:
# Split the blood sample annotations into "normal tissue" and "primary tumor" groups.
is_blood_vessel = blood_samples_IDs['_primary_site'] == 'Blood Vessel'
is_blood = blood_samples_IDs['_primary_site'] == 'Blood'
is_not_cell_line = blood_samples_IDs['_sample_type'] != 'Cell Line'

# `&` binds tighter than `|` in Python, so the original un-parenthesised filter
# already meant "Blood Vessel, OR (Blood AND not a cell line)". The grouping is
# now written out explicitly; the selected rows are unchanged.
# NOTE(review): 'Blood Vessel' rows are therefore NOT filtered on '_sample_type'.
# If cell lines must be excluded from both sites, use
# `(is_blood_vessel | is_blood) & is_not_cell_line` instead -- confirm the intent.
blood_normal_tissue_IDs = blood_samples_IDs[is_blood_vessel | (is_blood & is_not_cell_line)]

# na=False: a missing '_sample_type' counts as "no match" instead of putting NaN
# into the boolean mask (which makes the row selection raise).
is_primary_blood_tumor = blood_samples_IDs['_sample_type'].str.match(
    '(Primary Blood Derived Cancer - Bone Marrow)|(Primary Blood Derived Cancer - Peripheral Blood)',
    na=False,
)
blood_primary_tumor_IDs = blood_samples_IDs[is_primary_blood_tumor]

print('normal →', len(blood_normal_tissue_IDs))
print('primary →', len(blood_primary_tumor_IDs))

In [None]:
# Break each group down: normal samples per primary site, tumor samples per sample type
normal_site_counts = blood_normal_tissue_IDs['_primary_site'].value_counts()
tumor_type_counts = blood_primary_tumor_IDs['_sample_type'].value_counts()

print("__Normal blood:\n", normal_site_counts)
print()
print("__Primary tumor blood:\n", tumor_type_counts)

In [None]:
# Plain Python lists of the sample IDs in each group; used later to select
# the matching columns of the expression table.
normal_samples_labels = blood_normal_tissue_IDs['sample'].tolist()
primary_samples_labels = blood_primary_tumor_IDs['sample'].tolist()

In [None]:
%%time
# Load the big expression table from disk (tab-separated); the 'sample'
# column of the file becomes the row index.
path = f"{input_file_directory}/xenabrowser_brut_data/TcgaTargetGtex_rsem_gene_tpm.tsv"
df_gexpr_all = pandas.read_csv(path, index_col='sample', sep="\t")

print(df_gexpr_all.shape)
df_gexpr_all.head(2)

In [None]:
%%time
# Strip the ".<version>" suffix from the ENSEMBL ids (split() is defined at the
# beginning of the notebook) and keep only the genes listed in protein_coding_list.

# Move the gene ids out of the index into a column named 'ensembl_gene_id'.
# The rename now happens BEFORE the duplicate check: the check used to look up
# 'ensembl_gene_id' while the column was still called 'sample', which raises
# KeyError on a fresh kernel. The guard makes the cell safe to re-run.
if 'ensembl_gene_id' not in df_gexpr_all.columns:
    df_gexpr_all = df_gexpr_all.reset_index().rename(columns={'sample': 'ensembl_gene_id'})
df_gexpr_all['ensembl_gene_id'] = df_gexpr_all['ensembl_gene_id'].apply(split)

# see if have duplicate
print('number of duplicated genes:', df_gexpr_all.duplicated('ensembl_gene_id').sum())
# A merge returns one row per matching (left, right) pair, so an id repeated in
# either table is repeated in the result; report the annotation table as well.
print('number of duplicated genes in protein_coding_list:',
      protein_coding_list.duplicated('ensembl_gene_id').sum())

# Merge with prot coding list (inner join, key stated explicitly)
dfmerged = protein_coding_list.merge(df_gexpr_all, on='ensembl_gene_id')
print('number of duplicated genes after merge with codlist:', dfmerged.duplicated('ensembl_gene_id').sum())
print(dfmerged.shape)
dfmerged.head(2)

# Why do I have duplicates after the merge?

In [None]:
# Control of the merge: the value of one gene in one sample is printed from the
# table before merging and then from the merged table; both have to be the same.
control_gene = 'ENSG00000198712'
control_sample = 'GTEX-QV31-1626-SM-2S1QC'

for table in (df_gexpr_all, dfmerged):
    print(table.loc[table['ensembl_gene_id'] == control_gene, control_sample])

In [None]:
%%time
# Build one table per group: the four leading gene annotation columns followed
# by the columns of the samples belonging to that group.
df_part1 = dfmerged.iloc[:, :4]          # gene annotation columns
sample_columns = dfmerged.iloc[:, 4:]    # one column per sample

normal_protcoding = pandas.concat(
    [df_part1, sample_columns[normal_samples_labels]], axis=1)
primary_protcoding = pandas.concat(
    [df_part1, sample_columns[primary_samples_labels]], axis=1)

print(normal_protcoding.shape, primary_protcoding.shape, sep="\n")
normal_protcoding.head(2)

In [None]:
# Spot check after subset + concat: the same cell (row label 12, one sample)
# is printed from the merged table and from the normal table.
check_sample, check_row = 'GTEX-ZV68-0006-SM-4YCEJ', 12
for table in (dfmerged, normal_protcoding):
    print(table.loc[check_row, check_sample])

In [None]:
# For each group, report missing values and duplicates in both identifier columns
reports = (("Normal tissue", normal_protcoding), ("Primary Tumor", primary_protcoding))

for position, (title, table) in enumerate(reports):
    if position:
        print()  # blank line between the two reports
    print(title)
    print("is nan in ensembl:", table['ensembl_gene_id'].isna().sum())
    print(' number of duplicate:', table.duplicated('ensembl_gene_id').sum())
    print("is nan in symbol:", table['hgnc_symbol'].isna().sum())
    print(' number of duplicate:', table.duplicated('hgnc_symbol').sum())

## B) protcoding_expr file ENSEMBL

In [None]:
# Show the size and the number of duplicated ENSEMBL ids of each table ...
for label, table in (('normal shape:', normal_protcoding),
                     ('primary shape is:', primary_protcoding)):
    print(label, table.shape)
    print('number of duplicate:', table.duplicated('ensembl_gene_id').sum())
    print()

# ... then keep only the first row of every ENSEMBL id
normal_protcoding = normal_protcoding.drop_duplicates(subset="ensembl_gene_id")
primary_protcoding = primary_protcoding.drop_duplicates(subset="ensembl_gene_id")

In [None]:
%%time
# Preparation of the global dataframe: one row per sample, one column per
# ENSEMBL gene id, with the sample label in a leading 'sample' column.
protcoding_final = []
data = [normal_protcoding, primary_protcoding]

for df in data:
    # The first four columns are gene annotation columns; all remaining
    # columns hold the values of one sample each.
    other_annotation = [col for col in df.columns[:4] if col != 'ensembl_gene_id']

    # Select the header by NAME rather than by row position after transposing,
    # so the result does not depend on the column order of the annotation file.
    # Dropping the other annotation columns before the transpose also keeps the
    # annotation strings from forcing every transposed column to object dtype.
    df = df.set_index('ensembl_gene_id').drop(columns=other_annotation).transpose()

    # Delete the name of the column index, then turn the sample labels into a
    # regular column called 'sample'.
    df.columns.name = None
    df = df.reset_index().rename(columns={'index': 'sample'})
    # Note: the row index now starts at 0 (it used to start at 4, a leftover of
    # dropping the four annotation rows); saved files are unaffected.

    protcoding_final.append(df)

ensembl_normal_protcoding_expr = protcoding_final[0]
ensembl_primary_protcoding_expr = protcoding_final[1]

print('normal shape =', ensembl_normal_protcoding_expr.shape)
print('primary shape =', ensembl_primary_protcoding_expr.shape)
ensembl_primary_protcoding_expr.head()

In [None]:
# Write both ENSEMBL-indexed tables as tab-separated files, without the row index
path = 'dataframes_for_input/blood/'
ensembl_outputs = {
    'ensembl_normal_blood_protcoding_expr.tsv': ensembl_normal_protcoding_expr,
    'ensembl_primary_blood_protcoding_expr.tsv': ensembl_primary_protcoding_expr,
}
for file_name, table in ensembl_outputs.items():
    table.to_csv(output_file_directory + path + file_name, sep="\t", index=False)

## C) protcoding_expr file SYMBOL

In [None]:
%%time
# Preparation of the global dataframe: one row per sample, one column per
# HGNC gene symbol, with the sample label in a leading 'sample' column.
protcoding_final = []
data = [normal_protcoding, primary_protcoding]

for df in data:
    # Genes without a symbol cannot be used as a column header
    df = df.dropna(subset=['hgnc_symbol'])

    # The first four columns are gene annotation columns; all remaining
    # columns hold the values of one sample each.
    other_annotation = [col for col in df.columns[:4] if col != 'hgnc_symbol']

    # Select the header by NAME (SYMBOL) rather than by row position after
    # transposing, so the result does not depend on the column order of the
    # annotation file. Dropping the other annotation columns before the
    # transpose also keeps the annotation strings from forcing every transposed
    # column to object dtype.
    df = df.set_index('hgnc_symbol').drop(columns=other_annotation).transpose()

    # Delete the name of the column index, then turn the sample labels into a
    # regular column called 'sample'.
    df.columns.name = None
    df = df.reset_index().rename(columns={'index': 'sample'})
    # Note: the row index now starts at 0 (it used to start at 4, a leftover of
    # dropping the four annotation rows); saved files are unaffected.

    protcoding_final.append(df)

symbol_normal_protcoding = protcoding_final[0]
symbol_primary_protcoding = protcoding_final[1]

print('normal shape =', symbol_normal_protcoding.shape)
print('primary shape =', symbol_primary_protcoding.shape)
symbol_normal_protcoding.head()

In [None]:
# Write both symbol-indexed tables as tab-separated files, without the row index
path = 'dataframes_for_input/blood/'
symbol_outputs = {
    'symbol_normal_blood_protcoding_expr.tsv': symbol_normal_protcoding,
    'symbol_primary_blood_protcoding_expr.tsv': symbol_primary_protcoding,
}
for file_name, table in symbol_outputs.items():
    table.to_csv(output_file_directory + path + file_name, sep="\t", index=False)