In [1]:
import pandas as pd
import os
import glob
from pathlib import Path
import numpy as np

In [3]:
#Upload the Reactome to ENSEMBL database and get the data for human
reactome= pd.read_csv("/Users/sebastiansolano/Documents/AI_projects/HANN_Oncogene/Reactome_for_hyperedges/Ensembl2Reactome_All_Levels.txt", sep='\t')
reactome.columns = ['Gene','Pathway','Pathway_link','Description',"Evidence","Especie"]
reactome_human =  reactome[reactome["Especie"] == 'Homo sapiens']

# Create the unified transcriptomic Files

In [5]:
#Get the list of transcriptomic file
path = '/Users/sebastiansolano/Documents/AI_projects/HANN_Oncogene/TCGA_project_data_for_training/CPTAC-3_Brain-nos_transcriptome_profiling'
file_pattern = os.path.join(path, '*.tsv')
all_files_rnaseq = glob.glob(file_pattern)

In [6]:
#The following code Merge the files in one single file
list_transcriptomic_brain_tumor=[]
for filename in all_files_rnaseq:
    df = pd.read_csv(filename,sep='\t')
    df.columns=['gene_id','gene_name','gene_type','unstranded','stranded_first','stranded_second','tpm_unstranded','fpkm_unstranded','fpkm_uq_unstranded']
    df['ensembl_id']= df['gene_id'].apply(lambda x: x.split(".")[0])
    df_filter=df
    
    dataframe_to_list=df_filter[['ensembl_id','unstranded']]
    name_in_colum= Path(filename).stem
    dataframe_to_list=dataframe_to_list.rename(columns={'unstranded':name_in_colum})
    dataframe_to_list= dataframe_to_list.sort_values(by='ensembl_id')
    dataframe_to_list= dataframe_to_list.set_index('ensembl_id')
    dataframe_to_list= dataframe_to_list.groupby(level=0).sum()
    list_transcriptomic_brain_tumor.append(dataframe_to_list)

#merge 
merged_df = pd.concat(list_transcriptomic_brain_tumor, axis=1, join='inner')

In [None]:
#There are some genes from Reactome databse that were not cover by the transcirptomic analysis
not_in_lista =[]
for x in reactome_human['Gene']:
    if x not in merged_df.index:
        not_in_lista.append(x)

print(not_in_lista)

In [10]:
#We must be sure to analyse the network with the same ammout of genes, that is why we again filter the Reactome database to obtain genes present in both datasets
final_reactome_toHYperedges = reactome_human[reactome_human['Gene'].isin(merged_df.index)]

In [12]:
#save files
final_reactome_toHYperedges.to_csv('/Users/sebastiansolano/Documents/AI_projects/HANN_Oncogene/Reactome_for_hyperedges/reactome_patways.csv')
merged_df.to_csv('/Users/sebastiansolano/Documents/AI_projects/HANN_Oncogene/TCGA_project_data_for_training/CPTAC-3_Brain-nos_transcriptome_profiling/unified_transcriptomic.csv')

# Create the Incidence Matrix and Degree matrices

In [8]:
#Load transcriptome file
merged_df= pd.read_csv('/Users/sebastiansolano/Documents/AI_projects/HANN_Oncogene/TCGA_project_data_for_training/CPTAC-3_Brain-nos_transcriptome_profiling/unified_transcriptomic.csv',index_col='ensembl_id')
final_reactome_toHYperedges = pd.read_csv('/Users/sebastiansolano/Documents/AI_projects/HANN_Oncogene/Reactome_for_hyperedges/reactome_patways.csv')

In [14]:
#Incidence matrix
incidence_matrix = pd.DataFrame(index=merged_df.index, columns=list(set(final_reactome_toHYperedges['Pathway'])))
incidence_matrix = incidence_matrix.fillna(0)

#Fill up the matrix
for z in merged_df.index:
    sub_set= final_reactome_toHYperedges[final_reactome_toHYperedges['Gene']==z]
    list_pathways= sub_set['Pathway'].to_list()
    for c in list_pathways:
        incidence_matrix.loc[z,c]=1

  incidence_matrix = incidence_matrix.fillna(0)


# Build the Variant Vector for Each Gene

In [5]:
#Import the transcriptomic file and the variant file
Genes= pd.read_csv('/Users/sebastiansolano/Documents/AI_projects/HANN_Oncogene/TCGA_project_data_for_training/CPTAC-3_Brain-nos_transcriptome_profiling/unified_transcriptomic.csv')
variant = pd.read_csv('/Users/sebastiansolano/Documents/AI_projects/HANN_Oncogene/TCGA_project_data_for_training/CPTAC-3_Brain-nos_sinlge_nucleotide_variation/unified_mutation.csv')


In [None]:
#Create dataframe filled up with zeros
Variant_dataframe_values= pd.DataFrame(index=Genes['ensembl_id'], columns=list(set(variant['Variant_Classification'])))
Variant_dataframe_values=Variant_dataframe_values.fillna(0)

In [7]:
#fill up the dataframe with the freqeuncy of each kind of variant clasiffication per each gene

for gene in Variant_dataframe_values.index:
    sub_set= variant[variant['Gene'] == gene]
    for var in Variant_dataframe_values.columns:
        sub_path= sub_set[sub_set['Variant_Classification']==var]
        frequency= sub_path.shape[0]
        Variant_dataframe_values.loc[gene,var]=frequency



# Normalize the transcriptomic Dataset and Merge with the Variant dataset to set the final vector matrix

In [4]:
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

In [9]:
#Create Metadata 
metadata = pd.DataFrame({
    'condition': ['control'] * merged_df.shape[1]
}, index=merged_df.columns)

In [None]:
# Initialize DeseqDataSet
dds = DeseqDataSet(
    counts=merged_df.T,
    metadata=metadata,
    design_factors='condition',  # your experimental design
    refit_cooks=True
)

  dds = DeseqDataSet(


In [None]:
#Normalize samples
dds.fit_size_factors()
dds.fit_genewise_dispersions()
dds.vst()

vst_counts = dds.layers['vst_counts']

In [22]:
# Convert back to DataFrame (still samples x genes)
normalized_df = pd.DataFrame(
    vst_counts,
    index=merged_df.columns,  # sample names
    columns= merged_df.index  # gene names
).T

# Merge Variant Calling and Transcriptome Profile

In [25]:
#Merge Variant Calling and Transcriptome profile
Feature_dataframe = Variant_dataframe_values.join(normalized_df,how='inner')

In [31]:
#Make both incidence and feature matrices have the same row order

Index_order=Feature_dataframe.index.to_list()
incidence_matrix=incidence_matrix.loc[Index_order]

In [None]:
#Save incidence Matrix and Feature Matrix
Feature_dataframe.to_csv("/Users/sebastiansolano/Documents/AI_projects/HANN_Oncogene/Input_HGNN/Feature_matrix.csv")
incidence_matrix.to_csv("/Users/sebastiansolano/Documents/AI_projects/HANN_Oncogene/Input_HGNN/Incidence_matrix.csv")