# Simulating Metabolism for CGGA Validation Data

### Using preferred iMAT parameters (determined from TCGA cross-validation) - corresponds to Dataset 8

##### iMAT Parameters:
9. UQ:10%, LQ:90%, epsilon:20, threshold:10

Downloading Model

In [1]:
from pyGSLModel import download_GSL_model

# Fetch the GSL model once; every later cell reuses this `model` object.
model = download_GSL_model()

# Report the size of the model's reaction, metabolite and gene collections.
n_reactions = len(model.reactions)
n_metabolites = len(model.metabolites)
n_genes = len(model.genes)
print(f"Number of Reactions in model : {n_reactions}")
print(f"Number of Metabolites in model : {n_metabolites}")
print(f"Number of Genes in model : {n_genes}")

# Look up a gene by its symbol; the lookup result is displayed as the cell output.
print("Checking gene symbol conversion :")
model.genes.get_by_id("UGT8")

Downloading  and Reading in Model
Model succesfully downloaded and read in.
Number of Reactions in model : 2312
Number of Metabolites in model : 2015
Number of Genes in model : 2887
Checking gene symbol conversion :


0,1
Gene identifier,UGT8
Name,G_UGT8
Memory address,0x21689fd65d0
Functional,True
In 2 reaction(s),"MAR00920, MAR00919"


Performing simulation against each row in the CGGA dataset

In [2]:
import pandas as pd
# CGGA validation table (comma-separated) and TCGA training table (tab-separated).
CGGA_df = pd.read_csv(filepath_or_buffer="./CGGA_Data/CGGA_LGG_TPM.csv")
TCGA_df = pd.read_table("./TCGA_Data/TCGA_Data.tsv")  # read_table defaults to sep="\t"

In [3]:
# Verifying that the input columns are identical (same names, same order) between
# the training TCGA data and the CGGA validation data.
# Columns that are not model inputs are removed from each table before comparing.
tcga_non_input_cols = [
    'sample', 'samples', 'detailed_category', 'DFI', 'DFI.time', 'DSS',
    'DSS.time', '_gender', 'TCGA_GTEX_main_category', 'OS', 'OS.time',
    'PFI', 'PFI.time', 'primary disease or tissue', '_primary_site',
    '_sample_type', '_study',
]
cgga_non_input_cols = ['CGGA_ID', 'OS.time', 'OS']

tcga_input_cols = TCGA_df.drop(columns=tcga_non_input_cols).columns.to_list()
cgga_input_cols = CGGA_df.drop(columns=cgga_non_input_cols).columns.to_list()
tcga_input_cols == cgga_input_cols

True

In [4]:
# Drop the survival columns, index by sample ID, then transpose so that each
# sample is a column and each remaining input column becomes a row.
CGGA_input = (
    CGGA_df
    .drop(columns=['OS.time', 'OS'])
    .set_index('CGGA_ID')
    .T
    .copy()
)
CGGA_input.head()

CGGA_ID,CGGA_1002,CGGA_1003,CGGA_1010,CGGA_1012,CGGA_1014,CGGA_1018,CGGA_103,CGGA_1030,CGGA_1032,CGGA_1033,...,CGGA_P594,CGGA_P604,CGGA_P615,CGGA_P623,CGGA_P633,CGGA_P83,CGGA_P84,CGGA_P86,CGGA_P93,CGGA_P98
A4GALT,2.285539,-0.247103,-6.695296,2.407788,1.88103,0.546921,0.961196,0.082702,-3.297098,1.294846,...,1.71702,2.045651,1.575144,2.227588,1.571009,1.790154,1.119619,0.689371,1.921136,1.485744
ABO,-2.414404,-9.965784,-9.965784,-2.688687,-4.832545,-0.386091,-2.763073,-3.940729,-7.309998,-2.261743,...,-2.038293,-1.92757,-1.656003,-5.3493,-1.995962,-4.240188,-1.312395,-0.540205,-2.138966,-3.880763
B3GALNT1,3.698233,0.880292,-7.207377,2.318283,3.338593,1.877243,4.24287,4.081424,-2.126697,3.109685,...,4.863377,5.072331,4.980939,3.863384,4.994158,1.299688,4.740559,4.254572,4.520952,3.354008
B3GALT1,1.467149,-0.058867,-5.56097,-0.199934,1.302925,-1.024864,2.195506,1.469312,-5.648528,1.380968,...,2.822221,0.875073,0.925166,1.075156,1.274502,-0.358972,2.273724,-0.454596,1.469896,0.725973
B3GALT4,2.676142,1.475185,-5.727295,2.440336,2.234582,2.232741,2.643495,1.911798,-1.173549,2.917739,...,2.935779,1.252648,3.127511,2.39183,1.416221,1.682271,1.215154,2.577937,2.690939,2.63278


In [5]:
from pyGSLModel import iMAT_multi_integrate
# Run iMAT integration over the CGGA input with the parameter set listed at the
# top of the notebook. The (misspelled) result name is kept because the next
# cell reads it.
CGGA_simualted = iMAT_multi_integrate(
    model=model,
    data=CGGA_input,
    upper_quantile=0.1,
    lower_quantile=0.9,
    epsilon=20,
    threshold=10,
)

Simulations Performed:1/443
Simulations Performed:2/443
Simulations Performed:3/443
Simulations Performed:4/443
Simulations Performed:5/443
Simulations Performed:6/443
Simulations Performed:7/443
Simulations Performed:8/443
Simulations Performed:9/443
Simulations Performed:10/443
Simulations Performed:11/443
Simulations Performed:12/443
Simulations Performed:13/443
Simulations Performed:14/443
Simulations Performed:15/443
Simulations Performed:16/443
Simulations Performed:17/443
Simulations Performed:18/443
Simulations Performed:19/443
Simulations Performed:20/443
Simulations Performed:21/443
Simulations Performed:22/443
Simulations Performed:23/443
Simulations Performed:24/443
Simulations Performed:25/443
Simulations Performed:26/443
Simulations Performed:27/443
Simulations Performed:28/443
Simulations Performed:29/443
Simulations Performed:30/443
Simulations Performed:31/443
Simulations Performed:32/443
Simulations Performed:33/443
Simulations Performed:34/443
Simulations Performed:3

In [6]:
# The integration result is indexable; its first element is the per-sample
# table previewed below.
CGGA_simulated_df = CGGA_simualted[0]
CGGA_simulated_df.head(n=5)

Unnamed: 0_level_0,D-galactosyl-N-acylsphingosine,GA1,GA2,GD1a,GD1b,GD1c,GD2,GD3,GM1,GM1b,...,sulfatide galactocerebroside,type I H glycolipid,type II H glycolipid,(neo)lacto-series,0-series(ganglio),a-series(ganglio),b-series(ganglio),c-series(ganglio),gal-series,globo-series
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CGGA_1002,0.0,37.997344,37.997344,20.0,20.0,0.0,20.0,40.0,20.0,37.997344,...,0.0,0.0,0.0,0.0,113.992031,120.0,100.0,80.0,0.0,0.002656
CGGA_1003,0.0,20.0,20.0,20.0,20.0,0.0,20.0,40.0,20.0,20.0,...,0.0,0.0,0.0,40.0,60.0,120.0,100.0,80.0,0.0,8.0
CGGA_1010,0.0,20.0,20.0,20.0,20.0,0.0,20.0,40.0,20.0,20.0,...,0.0,0.0,0.0,0.0,60.0,120.0,100.0,80.0,0.0,80.0
CGGA_1012,20.0,0.0,0.0,48.966216,20.0,0.0,20.0,20.0,48.966216,0.0,...,20.0,0.0,0.0,0.0,0.0,215.864862,80.0,0.0,40.0,0.0
CGGA_1014,3.2564390000000003e-31,20.0,20.0,20.0,20.0,0.0,20.0,40.0,20.0,20.0,...,3.2564390000000003e-31,4.625469e-16,-2.950001e-15,6.702703,60.0,120.0,100.0,80.0,6.512878000000001e-31,0.648649


In [7]:
# Survival status and time per sample, indexed by CGGA_ID.
CGGA_Surv = CGGA_df.set_index('CGGA_ID')[['OS', 'OS.time']].copy()
CGGA_Surv.head()

Unnamed: 0_level_0,OS,OS.time
CGGA_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
CGGA_1002,1.0,305.0
CGGA_1003,0.0,3817.0
CGGA_1010,1.0,246.0
CGGA_1012,1.0,3679.0
CGGA_1014,1.0,263.0


In [8]:
# Join inputs (samples as rows) with the simulated outputs, then prepend the
# survival columns. Both merges align on the sample index.
inputs_with_simulation = CGGA_input.T.merge(
    CGGA_simulated_df, left_index=True, right_index=True
)
survival_with_features = CGGA_Surv.merge(
    inputs_with_simulation, left_index=True, right_index=True
)

# OS * OS.time is the survival time when OS == 1 and 0 when OS == 0, so the
# label is 1 only for samples with OS == 1 and 1 <= OS.time < 5*365.
event_time = survival_with_features["OS"] * survival_with_features["OS.time"]
srv_label = (event_time.ge(1) & event_time.lt(5 * 365)).astype(int)

# Append the label as the last column and rename the survival columns to the
# DSS names that the training table uses.
CGGA_Tidied_Data = (
    survival_with_features
    .assign(Srv=srv_label)
    .rename(columns={'OS': 'DSS', 'OS.time': 'DSS.time'})
)

Checking that the columns of the CGGA validation data match those of the TCGA training data

In [17]:
# Load the tidied training table and drop the unnamed index column written by to_csv.
DATA_URL = "https://raw.githubusercontent.com/JackWJW/LGG_Prognosis_Prediction/main/Tidied_Datasets/tidied_integrated_df_9.csv"
training_data = pd.read_csv(DATA_URL).drop(columns=["Unnamed: 0"])

# Remove only a literal trailing "_9" from each column name.
# str.rstrip('_9') treats its argument as a *set of characters* and strips every
# trailing '_' or '9', which truncated 'FUT9' to 'FUT' and forced a manual
# rename back to 'FUT9'. Anchoring the pattern at the end of the name removes
# just the suffix, so no compensating rename is needed.
training_data.columns = training_data.columns.str.replace(r"_9$", "", regex=True)

In [19]:
# True only if both tables have the same column names in the same order.
list(CGGA_Tidied_Data.columns) == list(training_data.columns)

True

Saving CGGA Tidied Data

In [20]:
# Write the tidied validation table (index included) next to the raw CGGA data.
# NOTE(review): the file name is spelled "CCGA", not "CGGA"; it is left unchanged
# here in case downstream code reads this exact path - confirm before renaming.
CGGA_Tidied_Data.to_csv(path_or_buf="./CGGA_Data/CCGA_Tidied_Integrated.csv")