# Standardization of Datasets
The following notebook is intended for the standardization of the two Network Reconstruction datasets (the one generated in this reconstruction effort and the one that is being manually curated). The idea of this notebook is to keep the first dataset as a backup in case something goes wrong with the manually curated dataset.

In [1]:
from google_sheet import GoogleSheet

### 1. Generate the two sets of datasets

In [2]:
# Credential file handed to every GoogleSheet client created in this notebook
KEY_FILE_PATH = 'credentials.json'

# Names of the tabs read from each spreadsheet
sheet_attributes = 'Attributes'    # reaction attributes
sheet_boundary = 'BoundaryRxns'    # boundary reactions
sheet_genes = 'Genes'              # genes
sheet_met = 'Metabolites'          # metabolites
sheet_rxns = 'Rxns'                # reactions

In [3]:
##### ----- Load the CHO Network Reconstruction + Recon3D_v3 datasets ----- #####

# Google Sheet ID of the CHO Network Reconstruction + Recon3D_v3 spreadsheet
SPREADSHEET_ID = '1MlBXeHIKw8k8fZyXm-sN__AHTRSunJxar_-bqvukZws'

# Client bound to that spreadsheet
sheet = GoogleSheet(SPREADSHEET_ID, KEY_FILE_PATH)

# Read every tab, in a fixed order, into its "_v3" variable
(
    rxns_v3,
    rxns_attributes_v3,
    boundary_rxns_v3,
    genes_v3,
    metabolites_v3,
) = (
    sheet.read_google_sheet(tab_name)
    for tab_name in (sheet_rxns, sheet_attributes, sheet_boundary, sheet_genes, sheet_met)
)

In [4]:
##### ----- Load the Chinese Hamster Network Reconstruction datasets ----- #####

# Google Sheet ID of the Chinese Hamster Network Reconstruction spreadsheet
# (a different spreadsheet from the CHO Network Reconstruction + Recon3D_v3 one)
SPREADSHEET_ID = '1_bCHi0YbemnalomhVDmeHRJimxcVNI4GK6MoH3JdE8M'

# Client bound to that spreadsheet
sheet = GoogleSheet(SPREADSHEET_ID, KEY_FILE_PATH)

# Read every tab, in a fixed order, into its "_mc" variable
(
    rxns_mc,
    rxns_attributes_mc,
    boundary_rxns_mc,
    genes_mc,
    metabolites_mc,
) = (
    sheet.read_google_sheet(tab_name)
    for tab_name in (sheet_rxns, sheet_attributes, sheet_boundary, sheet_genes, sheet_met)
)

### 2. Check if the datasets are equal

In [5]:
def compare_sheets(sheet_name, left, right, id_column):
    """Print whether two versions of a sheet are equal and return their differences.

    Parameters
    ----------
    sheet_name : str
        Label used in the printed report.
    left, right : pandas.DataFrame
        The two versions of the sheet. ``left`` is the reference: its
        ``id_column`` values are used to label the differing rows.
    id_column : str
        Column of ``left`` that identifies a row.

    Returns
    -------
    tuple
        ``(are_equal, differences)``. ``differences`` is the result of
        ``left.compare(right)`` (optionally labelled with ``id_column``), or
        ``None`` when the two frames are equal.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.compare`` when the two frames are not
        identically labelled (same index and same columns).
    """
    # Evaluate equality once and reuse it for both the report and the branch.
    are_equal = left.equals(right)
    print(f'"{sheet_name}" Sheet in both datasets are equal: {are_equal}')
    if are_equal:
        return are_equal, None

    differences = left.compare(right)
    # compare() returns two-level columns (<column>, 'self'/'other'). Assigning
    # to a top-level key that is already present would overwrite both of its
    # sub-columns, so the identifier is only added when it is not itself one
    # of the differing columns (in which case it is already displayed).
    if id_column not in differences.columns.get_level_values(0):
        differences[id_column] = left.loc[differences.index, id_column]
    print(differences)
    return are_equal, differences


SEPARATOR = '.............................................................'

### ------- "Rxns Sheet" ------- ####
rxns_equals, rxns_differences = compare_sheets('Rxns', rxns_v3, rxns_mc, 'Reaction')
print(SEPARATOR)

### ------- "Attributes Sheet" ------- ####
rxns_attributes_equals, rxns_attributes_differences = compare_sheets(
    'Attributes', rxns_attributes_v3, rxns_attributes_mc, 'Reaction'
)
print(SEPARATOR)

### ------- "Boundary Rxns Sheet" ------- ####
boundary_rxns_equals, boundary_rxns_differences = compare_sheets(
    'BoundaryRxns', boundary_rxns_v3, boundary_rxns_mc, 'Reaction'
)
print(SEPARATOR)

### ------- "Genes Sheet" ------- ####
genes_equals, genes_differences = compare_sheets('Genes', genes_v3, genes_mc, 'Gene Entrez ID')
print(SEPARATOR)

### ------- "Metabolites Sheet" ------- ####
metabolites_equals, metabolites_differences = compare_sheets(
    'Metabolites', metabolites_v3, metabolites_mc, 'BiGG ID'
)

"Rxns" Sheet in both datasets are equal: False
     Curated                                             Reaction Formula  \
        self     other                                               self   
5017          SCH/ALSA                                                NaN   
5018          SCH/ALSA                                                NaN   
5019          SCH/ALSA                                                NaN   
5021          SCH/ALSA                                                NaN   
5022          SCH/ALSA                                                NaN   
5023          SCH/ALSA                                                NaN   
5024          SCH/ALSA                                                NaN   
5025          SCH/ALSA                                                NaN   
5026          SCH/ALSA                                                NaN   
5027          SCH/ALSA                                                NaN   
5028          SCH/ALSA       

### 3. Update the v3 dataset with info from the manual curation dataset

In [6]:
def apply_curated_rows(target, source, differences):
    """Copy the rows listed in ``differences`` from ``source`` into ``target``.

    ``target`` is modified in place with ``DataFrame.update``, which aligns on
    index and columns and only overwrites with non-NA values from ``source``
    (a cell that is empty/NaN in ``source`` leaves the ``target`` value as is).

    The previous ``target.loc[index].update(...)`` form updated the temporary
    Series returned by ``.loc``; whether that wrote through to ``target``
    depended on pandas internals, so the frame could stay unchanged.

    Parameters
    ----------
    target : pandas.DataFrame
        Frame to update in place.
    source : pandas.DataFrame
        Frame providing the new values.
    differences : pandas.DataFrame
        Frame whose index lists the rows to synchronise.

    Returns
    -------
    bool
        True when at least one row was selected for update, False otherwise.
    """
    changed_rows = differences.index
    if changed_rows.empty:
        return False
    target.update(source.loc[changed_rows])
    return True


# NOTE(review): only Rxns, Attributes and Metabolites are synchronised here;
# confirm that leaving BoundaryRxns and Genes untouched is intentional.

# Update Rxns dataset
rxns_changed = False
if not rxns_equals:
    rxns_changed = apply_curated_rows(rxns_v3, rxns_mc, rxns_differences)
else:
    print('No need to update "Rxns" Sheet datasets')

# Update Attributes dataset
attributes_changed = False
if not rxns_attributes_equals:
    attributes_changed = apply_curated_rows(
        rxns_attributes_v3, rxns_attributes_mc, rxns_attributes_differences
    )
else:
    print('No need to update "Attributes" Sheet datasets')

# Update Metabolites dataset
metabolites_changed = False
if not metabolites_equals:
    metabolites_changed = apply_curated_rows(
        metabolites_v3, metabolites_mc, metabolites_differences
    )
else:
    print('No need to update "Metabolites" Sheet datasets')

No need to update "Attributes" Sheet datasets


In [7]:
#############################################
#### ----------------------------------- ####
#### ---- Update Rxns Google Sheet ----- ####
#### ----------------------------------- ####
#############################################

# Google Sheet ID of the CHO Network Reconstruction + Recon3D_v3 spreadsheet
SPREADSHEET_ID = '1MlBXeHIKw8k8fZyXm-sN__AHTRSunJxar_-bqvukZws'

# Client bound to the spreadsheet that receives the updated data
sheet = GoogleSheet(SPREADSHEET_ID, KEY_FILE_PATH)

# One entry per tab: (was it changed?, tab name, data to upload, confirmation message)
pending_uploads = (
    (rxns_changed, sheet_rxns, rxns_v3, "Rxns Sheet updated."),
    (attributes_changed, sheet_attributes, rxns_attributes_v3, "Attributes Sheet updated."),
    (metabolites_changed, sheet_met, metabolites_v3, "Metabolites Sheet updated."),
)

# Upload only the tabs flagged as changed, in the order listed above
for was_changed, tab_name, frame, message in pending_uploads:
    if was_changed:
        sheet.update_google_sheet(tab_name, frame)
        print(message)

Rxns Sheet updated.
Metabolites Sheet updated.
