In [1]:
import requests
import csv
import time
import pandas as pd

In [2]:
# Load the reaction-core table from JSON into a DataFrame and preview it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
reactions_json_path = "/Users/quanhuynh/Documents/HackHarvard/reactions2core_to_core.json"
df = pd.read_json(reactions_json_path)
df.head()

Unnamed: 0,reaction_core,id
0,O=[PH](OCC1CCC(N2C=CCC=C2)O1)O[PH](=O)OCC1CCC(...,RHEA:35547
1,O=C1NC(Cc2ccccc2)C(=O)NC1Cc1ccccc1>>O=C1NC2Cc3...,RHEA:35547
2,O=[PH](OCC1CCC(n2cnc3cncnc32)O1)O[PH](=O)OCC1C...,RHEA:11540
3,O=c1[nH]c2c(c(=O)[nH]1)Nc1ccccc1N2>>O=c1nc2[nH...,RHEA:51264
4,c1ccc(CCNCc2ccccc2)cc1>>O=C1C=CC23CCN(Cc4ccccc...,RHEA:51264


In [3]:
# Drop the literal "RHEA:" text from every id so only the numeric part is
# left (e.g. "RHEA:35547" -> "35547"), then preview the result.
ids_without_prefix = df['id'].str.replace('RHEA:', '')
df['id'] = ids_without_prefix
df.head()

Unnamed: 0,reaction_core,id
0,O=[PH](OCC1CCC(N2C=CCC=C2)O1)O[PH](=O)OCC1CCC(...,35547
1,O=C1NC(Cc2ccccc2)C(=O)NC1Cc1ccccc1>>O=C1NC2Cc3...,35547
2,O=[PH](OCC1CCC(n2cnc3cncnc32)O1)O[PH](=O)OCC1C...,11540
3,O=c1[nH]c2c(c(=O)[nH]1)Nc1ccccc1N2>>O=c1nc2[nH...,51264
4,c1ccc(CCNCc2ccccc2)cc1>>O=C1C=CC23CCN(Cc4ccccc...,51264


In [4]:
# Accumulator for the result rows (one dict per UniProt entry found).
data = list()


In [None]:
# Query UniProt for every reaction in `df`, collect one output row per
# (reaction_core, UniProt entry) pair, and write the result to CSV.

UNIPROT_SEARCH_URL = "https://rest.uniprot.org/uniprotkb/search"
UNIPROT_FIELDS = "accession,ec,sequence"
UNIPROT_PAGE_SIZE = 500  # documented maximum page size of the search endpoint
REQUEST_TIMEOUT_S = 30   # fail instead of hanging forever on a stalled connection
REQUEST_DELAY_S = 1      # respectful pause between requests to the server
OUTPUT_CSV = 'rhea_data_with_reaction_core.csv'


def fetch_uniprot_entries(rhea_id, session):
    """Return every UniProtKB entry annotated with the given RHEA reaction.

    Args:
        rhea_id: RHEA identifier without the "RHEA:" prefix (e.g. "35547").
        session: ``requests.Session`` used to issue the HTTP requests.

    Returns:
        A list of dicts, one per result row, mapping the TSV column headers
        sent by the server to that row's values. The list is empty when the
        query has no hits or the response body is empty.

    Raises:
        requests.exceptions.RequestException: on a network failure, a timeout,
            or a non-2xx HTTP status.
    """
    entries = []
    url = UNIPROT_SEARCH_URL
    # Passing the query as `params` lets requests handle URL encoding.
    params = {
        "query": f"rhea:{rhea_id}",
        "format": "tsv",
        "fields": UNIPROT_FIELDS,
        "size": UNIPROT_PAGE_SIZE,
    }

    while url:
        response = session.get(url, params=params, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()

        lines = response.text.splitlines()
        if lines:  # an empty body has no header row to index into
            headers = lines[0].split('\t')
            for line in lines[1:]:
                entries.append(dict(zip(headers, line.split('\t'))))

        # The search endpoint is paginated: follow the `Link: <...>; rel="next"`
        # header until it is absent, otherwise results beyond the first page
        # would be silently dropped.
        url = response.links.get("next", {}).get("url")
        params = None  # the "next" URL already carries the full query
        if url:
            time.sleep(REQUEST_DELAY_S)

    return entries


# Start from an empty list so re-running this cell never duplicates rows
# collected by a previous run.
data = []

# Several rows can share one RHEA ID; remember each ID's result so the server
# is only asked once per ID.
entries_by_rhea_id = {}

with requests.Session() as session:
    for _, row in df.iterrows():
        # Tolerate ids that still carry the "RHEA:" prefix.
        rhea_id = str(row['id']).replace('RHEA:', '')
        reaction_core = row['reaction_core']
        print(f"Processing RHEA ID: {rhea_id} with reaction_core: {reaction_core}")

        if rhea_id not in entries_by_rhea_id:
            try:
                entries_by_rhea_id[rhea_id] = fetch_uniprot_entries(rhea_id, session)
            except requests.exceptions.RequestException as e:
                # Best effort: report the failure and move on to the next row.
                # The ID is not cached, so a later row with the same ID retries.
                print(f"Error fetching data for RHEA:{rhea_id}: {e}")
                continue
            finally:
                # Pause after every network attempt, successful or not.
                time.sleep(REQUEST_DELAY_S)

        for rhea_data in entries_by_rhea_id[rhea_id]:
            accession = rhea_data.get("Entry", "N/A")
            ec_number = rhea_data.get("EC number", "N/A")
            sequence = rhea_data.get("Sequence", "N/A")
            print(f"Extracted - Accession: {accession}, EC Number: {ec_number}, Sequence: {sequence[:30]}...")

            data.append({
                "id": f"RHEA:{rhea_id}",
                "reaction_core": reaction_core,
                "accession": accession,
                "ec_number": ec_number,
                "sequence": sequence
            })

# Convert the list to a DataFrame and save to CSV
result_df = pd.DataFrame(data)
print("Final DataFrame:")
print(result_df.head())

# Save to CSV
result_df.to_csv(OUTPUT_CSV, index=False)
print("Data written to rhea_data_with_reaction_core.csv")

Processing RHEA ID: 35547 with reaction_core: O=[PH](OCC1CCC(N2C=CCC=C2)O1)O[PH](=O)OCC1CCC(n2cnc3cncnc32)O1>>O=[PH](OCC1CCC(n2cnc3cncnc32)O1)O[PH](=O)OCC1CCC([n+]2ccccc2)O1
Extracted - Accession: P9WPP7, EC Number: 1.14.19.70, Sequence: MTATVLLEVPFSARGDRIPDAVAELRTREP...
Extracted - Accession: P9WPP6, EC Number: 1.14.19.70, Sequence: MTATVLLEVPFSARGDRIPDAVAELRTREP...
Processing RHEA ID: 35547 with reaction_core: O=C1NC(Cc2ccccc2)C(=O)NC1Cc1ccccc1>>O=C1NC2Cc3cccc(c3)-c3cccc(c3)CC1NC2=O
Extracted - Accession: P9WPP7, EC Number: 1.14.19.70, Sequence: MTATVLLEVPFSARGDRIPDAVAELRTREP...
Extracted - Accession: P9WPP6, EC Number: 1.14.19.70, Sequence: MTATVLLEVPFSARGDRIPDAVAELRTREP...
Processing RHEA ID: 11540 with reaction_core: O=[PH](OCC1CCC(n2cnc3cncnc32)O1)O[PH](=O)OCC1CCC([n+]2ccccc2)O1>>O=[PH](OCC1CCC(N2C=CCC=C2)O1)O[PH](=O)OCC1CCC(n2cnc3cncnc32)O1
Extracted - Accession: P22441, EC Number: 1.1.1.233, Sequence: MTTAGVSRRPGRLAGKAAIVTGAAGGIGRA...
Processing RHEA ID: 51264 with reaction_cor