In [5]:
import requests
import csv
import time
import pandas as pd

In [6]:
df =pd.read_json("/Users/quanhuynh/Documents/HackHarvard/hh-enzymegen/reactions2core_to_core.json")
df.head()

Unnamed: 0,reaction_core,id
0,O=[PH](OCC1CCC(N2C=CCC=C2)O1)O[PH](=O)OCC1CCC(...,RHEA:35547
1,O=C1NC(Cc2ccccc2)C(=O)NC1Cc1ccccc1>>O=C1NC2Cc3...,RHEA:35547
2,O=[PH](OCC1CCC(n2cnc3cncnc32)O1)O[PH](=O)OCC1C...,RHEA:11540
3,O=c1[nH]c2c(c(=O)[nH]1)Nc1ccccc1N2>>O=c1nc2[nH...,RHEA:51264
4,c1ccc(CCNCc2ccccc2)cc1>>O=C1C=CC23CCN(Cc4ccccc...,RHEA:51264


In [7]:
df['id'] = df['id'].str.replace('RHEA:', '')
df.head()

Unnamed: 0,reaction_core,id
0,O=[PH](OCC1CCC(N2C=CCC=C2)O1)O[PH](=O)OCC1CCC(...,35547
1,O=C1NC(Cc2ccccc2)C(=O)NC1Cc1ccccc1>>O=C1NC2Cc3...,35547
2,O=[PH](OCC1CCC(n2cnc3cncnc32)O1)O[PH](=O)OCC1C...,11540
3,O=c1[nH]c2c(c(=O)[nH]1)Nc1ccccc1N2>>O=c1nc2[nH...,51264
4,c1ccc(CCNCc2ccccc2)cc1>>O=C1C=CC23CCN(Cc4ccccc...,51264


In [8]:
data = []
start_time = time.time()  # Record the start time
total_rows = len(df)

In [11]:
data = []
batch_size = 5  
rhea_ids = df['id'].unique()

start_time = time.time() 
total_batches = (len(rhea_ids) + batch_size - 1) // batch_size  
batch_sleep = 1

for batch_index in range(0, len(rhea_ids), batch_size):
    batch = rhea_ids[batch_index:batch_index + batch_size]
    query = ' OR '.join([f"rhea:{rhea_id}" for rhea_id in batch])
    
    url = f"https://rest.uniprot.org/uniprotkb/search?query={query}&format=tsv&fields=accession,ec,sequence"
    print(f"Processing batch {batch_index // batch_size + 1} of {total_batches}: {batch}")
    
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        lines = response.text.splitlines()
        headers = lines[0].split('\t')
        
        for line in lines[1:]:
            fields = line.split('\t')
            rhea_data = dict(zip(headers, fields))
            
            accession = rhea_data.get("Entry", "N/A")
            ec_number = rhea_data.get("EC number", "N/A")
            sequence = rhea_data.get("Sequence", "N/A")
            
            data.append({
                "id": batch[0],
                "accession": accession,
                "ec_number": ec_number,
                "sequence": sequence
            })
        
        completed_batches = (batch_index // batch_size) + 1 #time :)
        elapsed_time = time.time() - start_time
        avg_time_per_batch = elapsed_time / completed_batches
        remaining_time = avg_time_per_batch * (total_batches - completed_batches)
        print(f"Estimated time remaining: {remaining_time // 60:.0f} minutes {remaining_time % 60:.0f} seconds")
        
        time.sleep(batch_sleep)
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching batch {batch}: {e}")
        retries = 3
        for attempt in range(retries):
            time.sleep(batch_sleep * (attempt + 1))  # Exponential backoff
            try:
                response = requests.get(url)
                response.raise_for_status()
                break
            except requests.exceptions.RequestException as retry_e:
                print(f"Retry {attempt + 1}/{retries} failed for batch {batch}: {retry_e}")
        else:
            print(f"Failed to fetch data for batch {batch} after {retries} attempts. Skipping.")
            continue  \

# Convert the list to a DataFrame and save to CSV
result_df = pd.DataFrame(data)
print("Final DataFrame:")
print(result_df.head())

# Save to CSV
result_df.to_csv('rhea_data.csv', index=False)
print("Data written to rhea_data.csv")

Processing batch 1 of 836: ['35547' '11540' '51264' '51260' '48896']
Estimated time remaining: 7 minutes 8 seconds
Processing batch 2 of 836: ['25062' '25066' '43876' '43872' '27373']
Estimated time remaining: 15 minutes 12 seconds
Processing batch 3 of 836: ['46136' '46132' '25286' '36251' '25282']
Estimated time remaining: 17 minutes 27 seconds
Processing batch 4 of 836: ['16253' '16257' '16705' '51848' '16701']
Estimated time remaining: 18 minutes 22 seconds
Processing batch 5 of 836: ['22484' '22480' '52468' '26397' '26393']
Estimated time remaining: 19 minutes 5 seconds
Processing batch 6 of 836: ['52460' '46700' '52464' '17257' '10708']
Estimated time remaining: 19 minutes 26 seconds
Processing batch 7 of 836: ['53732' '10700' '53736' '21616' '20804']
Estimated time remaining: 19 minutes 53 seconds
Processing batch 8 of 836: ['45112' '45116' '24088' '37795' '24080']
Estimated time remaining: 20 minutes 9 seconds
Processing batch 9 of 836: ['37791' '24084' '48260' '36847' '30411']

In [15]:
merged_df = pd.merge(result_df, df[['id', 'reaction_core']], on='id', how='left')
merged_df.head()


Unnamed: 0,id,accession,ec_number,sequence,reaction_core
0,35547,P9WPP7,1.14.19.70,MTATVLLEVPFSARGDRIPDAVAELRTREPIRKVRTITGAEAWLVS...,O=[PH](OCC1CCC(N2C=CCC=C2)O1)O[PH](=O)OCC1CCC(...
1,35547,P9WPP7,1.14.19.70,MTATVLLEVPFSARGDRIPDAVAELRTREPIRKVRTITGAEAWLVS...,O=C1NC(Cc2ccccc2)C(=O)NC1Cc1ccccc1>>O=C1NC2Cc3...
2,35547,A0A140IL90,1.14.19.50,MATSSSAWLMFSDHYPEILIAIACFLIFSLLLSARSSSEDSLPYNW...,O=[PH](OCC1CCC(N2C=CCC=C2)O1)O[PH](=O)OCC1CCC(...
3,35547,A0A140IL90,1.14.19.50,MATSSSAWLMFSDHYPEILIAIACFLIFSLLLSARSSSEDSLPYNW...,O=C1NC(Cc2ccccc2)C(=O)NC1Cc1ccccc1>>O=C1NC2Cc3...
4,35547,A0A2H5AIZ9,1.14.19.50,MATSSSAWLMFSDHYPEILIAIACFLIFSLLLSARSSSKDSLPYNW...,O=[PH](OCC1CCC(N2C=CCC=C2)O1)O[PH](=O)OCC1CCC(...


In [16]:
merged_df.to_csv('merged.csv', index = False)