In [1]:
# Check the protein IDs
import pandas as pd
import requests

# Load the spreadsheet; header=2 tells pandas to take the column names from
# row index 2 (the third row) of the sheet.
excel_path = '/Users/conny/Desktop/AlphaFold/Paper 3 data.xlsx'
df1 = pd.read_excel(excel_path, header=2)

# Distinct, non-missing entries of the 'Protein IDs' column. The printed
# output shows that a single entry can hold several ';'-separated IDs.
protein_id_series = df1['Protein IDs']
protein_column = protein_id_series.dropna().unique()
print(protein_column)


['Q92522' 'J3QSH4;Q14119;J9JIC7'
 'Q53EW3;B2R5R9;P48382;F8W689;P48382-2;Q9UG77;A0A0A0MSQ2;H0Y4B4;F6UE82;F6R6G4;F6X9D6;A0A0A0MSM9;F6S3S0;A0A0A0MT34;F8WDU3;F8WFE4;F2Z2G0'
 ...
 'A0A0S2Z693;Q96RQ3;E9PHF7;Q68D27;F5GYT8;E9PG35;B4DH16;G5E9X5;B4DHW5;B7ZAW3;F8WDI3'
 'P05165-2;P05165;B2RDE0;P05165-3;A0A1B0GU58;A0A1B0GUX9;A0A1B0GWI4;A0A1B0GWA1;A0A1B0GTR1;H0Y5U0;Q5JTW6;Q16380;Q5JVH2;A0A2P9ASN7'
 'Q60FE6;Q60FE5;P21333-2;P21333;A6NDY9;A0A087WWY3;Q8TES4;Q6NXF2;Q96C61;B4E2F9;B4DTD5;F8WE98;H0Y5C6;Q86TQ3;H0Y5F3;Q2VP91;Q14315-2;Q14315;A0A024R321;O75369-5;O75369-4;O75369-6;O75369-3;O75369-2;O75369-9;O75369;O75369-8;Q59H94']


In [6]:
import requests
import sys
import json
import pandas as pd
from tqdm import tqdm
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import random

# IDs to look up: the unique 'Protein IDs' entries produced by the loading cell.
protein_ids = protein_column

# Headers and query parameters sent with every request
headers = {"accept": "application/json"}
params = {"fields": ["sequence"]}

# Destination files: retrieved sequences and the IDs that could not be fetched
output_file = "p3_protein_sequences.fasta"
failed_queries_file = "failed_queries.csv"

# Transport-level retry policy: up to 3 retries, with a backoff scaled by
# backoff_factor, for the listed HTTP status codes.
retry_strategy = Retry(
    status_forcelist=[429, 500, 502, 503, 504],
    backoff_factor=1,
    total=3,
)

# One shared session so both URL schemes go through the same retrying adapter
session = requests.Session()
adapter = HTTPAdapter(max_retries=retry_strategy)
for url_prefix in ("https://", "http://"):
    session.mount(url_prefix, adapter)

def _retry_after_seconds(header_value, default=60):
    """Convert a ``Retry-After`` header value into a non-negative wait in seconds.

    The header may carry either delta-seconds (``"120"``) or an HTTP-date
    (``"Wed, 21 Oct 2015 07:28:00 GMT"``). A missing or unparseable value
    yields ``default``.
    """
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    if header_value is None:
        return default
    text = str(header_value).strip()

    # Most common form: a plain number of seconds.
    try:
        return max(0, int(text))
    except ValueError:
        pass

    # Otherwise try the HTTP-date form; the exception type raised on a
    # malformed date differs between Python versions, so catch both.
    try:
        when = parsedate_to_datetime(text)
    except (TypeError, ValueError):
        return default
    if when is None:
        return default
    if when.tzinfo is None:
        when = when.replace(tzinfo=timezone.utc)
    remaining = (when - datetime.now(timezone.utc)).total_seconds()
    return max(0, int(remaining))


def fetch_sequence(protein_id, max_retries=3, timeout=30):
    """
    Fetch the sequence string for one ID from the UniProt REST API.

    Args:
        protein_id: ID appended to ``https://rest.uniprot.org/uniprotkb/``.
        max_retries: Maximum number of attempts made by this function for
            timeouts and HTTP 429 responses.
        timeout: Per-request timeout in seconds passed to ``session.get``.

    Returns:
        The value of ``data["sequence"]["value"]`` from the JSON response, or
        ``None`` when the request fails, the attempts are exhausted, or the
        response does not contain that field.

    Every attempt is preceded by a random 0.5-2 s pause to throttle requests.
    Uses the module-level ``session``, ``headers`` and ``params``.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{protein_id}"

    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            # Add random delay between requests (0.5 to 2 seconds)
            time.sleep(random.uniform(0.5, 2))

            response = session.get(
                url,
                headers=headers,
                params=params,
                timeout=timeout
            )

            if response.ok:
                # A 2xx body that is not valid JSON is treated as a failed
                # lookup instead of propagating a decode error.
                try:
                    data = response.json()
                except ValueError:
                    print(f"Warning: Invalid JSON in response for {protein_id}", file=sys.stderr)
                    return None
                # Tolerate a missing/null "sequence" entry or a non-object payload.
                sequence_info = data.get("sequence") if isinstance(data, dict) else None
                if isinstance(sequence_info, dict):
                    return sequence_info.get("value", None)
                return None
            elif response.status_code == 429:  # Too Many Requests
                # NOTE(review): the session-level Retry policy also lists 429,
                # so urllib3 may handle rate limiting before this branch is
                # ever reached - confirm against the session configuration.
                if is_last_attempt:
                    # No further request will follow, so waiting is pointless.
                    print(f"Warning: Rate limit still in effect after {max_retries} attempts for {protein_id}", file=sys.stderr)
                    return None
                retry_after = _retry_after_seconds(response.headers.get('Retry-After'))
                print(f"Rate limit hit, waiting {retry_after} seconds...", file=sys.stderr)
                time.sleep(retry_after)
                continue
            else:
                print(f"Warning: Failed to fetch {protein_id} (Status code: {response.status_code})", file=sys.stderr)
                return None

        except requests.exceptions.Timeout:
            if not is_last_attempt:
                # Exponential backoff with jitter before the next attempt
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                print(f"Timeout for {protein_id}, retrying in {wait_time:.2f} seconds...", file=sys.stderr)
                time.sleep(wait_time)
                continue
            else:
                print(f"Warning: Timeout after {max_retries} attempts for {protein_id}", file=sys.stderr)
                return None
        except requests.exceptions.RequestException as e:
            print(f"Warning: Request failed for {protein_id}: {str(e)}", file=sys.stderr)
            return None

    return None

# Track failed queries
failed_queries = []

# Maps each queried ID to its sequence, or to the "Sequence Not Found"
# placeholder when the lookup failed (kept so an ID is never queried twice).
results = {}
# Wrap tqdm around the outer loop
for entry in tqdm(protein_ids, desc="Fetching sequences"):
    # One spreadsheet entry may list several ';'-separated IDs
    for protein_id in entry.split(";"):
        protein_id = protein_id.strip()
        if not protein_id or protein_id in results:
            continue
        sequence = fetch_sequence(protein_id)
        if sequence:
            results[protein_id] = sequence
        else:
            results[protein_id] = "Sequence Not Found"
            failed_queries.append(protein_id)

# Save successful sequences only: writing the placeholder for failed lookups
# would put records with a bogus sequence into the FASTA file.
failed_lookup = set(failed_queries)
with open(output_file, "w") as file:
    for protein_id, sequence in results.items():
        if protein_id in failed_lookup:
            continue
        file.write(f">{protein_id}\n{sequence}\n")

# Save failed queries to CSV
if failed_queries:
    pd.DataFrame({"protein_id": failed_queries}).to_csv(failed_queries_file, index=False)
    print(f"\nFailed queries saved to {failed_queries_file}")

print(f"Protein sequences saved to {output_file}")
print(f"Successfully retrieved: {len(results) - len(failed_queries)} sequences")
print(f"Failed to retrieve: {len(failed_queries)} sequences")

Fetching sequences: 100%|██████████| 1553/1553 [4:59:28<00:00, 11.57s/it]  


Failed queries saved to failed_queries.csv
Protein sequences saved to p3_protein_sequences.fasta
Successfully retrieved: 11846 sequences
Failed to retrieve: 693 sequences



