In [1]:
import pandas as pd
import requests
import io
import os
import time
import urllib.parse

def _fetch_proteome_sequences(proteome_id, organism_id, timeout):
    """Download the accession/sequence table of one proteome from UniProtKB.

    Args:
        proteome_id: Proteome identifier used in the `proteome:` query term.
        organism_id: Organism identifier used in the `organism_id:` query term.
        timeout: Timeout in seconds handed to `requests.get`.

    Returns:
        A DataFrame with exactly two string columns named 'Entry' and 'Sequence'.

    Raises:
        requests.RequestException: On connection problems, timeouts or HTTP error codes.
        ValueError: If the response does not contain exactly two columns.
        pandas.errors.EmptyDataError: If the response body is empty.
    """
    # Restrict the query to both identifiers; quote() makes the query string URL-safe.
    query_string = f"proteome:{proteome_id} AND organism_id:{organism_id}"
    encoded_query = urllib.parse.quote(query_string)
    # Only 'accession' and 'sequence' are requested, in that order.
    query_url = f"https://rest.uniprot.org/uniprotkb/stream?query={encoded_query}&format=tsv&fields=accession,sequence"

    # The whole body is read through .text below, so stream=True would bring nothing;
    # the timeout keeps one stalled connection from blocking the entire run.
    response = requests.get(query_url, timeout=timeout)
    response.raise_for_status()

    # dtype=str + keep_default_na=False: keep every cell exactly as sent. With pandas'
    # default NA parsing a cell such as "NA" would silently become NaN and be written
    # out as an empty field. Malformed lines are deliberately NOT skipped, because
    # dropping rows would corrupt the count verification done by the caller.
    sequences = pd.read_csv(
        io.StringIO(response.text),
        sep='\t',
        dtype=str,
        keep_default_na=False,
    )
    if sequences.shape[1] != 2:
        raise ValueError(
            f"Expected 2 columns (accession, sequence) from UniProt, got {sequences.shape[1]}: "
            f"{list(sequences.columns)}"
        )
    # Normalise the header names positionally to the output format.
    sequences.columns = ['Entry', 'Sequence']
    return sequences


def download_and_verify_proteomes(input_csv, data_output_csv, metadata_output_csv,
                                  request_timeout=60, request_delay=0.5):
    """
    Steps 2-5: Reads processed proteome list, downloads sequences from UniProt,
    verifies counts, and saves data to CSV and metadata to a separate CSV.

    Args:
        input_csv: Path of a CSV containing at least the columns 'Proteome Id',
            'Organism Id' and 'Protein count'.
        data_output_csv: Path of the sequence CSV ('Proteome Id', 'Organism Id',
            'Entry', 'Sequence'). Created with a header if missing, otherwise rows
            are appended -- re-running against an existing file therefore
            duplicates the proteomes that were already downloaded.
        metadata_output_csv: Path of the per-proteome report (expected vs.
            downloaded counts, status, error text). Overwritten on every call.
        request_timeout: Timeout in seconds for each UniProt request.
        request_delay: Pause in seconds after each proteome; falsy disables it.

    Returns:
        None. If the input file is missing or lacks a required column, a message
        is printed and nothing is written.

    Notes:
        A failure while processing one proteome is printed and recorded in the
        metadata report with status 'Error'; the run then continues with the next
        proteome. The report is written even when the loop is interrupted, so the
        rows already appended to the data file stay accounted for.
    """

    # --- Step 2: Read the proteome id and organism id from input_file ---
    print(f"Reading input file: {input_csv}...")
    try:
        df = pd.read_csv(input_csv)
    except FileNotFoundError:
        print(f"Error: The file '{input_csv}' was not found.")
        return

    # Check for necessary columns
    required_columns = ['Proteome Id', 'Organism Id', 'Protein count']
    if not all(col in df.columns for col in required_columns):
        print(f"Error: Input file must contain columns: {required_columns}")
        return

    # One record per processed proteome. The explicit column list guarantees a
    # header line in the report even when no record was collected.
    metadata_records = []
    metadata_columns = ['Proteome Id', 'Organism Id', 'Expected Count',
                        'Downloaded Count', 'Mismatch Count', 'Status', 'Error']

    # Initialize Output CSV (Write Header if file doesn't exist)
    # 'Entry' (accession) is kept next to the sequence because it identifies the protein.
    header = ["Proteome Id", "Organism Id", "Entry", "Sequence"]

    if not os.path.exists(data_output_csv):
        pd.DataFrame(columns=header).to_csv(data_output_csv, index=False)
        print(f"Created new data file: {data_output_csv}")
    else:
        print(f"Appending to existing data file: {data_output_csv}")

    print("Starting download process...")

    total = len(df)
    try:
        # enumerate() gives the true position for the progress counter; the index
        # label of the input frame is not guaranteed to be 0..n-1.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            proteome_id = str(row['Proteome Id']).strip()
            organism_id = str(row['Organism Id']).strip()
            expected_count = int(row['Protein count']) if pd.notnull(row['Protein count']) else 0

            try:
                # --- Step 3: Download protein sequences from UniProtKB in TSV format ---
                downloaded_df = _fetch_proteome_sequences(proteome_id, organism_id, request_timeout)

                # --- Step 4: Verify counts ---
                downloaded_count = len(downloaded_df)
                mismatch_count = abs(expected_count - downloaded_count)
                status = "Match" if mismatch_count == 0 else "Mismatch"
                print(f"[{position}/{total}] {proteome_id}: Exp={expected_count}, Got={downloaded_count} ({status})")

                # --- Step 5: Save the final data to a csv file ---
                # Prepend the identifier columns so the layout matches `header`.
                downloaded_df.insert(0, 'Proteome Id', proteome_id)
                downloaded_df.insert(1, 'Organism Id', organism_id)

                # mode='a' appends, header=False prevents writing the header again
                downloaded_df.to_csv(data_output_csv, mode='a', header=False, index=False)

                metadata_records.append({
                    'Proteome Id': proteome_id,
                    'Organism Id': organism_id,
                    'Expected Count': expected_count,
                    'Downloaded Count': downloaded_count,
                    'Mismatch Count': mismatch_count,
                    'Status': status,
                    'Error': 'None'
                })

            except Exception as e:
                # Best effort: one failing proteome must not abort the whole batch.
                print(f"[{position}/{total}] {proteome_id}: Error - {e}")

                metadata_records.append({
                    'Proteome Id': proteome_id,
                    'Organism Id': organism_id,
                    'Expected Count': expected_count,
                    'Downloaded Count': 0,
                    'Mismatch Count': expected_count,
                    'Status': 'Error',
                    'Error': str(e)
                })

            # Sleep briefly to be polite to the UniProt API
            if request_delay:
                time.sleep(request_delay)
    finally:
        # --- Generate Metadata Report ---
        # Written in `finally` so an interrupted run still leaves a report that
        # covers everything appended to the data file so far.
        metadata_df = pd.DataFrame(metadata_records, columns=metadata_columns)
        metadata_df.to_csv(metadata_output_csv, index=False)

    print("\nProcessing Complete!")
    print(f"Sequence Data Saved to: {data_output_csv}")
    print(f"Metadata Report Saved to: {metadata_output_csv}")

# ==========================================
# Execution Configuration
# ==========================================

# Locations of the input list and of the two files produced by the download
input_file = 'processed_proteomes_thermoproteota_data.csv'  # proteome list from Step 1
data_file = 'thermoproteota_data.csv'  # sequence table written by the download
metadata_file = 'thermoproteota_metadata.csv'  # per-proteome count report

# Start the download only when this code runs as the main module
if __name__ == "__main__":
    download_and_verify_proteomes(
        input_csv=input_file,
        data_output_csv=data_file,
        metadata_output_csv=metadata_file,
    )

Reading input file: processed_proteomes_thermoproteota_data.csv...
Created new data file: thermoproteota_data.csv
Starting download process...
[1/250] UP000005867: Exp=2825, Got=2825 (Match)
[2/250] UP000053352: Exp=1602, Got=1602 (Match)
[3/250] UP000193404: Exp=2641, Got=2641 (Match)
[4/250] UP000196694: Exp=1990, Got=1990 (Match)
[5/250] UP000248044: Exp=2858, Got=2858 (Match)
[6/250] UP000298568: Exp=2310, Got=2310 (Match)
[7/250] UP000322983: Exp=2770, Got=2770 (Match)
[8/250] UP000423396: Exp=2042, Got=2042 (Match)
[9/250] UP000427373: Exp=2759, Got=2759 (Match)
[10/250] UP000440125: Exp=2293, Got=2293 (Match)
[11/250] UP000509301: Exp=2154, Got=2154 (Match)
[12/250] UP000554766: Exp=2265, Got=2265 (Match)
[13/250] UP000593766: Exp=1399, Got=1399 (Match)
[14/250] UP000594121: Exp=1763, Got=1763 (Match)
[15/250] UP000694036: Exp=3121, Got=3121 (Match)
[16/250] UP000825123: Exp=2971, Got=2971 (Match)
[17/250] UP001319921: Exp=3274, Got=3274 (Match)
[18/250] UP001432202: Exp=2714, G

In [3]:
# Reload the sequence table from `data_file` and show how many rows it holds
df = pd.read_csv(data_file)
df.shape[0]

229906