In [15]:
import requests

# Query parameters
gene = "ADNP"
organism = "9606"  # Human
query = f'gene_exact:{gene} AND organism_id:{organism} AND reviewed:true'

# UniProt API endpoint
url = "https://rest.uniprot.org/uniprotkb/search"
params = {
    "query": query,
    "fields": "accession,gene_names,protein_name,organism_name",
    "format": "json",
    "size": 500
}

# Make the request
res = requests.get(url, params=params)
data = res.json()
results_list = data.get("results", [])

# Print the query and results
print(f"Query: {query}")
print(f"\nNumber of results: {len(results_list)}")

if results_list:
    print("\nResults:")
    for result in results_list:
        print(f"\nAccession: {result.get('primaryAccession')}")
        print(f"Gene Names: {result.get('genes', [])}")
        print(f"Protein Name: {result.get('proteinName', {}).get('value', '')}")
        print(f"Organism: {result.get('organism', {}).get('scientificName', '')}")
else:
    print("No results found")

Query: gene_exact:ADNP AND organism_id:9606 AND reviewed:true

Number of results: 1

Results:

Accession: Q9H2P0
Gene Names: [{'geneName': {'value': 'ADNP'}, 'synonyms': [{'value': 'ADNP1'}, {'value': 'KIAA0784'}]}]
Protein Name: 
Organism: Homo sapiens


In [19]:
import pandas as pd
import requests
from tqdm import tqdm

# Read the Excel file
df1 = pd.read_excel('/Users/conny/Desktop/AlphaFold/Summer Project/data/Paper 3 data.xlsx', 
                    sheet_name="Significant_Prots_GO", 
                    header=3,
                    usecols=[0, 1])

# Rename columns
df1.columns = ['Gene Symbol', 'GO Annotation']

# Remove any rows where either Gene Symbol or GO Annotation is NaN
df1 = df1.dropna(subset=['Gene Symbol', 'GO Annotation'])

# Create the list of tuples
gene_desc_list = list(zip(df1['Gene Symbol'], df1['GO Annotation']))

# Print all genes and their GO annotations
print("All genes in the dataset:")
for gene, go in gene_desc_list:
    print(f"\nGene: {gene}")
    print(f"GO Annotation: {go}")

# Now let's query UniProt for each gene
organism = "9606"  # Human
results = []
failed = []
multiple_reviewed = []

print(f"\nTotal number of queries to process: {len(gene_desc_list)}")

# In the main loop where we process each gene
for gene, go_annotation in tqdm(gene_desc_list, desc="Querying UniProt"):
    # Query using the exact gene name format from our list
    query = f'gene_exact:{gene} AND organism_id:{organism} AND reviewed:true'
    url = "https://rest.uniprot.org/uniprotkb/search"
    params = {
        "query": query,
        "fields": "accession,gene_names,protein_name,organism_name",
        "format": "json",
        "size": 500
    }
    try:
        res = requests.get(url, params=params, timeout=30)
        data = res.json()
        results_list = data.get("results", [])
        
        if not results_list:
            failed.append({
                "gene": gene,
                "go_annotation": go_annotation
            })
        elif len(results_list) > 1:
            # If multiple reviewed entries found, only add to multiple_reviewed
            multiple_reviewed.append({
                "gene": gene,
                "go_annotation": go_annotation,
                "accessions": [r.get("primaryAccession") for r in results_list],
                "protein_names": [r.get("proteinName", {}).get("value", "") for r in results_list]
            })
        else:
            # Only add to results if there's exactly one reviewed entry
            for result in results_list:
                results.append({
                    "gene": gene,
                    "go_annotation": go_annotation,
                    "accession": result.get("primaryAccession"),
                    "protein_name": result.get("proteinName", {}).get("value", "")
                })
            
    except Exception as e:
        failed.append({
            "gene": gene,
            "go_annotation": go_annotation,
            "error": str(e)
        })
        print(f"Error for {gene}: {e}")

# Save results
results_df = pd.DataFrame(results)
results_df.to_csv("paper3_gene_results.csv", index=False)

# Save failed queries
if failed:
    failed_df = pd.DataFrame(failed)
    failed_df.to_csv("paper3_failed_queries.csv", index=False)

# Save multiple reviewed results
if multiple_reviewed:
    multiple_df = pd.DataFrame(multiple_reviewed)
    multiple_df.to_csv("paper3_multiple_reviewed.csv", index=False)

# Print summary statistics
print("\nSummary Statistics:")
print(f"Total queries processed: {len(gene_desc_list)}")
print(f"Successfully retrieved entries: {len(results)}")
print(f"Failed queries: {len(failed)}")
print(f"Genes with multiple reviewed entries: {len(multiple_reviewed)}")

# Print failed queries
if failed:
    print(f"\nFailed queries ({len(failed)}):")
    for item in failed:
        print(f"{item['gene']}: {item['go_annotation']}")
    print("\nSaved failed queries to paper3_failed_queries.csv")

# Print multiple reviewed results
if multiple_reviewed:
    print(f"\nMultiple reviewed entries ({len(multiple_reviewed)}):")
    for item in multiple_reviewed:
        print(f"\nGene: {item['gene']}")
        print(f"GO Annotation: {item['go_annotation']}")
        print("Accessions and protein names:")
        for acc, name in zip(item['accessions'], item['protein_names']):
            print(f"  {acc}: {name}")
    print("\nSaved multiple reviewed results to paper3_multiple_reviewed.csv")

print("\nOutput files:")
print("1. paper3_gene_results.csv - Main results")
print("2. paper3_failed_queries.csv - Failed queries")
print("3. paper3_multiple_reviewed.csv - Multiple reviewed entries")

All genes in the dataset:

Gene: ADNP
GO Annotation: Chromatin Binding (GO:0003682)

Gene: AHCTF1
GO Annotation: Nucleic Acid Binding (GO:0003676)

Gene: ALKBH2
GO Annotation: Modifying Enzyme (GO:0006304)

Gene: ALX4
GO Annotation: Transcriptional Regulator (GO:0140110)

Gene: ALYREF
GO Annotation: Nucleic Acid Binding (GO:0003676)

Gene: ANKRD11
GO Annotation: Other

Gene: ANP32E
GO Annotation: Other

Gene: ARID3B
GO Annotation: Nucleic Acid Binding (GO:0003676)

Gene: ARL15
GO Annotation: Other

Gene: ARL6IP4
GO Annotation: Nucleic Acid Binding (GO:0003676)

Gene: ARNT
GO Annotation: Transcriptional Regulator (GO:0140110)

Gene: ASH1L
GO Annotation: Chromatin Binding (GO:0003682)

Gene: ASTE1
GO Annotation: Other

Gene: ATAD5
GO Annotation: Nucleic Acid Binding (GO:0003676)

Gene: ATF7
GO Annotation: Transcriptional Regulator (GO:0140110)

Gene: ATRX
GO Annotation: Modifying Enzyme (GO:0006304)

Gene: AUNIP
GO Annotation: Cytoskeletal Protein (GO:0005856)

Gene: AURKA
GO Annotation:

Querying UniProt: 100%|██████████| 441/441 [03:17<00:00,  2.23it/s]


Summary Statistics:
Total queries processed: 441
Successfully retrieved entries: 413
Failed queries: 17
Genes with multiple reviewed entries: 11

Failed queries (17):
BPGAP1: Other
DKFZp564C172: Other
DKFZp666G145: Nucleic Acid Binding (GO:0003676)
DKFZp686H0575: Transcriptional Regulator (GO:0140110)
DKFZp686L1814: Other
DKFZp686L2367: Other
DKFZp686P16143: Other
EWSR1/ATF1 fusion: Other
hCG_2042749: Other
hCG_20884: Other
hCG_96198: Chromatin Binding (GO:0003682)
HCTP4: Cytoskeletal Protein (GO:0005856)
HEL106: Nucleic Acid Binding (GO:0003676)
HEL23: Transcriptional Regulator (GO:0140110)
LOC115098: Other
LTG9: Other
MGC12965: Other

Saved failed queries to paper3_failed_queries.csv

Multiple reviewed entries (11):

Gene: DDX21
GO Annotation: Nucleic Acid Binding (GO:0003676)
Accessions and protein names:
  Q9NR30: 
  Q9NY93: 

Gene: ERCC6
GO Annotation: Chromatin Binding (GO:0003682)
Accessions and protein names:
  Q03468: 
  P0DP91: 

Gene: LIG3
GO Annotation: Nucleic Acid Bindin




In [20]:
import pandas as pd
import requests
from tqdm import tqdm

# Read the failed queries CSV
failed_df = pd.read_csv('paper3_failed_queries.csv')

# Read existing gene results to append to
existing_results = pd.read_csv('paper3_gene_results.csv')

# Initialize lists for tracking results
new_results = []
still_multiple = []
no_results = []

# Perform search for each failed query
for _, row in tqdm(failed_df.iterrows(), desc="Searching failed queries"):
    gene = row['gene']
    go_annotation = row['go_annotation']
    
    # Simple query with just gene name and taxonomy, without reviewed criterion
    query = f'gene_exact:{gene} AND organism_id:9606'
    
    url = "https://rest.uniprot.org/uniprotkb/search"
    params = {
        "query": query,
        "fields": "accession,gene_names,protein_name,organism_name,reviewed",
        "format": "json",
        "size": 500
    }
    
    try:
        res = requests.get(url, params=params, timeout=30)
        data = res.json()
        results_list = data.get("results", [])
        
        if not results_list:
            # No results found
            no_results.append({
                "gene": gene,
                "go_annotation": go_annotation
            })
        elif len(results_list) > 1:
            # Multiple results found
            still_multiple.append({
                "gene": gene,
                "go_annotation": go_annotation,
                "num_results": len(results_list),
                "accessions": [r.get("primaryAccession") for r in results_list],
                "protein_names": [r.get("proteinName", {}).get("value", "") for r in results_list],
                "reviewed_status": [r.get("reviewed", False) for r in results_list]
            })
        else:
            # Single result found
            result = results_list[0]
            new_results.append({
                "gene": gene,
                "go_annotation": go_annotation,
                "accession": result.get("primaryAccession"),
                "protein_name": result.get("proteinName", {}).get("value", ""),
                "reviewed": result.get("reviewed", False)
            })
            
    except Exception as e:
        print(f"Error for {gene}: {e}")

# Convert new results to DataFrame
new_results_df = pd.DataFrame(new_results)

# Combine with existing results
combined_results = pd.concat([existing_results, new_results_df], ignore_index=True)

# Remove any duplicates based on gene and accession
combined_results = combined_results.drop_duplicates(subset=['gene', 'accession'])

# Save the final results
combined_results.to_csv("paper3_gene_results_final.csv", index=False)

# Print detailed statistics
print(f"\nOriginal results count: {len(existing_results)}")
print(f"New results added: {len(new_results)}")
print(f"Final combined results count: {len(combined_results)}")
print("\nSaved updated results to paper3_gene_results_final.csv")

# Print detailed analysis
print("\nFailed Queries Analysis:")
print(f"Total failed queries processed: {len(failed_df)}")
print(f"Successfully resolved to single result: {len(new_results)}")
print(f"Still have multiple results: {len(still_multiple)}")
print(f"No results found: {len(no_results)}")

if still_multiple:
    print("\n=== Genes Still Having Multiple Results ===")
    for item in still_multiple:
        print(f"\nGene: {item['gene']}")
        print(f"GO Annotation: {item['go_annotation']}")
        print(f"Number of results: {item['num_results']}")
        print("Accessions and protein names:")
        for acc, name, reviewed in zip(item['accessions'], item['protein_names'], item['reviewed_status']):
            print(f"  {acc}: {name} (Reviewed: {reviewed})")

if no_results:
    print("\n=== Genes With No Results ===")
    for item in no_results:
        print(f"{item['gene']}: {item['go_annotation']}")

# Print statistics about reviewed vs unreviewed entries
if 'reviewed' in new_results_df.columns:
    reviewed_count = new_results_df['reviewed'].sum()
    unreviewed_count = len(new_results_df) - reviewed_count
    print(f"\nNew entries statistics:")
    print(f"Reviewed entries: {reviewed_count}")
    print(f"Unreviewed entries: {unreviewed_count}")

Searching failed queries: 17it [00:07,  2.13it/s]


Original results count: 413
New results added: 12
Final combined results count: 425

Saved updated results to paper3_gene_results_final.csv

Failed Queries Analysis:
Total failed queries processed: 17
Successfully resolved to single result: 12
Still have multiple results: 1
No results found: 4

=== Genes Still Having Multiple Results ===

Gene: EWSR1/ATF1 fusion
GO Annotation: Other
Number of results: 2
Accessions and protein names:
  F1JVV6:  (Reviewed: False)
  F1JVV5:  (Reviewed: False)

=== Genes With No Results ===
DKFZp686L1814: Other
hCG_20884: Other
LOC115098: Other
LTG9: Other

New entries statistics:
Reviewed entries: 0
Unreviewed entries: 12





In [22]:
import pandas as pd
import requests
from tqdm import tqdm
import time

# Read the multiple reviewed CSV
multiple_df = pd.read_csv('paper3_multiple_reviewed.csv')
print(f"\nOriginal number of multiple reviewed proteins: {len(multiple_df)}")

# Read existing gene results to append to
existing_results = pd.read_csv('paper3_gene_results_final.csv')

# Initialize lists for tracking results
new_results = []
still_multiple = []
no_results = []

# Function to clean GO annotation for search
def clean_go_annotation(go):
    # Remove common prefixes and clean up the GO annotation
    go = go.lower()
    # Remove any parenthetical information
    go = go.split('(')[0].strip()
    # Remove any semicolons and take first part
    go = go.split(';')[0].strip()
    return go

# Perform refined search for each multiple reviewed entry
for _, row in tqdm(multiple_df.iterrows(), desc="Refining multiple reviewed entries"):
    gene = row['gene']
    go_annotation = row['go_annotation']
    
    # Clean the GO annotation for search
    clean_go = clean_go_annotation(go_annotation)
    
    # Create a more specific query using both gene and GO annotation
    query = f'gene_exact:{gene} AND organism_id:9606 AND reviewed:true AND go:"{clean_go}"'
    
    url = "https://rest.uniprot.org/uniprotkb/search"
    params = {
        "query": query,
        "fields": "accession,gene_names,protein_name,organism_name,go",
        "format": "json",
        "size": 500
    }
    
    try:
        res = requests.get(url, params=params, timeout=30)
        data = res.json()
        results_list = data.get("results", [])
        
        if not results_list:
            # No results found
            no_results.append({
                "gene": gene,
                "go_annotation": go_annotation,
                "query": query
            })
        elif len(results_list) > 1:
            # Still multiple results
            still_multiple.append({
                "gene": gene,
                "go_annotation": go_annotation,
                "num_results": len(results_list),
                "accessions": [r.get("primaryAccession") for r in results_list],
                "protein_names": [r.get("proteinName", {}).get("value", "") for r in results_list],
                "go_terms": [r.get("go", []) for r in results_list]
            })
        else:
            # Single result found
            for result in results_list:
                new_results.append({
                    "gene": gene,
                    "go_annotation": go_annotation,
                    "accession": result.get("primaryAccession"),
                    "protein_name": result.get("proteinName", {}).get("value", ""),
                    "go_terms": result.get("go", [])
                })
            
    except Exception as e:
        print(f"Error for {gene}: {e}")

# Convert new results to DataFrame
new_results_df = pd.DataFrame(new_results)

# Combine with existing results
combined_results = pd.concat([existing_results, new_results_df], ignore_index=True)

# Remove any duplicates based on gene and accession
combined_results = combined_results.drop_duplicates(subset=['gene', 'accession'])

# Save the updated results
combined_results.to_csv("paper3_gene_results_finalfinal.csv", index=False)

# Save still multiple results
if still_multiple:
    still_multiple_df = pd.DataFrame(still_multiple)
    still_multiple_df.to_csv("paper3_still_multiple.csv", index=False)

# Save no results
if no_results:
    no_results_df = pd.DataFrame(no_results)
    no_results_df.to_csv("paper3_no_results.csv", index=False)

# Print detailed summary
print("\n=== Detailed Results Summary ===")
print(f"Original multiple reviewed proteins: {len(multiple_df)}")
print(f"Successfully resolved to single result: {len(new_results)}")
print(f"Still have multiple results: {len(still_multiple)}")
print(f"No results found: {len(no_results)}")

if still_multiple:
    print("\n=== Genes Still Having Multiple Results ===")
    for item in still_multiple:
        print(f"\nGene: {item['gene']}")
        print(f"GO Annotation: {item['go_annotation']}")
        print(f"Number of results: {item['num_results']}")
        print("Accessions and protein names:")
        for acc, name, go_terms in zip(item['accessions'], item['protein_names'], item['go_terms']):
            print(f"  {acc}: {name}")
            print(f"  GO Terms: {go_terms}")

if no_results:
    print("\n=== Genes With No Results ===")
    for item in no_results:
        print(f"{item['gene']}: {item['go_annotation']}")

print(f"\nOriginal results count: {len(existing_results)}")
print(f"New results added: {len(new_results)}")
print(f"Final combined results count: {len(combined_results)}")
print("\nSaved updated results to paper3_gene_results_finalfinal.csv")


Original number of multiple reviewed proteins: 11


Refining multiple reviewed entries: 11it [00:07,  1.44it/s]


=== Detailed Results Summary ===
Original multiple reviewed proteins: 11
Successfully resolved to single result: 6
Still have multiple results: 2
No results found: 3

=== Genes Still Having Multiple Results ===

Gene: DDX21
GO Annotation: Nucleic Acid Binding (GO:0003676)
Number of results: 2
Accessions and protein names:
  Q9NY93: 
  GO Terms: []
  Q9NR30: 
  GO Terms: []

Gene: TMPO
GO Annotation: Nucleic Acid Binding (GO:0003676)
Number of results: 2
Accessions and protein names:
  P42166: 
  GO Terms: []
  P42167: 
  GO Terms: []

=== Genes With No Results ===
PC4: Transcriptional Regulator (GO:0140110)
SCRIB: Other
SF1: Transcriptional Regulator (GO:0140110)

Original results count: 425
New results added: 6
Final combined results count: 431

Saved updated results to paper3_gene_results_finalfinal.csv





In [4]:
import pandas as pd
import requests
from tqdm import tqdm
import sys
import json
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import random

def fetch_sequence_and_info(protein_id):
    """
    Fetch sequence, protein name, and gene name from UniProt
    """
    url = f"https://rest.uniprot.org/uniprotkb/{protein_id}"
    params = {"fields": ["sequence", "protein_name", "gene_names"]}
    headers = {"accept": "application/json"}
    
    response = requests.get(url, headers=headers, params=params)
    if response.ok:
        data = response.json()
        sequence = data.get("sequence", {}).get("value", None)
        protein_name = data.get("proteinDescription", {}).get("recommendedName", {}).get("fullName", {}).get("value", "")
        gene_name = data.get("genes", [{}])[0].get("geneName", {}).get("value", "")
        return sequence, protein_name, gene_name
    else:
        print(f"Warning: Failed to fetch {protein_id}", file=sys.stderr)
        return None, None, None

def main():
    # Read the CSV file with accession numbers
    csv_file = "/Users/conny/Desktop/AlphaFold/Summer Project/result/3 Paper Individual/paper3_fasta_results.csv"
    
    # Read the relevant columns
    df = pd.read_csv(csv_file)
    
    # Extract protein IDs and GO annotations
    protein_data = df[['accession', 'go_annotation']]

    print(f"Processing {len(protein_data)} proteins...")
    
    # Get sequences and create results
    results = []
    for _, row in tqdm(protein_data.iterrows(), desc="Fetching sequences"):
        accession = row['accession']
        go_annotation = row['go_annotation']
        
        sequence, protein_name, gene_name = fetch_sequence_and_info(accession)
        if sequence:
            results.append({
                'accession': accession,
                'gene': gene_name,  # Added gene name
                'go_annotation': go_annotation,
                'protein_name': protein_name,
                'sequence': sequence
            })
    
    # Save results to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv("paper3_fasta_results.csv", index=False)
    
    # Save sequences to FASTA file with gene names and protein names in headers
    with open("paper3_fasta_results.fasta", "w") as f:
        for result in results:
            f.write(f">{result['accession']}\n")
            f.write(f"{result['sequence']}\n")
    
    print("\nResults summary:")
    print(f"- Total proteins processed: {len(protein_data)}")
    print(f"- Successfully retrieved: {len(results)} sequences")
    print("\nOutput files:")
    print("- paper3_fasta_results.csv: Contains all retrieved entries with gene names, GO annotations, protein names, and sequences")
    print("- paper3_fasta_results.fasta: Contains protein sequences in FASTA format with gene names and protein names in headers")

if __name__ == "__main__":
    main()

Processing 440 proteins...


Fetching sequences: 440it [03:07,  2.34it/s]


Results summary:
- Total proteins processed: 440
- Successfully retrieved: 439 sequences

Output files:
- paper3_fasta_results.csv: Contains all retrieved entries with gene names, GO annotations, protein names, and sequences
- paper3_fasta_results.fasta: Contains protein sequences in FASTA format with gene names and protein names in headers



