Gene extraction

In [None]:
from Bio import Entrez
from http.client import IncompleteRead
import time


# Input file containing gene names (read one name per line; blank lines are skipped)
gene_file = "genes_by_category/matrisome/collagens.txt"

# Output FASTA file that receives the protein records fetched for those genes.
# NOTE(review): this path sits one directory above the input's "matrisome/"
# folder — confirm that is intended.
output_fasta = "genes_by_category/collagens.fasta"

# Function to fetch sequences robustly with retries
def robust_fetch(fetch_function, retries=3, wait_time=5):
    """Call ``fetch_function`` until it succeeds or the attempts run out.

    Only ``http.client.IncompleteRead`` is treated as a transient failure;
    any other exception propagates to the caller immediately.

    Args:
        fetch_function: Zero-argument callable that performs the fetch.
        retries: Maximum number of times ``fetch_function`` is called.
        wait_time: Seconds to sleep between two consecutive attempts.

    Returns:
        Whatever ``fetch_function`` returns on its first successful call.

    Raises:
        Exception: If every attempt ended in ``IncompleteRead``, or if
            ``retries`` is not positive so nothing was attempted. The last
            ``IncompleteRead`` (if any) is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(retries):
        try:
            return fetch_function()
        except IncompleteRead as e:
            last_error = e
            if attempt + 1 >= retries:
                # No further attempt follows, so sleeping here would only
                # delay the failure report.
                print(f"IncompleteRead error: {e}. Giving up after {retries} attempts.")
                break
            print(f"IncompleteRead error: {e}. Retrying {attempt + 1}/{retries}...")
            time.sleep(wait_time)
    # Same exception type and message as before; the cause is chained so the
    # underlying network error stays visible in tracebacks.
    raise Exception("Failed to fetch data after multiple retries") from last_error

# Fetch protein sequences from NCBI
def fetch_protein_sequence(gene_name):
    """Return the FASTA text of the first NCBI protein hit for ``gene_name``.

    The search is restricted to Homo sapiens and only the first ID returned
    by ``Entrez.esearch`` is fetched.

    Args:
        gene_name: Gene symbol used in the ``[Gene]`` field of the query.

    Returns:
        The FASTA record as returned by ``Entrez.efetch`` (``rettype="fasta"``,
        ``retmode="text"``), or ``None`` when the search yields no ID or any
        error occurs. Errors are printed, not raised, so one failing gene
        does not stop a batch run.
    """
    try:
        # Search for the gene in the NCBI protein database (restricted to Homo sapiens)
        query = f"{gene_name}[Gene] AND Homo sapiens[Organism]"
        search_handle = Entrez.esearch(db="protein", term=query, retmax=1)
        try:
            record = Entrez.read(search_handle)
        finally:
            # Close the handle even when parsing the search result fails.
            search_handle.close()

        if not record["IdList"]:
            print(f"No protein sequence found for {gene_name}")
            return None

        # Fetch the protein sequence by ID
        seq_id = record["IdList"][0]
        print(f"Fetching protein sequence for {gene_name} (ID: {seq_id})...")

        def download():
            # IncompleteRead is raised while the response body is being read,
            # so the read must happen inside the retried callable; each retry
            # opens a fresh handle and always closes it.
            fetch_handle = Entrez.efetch(
                db="protein", id=seq_id, rettype="fasta", retmode="text"
            )
            try:
                return fetch_handle.read()
            finally:
                fetch_handle.close()

        return robust_fetch(download)
    except Exception as e:
        # Deliberate best-effort: report the failure and move on.
        print(f"Error fetching {gene_name}: {e}")
        return None

# Load the gene list: one name per line, surrounding whitespace trimmed,
# empty lines dropped
with open(gene_file, "r") as file:
    trimmed = (raw.strip() for raw in file)
    genes = [name for name in trimmed if name]

# Fetch each gene in turn and write every non-empty result to the FASTA file,
# in the same order as the input list
with open(output_fasta, "w") as fasta_out:
    for gene in genes:
        protein_sequence = fetch_protein_sequence(gene)
        if not protein_sequence:
            # Nothing was fetched for this gene (already reported); skip it.
            continue
        fasta_out.write(protein_sequence)

print(f"Protein sequences saved to: {output_fasta}")