In [1]:
import pandas as pd

In [8]:
# Reload the newly uploaded file
file_path_new = 'gene-aging-mechanisms.tsv'
genes_df_new = pd.read_csv(file_path_new, sep='\t', header=None)

# Extract the first word from each row
genes_df_new['Gene'] = genes_df_new[0].str.extract(r'^"(\w+)"')

# Create a clean list of gene names
gene_list_new = genes_df_new['Gene'].dropna().tolist()

# Display the first few gene names
gene_list_new[:10]


[]

In [9]:
# Read the raw content of the file to understand its structure
with open(file_path_new, 'r') as file:
    raw_lines = file.readlines()

# Display the first few lines to inspect their structure
raw_lines[:10]


['"GHR"\t"\'INS/IGF-1 pathway dysregulation\'"\n',
 '"GHRH"\t"\'INS/IGF-1 pathway dysregulation\',\'intercellular communication impairment\'"\n',
 '"SHC1"\t"\'INS/IGF-1 pathway dysregulation\'"\n',
 '"POU1F1"\t"\'transcriptional alterations\'"\n',
 '"PROP1"\t"\'transcriptional alterations\'"\n',
 '"TP53"\t"\'mitochondrial DNA instability\',\'nuclear DNA instability\',\'chromatin remodeling\',\'transcriptional alterations\',\'impairment of the mitochondrial integrity and biogenesis\',\'accumulation of reactive oxygen species\',\'senescent cells accumulation\',\'stem cell exhaustion\',\'intercellular communication impairment\',\'disabled macroautophagy\'"\n',
 '"TERC"\t""\n',
 '"TERT"\t"\'nuclear DNA instability\',\'telomere attrition\',\'impairment of the mitochondrial integrity and biogenesis\',\'senescent cells accumulation\'"\n',
 '"ATM"\t"\'nuclear DNA instability\',\'telomere attrition\',\'alterations in histone modifications\',\'transcriptional alterations\',\'TOR pathway dysregul

In [11]:
import re

In [12]:
# Extract the first quoted word (gene name) from each row
gene_list_cleaned = [re.match(r'"(\w+)"', line).group(1) for line in raw_lines if re.match(r'"(\w+)"', line)]

# Display the first few extracted gene names
gene_list_cleaned[:10]


['GHR',
 'GHRH',
 'SHC1',
 'POU1F1',
 'PROP1',
 'TP53',
 'TERC',
 'TERT',
 'ATM',
 'PLAU']

In [13]:
# Save the extracted gene list to a text file
output_file_path = 'aging_related_genes.txt'

# Write the gene names to the file, one per line
with open(output_file_path, 'w') as file:
    file.write('\n'.join(gene_list_cleaned))

output_file_path


'aging_related_genes.txt'

In [14]:
import Bio

In [15]:
from Bio import Entrez
from Bio import SeqIO


# Input and output file paths
gene_file = 'aging_related_genes.txt'
output_fasta = 'aging_related_genes_nucleotide.fasta'

# Read the gene list from the file
with open(gene_file, 'r') as file:
    genes = [line.strip() for line in file if line.strip()]

# Open the output FASTA file
with open(output_fasta, 'w') as fasta_out:
    for gene in genes:
        try:
            # Search for the gene in the NCBI nucleotide database
            handle = Entrez.esearch(db="nucleotide", term=f"{gene}[Gene]", retmax=1)
            record = Entrez.read(handle)
            handle.close()

            # Fetch the sequence by ID if available
            if record["IdList"]:
                seq_id = record["IdList"][0]
                handle = Entrez.efetch(db="nucleotide", id=seq_id, rettype="fasta", retmode="text")
                fasta_out.write(handle.read())
                handle.close()
                print(f"Fetched sequence for {gene}")
            else:
                print(f"No sequence found for {gene}")
        except Exception as e:
            print(f"Error fetching {gene}: {e}")

print(f"FASTA file saved to: {output_fasta}")


            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.


Fetched sequence for GHR
Fetched sequence for GHRH
Fetched sequence for SHC1
Fetched sequence for POU1F1
Fetched sequence for PROP1
Fetched sequence for TP53
Fetched sequence for TERC
Fetched sequence for TERT
Fetched sequence for ATM
Fetched sequence for PLAU
Fetched sequence for ERCC2
Fetched sequence for ERCC8
Fetched sequence for WRN
Fetched sequence for LMNA
Fetched sequence for IGF1R
Fetched sequence for TXN
Fetched sequence for KL
Fetched sequence for E2F1
Fetched sequence for PTPN11
Fetched sequence for NFKB2
Fetched sequence for STAT5B
Fetched sequence for STAT3
Fetched sequence for STAT5A
Fetched sequence for NRG1
Fetched sequence for HDAC3
Fetched sequence for GH1
Fetched sequence for IL7R
Fetched sequence for IGF1
Fetched sequence for IGF2
Fetched sequence for INS
Fetched sequence for NGF
Fetched sequence for IRS1
Fetched sequence for PTPN1
Fetched sequence for IRS2
Fetched sequence for AKT1
Fetched sequence for PIK3CB
Fetched sequence for NGFR
Fetched sequence for HRAS
Fet

In [16]:
from Bio import Entrez
from Bio import SeqIO



# Input and output file paths
gene_file = 'aging_related_genes.txt'
output_fasta = 'aging_related_genes_human_nucleotide.fasta'

# Read the gene list from the file
with open(gene_file, 'r') as file:
    genes = [line.strip() for line in file if line.strip()]

# Open the output FASTA file
with open(output_fasta, 'w') as fasta_out:
    for gene in genes:
        try:
            # Search for the gene in the NCBI nucleotide database, restricted to Homo sapiens
            query = f"{gene}[Gene] AND Homo sapiens[Organism]"
            handle = Entrez.esearch(db="nucleotide", term=query, retmax=1)
            record = Entrez.read(handle)
            handle.close()

            # Fetch the sequence by ID if available
            if record["IdList"]:
                seq_id = record["IdList"][0]
                handle = Entrez.efetch(db="nucleotide", id=seq_id, rettype="fasta", retmode="text")
                fasta_out.write(handle.read())
                handle.close()
                print(f"Fetched sequence for {gene} (Homo sapiens)")
            else:
                print(f"No sequence found for {gene} (Homo sapiens)")
        except Exception as e:
            print(f"Error fetching {gene}: {e}")

print(f"FASTA file saved to: {output_fasta}")


Fetched sequence for GHR (Homo sapiens)
Fetched sequence for GHRH (Homo sapiens)
Fetched sequence for SHC1 (Homo sapiens)
Fetched sequence for POU1F1 (Homo sapiens)
Fetched sequence for PROP1 (Homo sapiens)
Fetched sequence for TP53 (Homo sapiens)
Fetched sequence for TERC (Homo sapiens)
Fetched sequence for TERT (Homo sapiens)
Fetched sequence for ATM (Homo sapiens)
Fetched sequence for PLAU (Homo sapiens)
Fetched sequence for ERCC2 (Homo sapiens)
Fetched sequence for ERCC8 (Homo sapiens)
Fetched sequence for WRN (Homo sapiens)
Fetched sequence for LMNA (Homo sapiens)
Fetched sequence for IGF1R (Homo sapiens)
Fetched sequence for TXN (Homo sapiens)
Fetched sequence for KL (Homo sapiens)
Fetched sequence for E2F1 (Homo sapiens)
Fetched sequence for PTPN11 (Homo sapiens)
Fetched sequence for NFKB2 (Homo sapiens)
Fetched sequence for STAT5B (Homo sapiens)
Fetched sequence for STAT3 (Homo sapiens)
Fetched sequence for STAT5A (Homo sapiens)
Fetched sequence for NRG1 (Homo sapiens)
Fetched 

Error fetching ZFYVE28: IncompleteRead(35528704 bytes read)
Error fetching NRXN1: IncompleteRead(65572864 bytes read)
Error fetching HNRNPA1: IncompleteRead(56152064 bytes read)

Error fetching SIGLEC12: IncompleteRead(55279616 bytes read)
Error fetching IL1RN: IncompleteRead(82100224 bytes read)
Error fetching ASIC2: IncompleteRead(10686464 bytes read)
Error fetching KCNA5: IncompleteRead(39976960 bytes read)
Error fetching NOTCH4: IncompleteRead(49115136 bytes read)
Error fetching TNFSF8: IncompleteRead(144797696 bytes read)
Error fetching TGFA: IncompleteRead(105271296 bytes read)
Error fetching SVEP1: IncompleteRead(38047744 bytes read)
Error fetching MACF1: IncompleteRead(64745472 bytes read)
Error fetching PYHIN1: IncompleteRead(50380800 bytes read)
Error fetching AHSA2P: IncompleteRead(226369536 bytes read)
Error fetching CXCL2: IncompleteRead(65564672 bytes read)
Error fetching PPIC: IncompleteRead(61505536 bytes read)

Error fetching C1QB: IncompleteRead(11661312 bytes read)
Error fetching MMP3: IncompleteRead(36151296 bytes read)
Error fetching WNT2: IncompleteRead(121958400 bytes read)
Error fetching TRIB2: IncompleteRead(122757120 bytes read)
Error fetching RAP2A: IncompleteRead(91770880 bytes read)
Error fetching POLR3K: IncompleteRead(77524992 bytes read)
Error fetching LGALS1: IncompleteRead(28577792 bytes read)
Error fetching SPATS2L: IncompleteRead(79675392 bytes read)
Error fetching GLRX: IncompleteRead(77508608 bytes read)
Error fetching FKBP5: IncompleteRead(43651072 bytes read)
Error fetching STK24: IncompleteRead(10235904 bytes read)
Error fetching RAP1B: IncompleteRead(27213824 bytes read)
Error fetching TLN2: IncompleteRead(21323776 bytes read)
Error fetching RPTOR: IncompleteRead(17362944 bytes read)
Error fetching TIMP2: IncompleteRead(13307904 bytes read)
Error fetching WNT10B: IncompleteRead(31256576 bytes read)
Error fetching PTGER2: IncompleteRead(25518080 bytes read)
Error fetching FLJ22555: IncompleteRead(5140480 bytes read)
Error fetching CA3: IncompleteRead(28971008 bytes read)
Error fetching RXRA: IncompleteRead(70111232 bytes read)
Error fetching IL6ST: IncompleteRead(149897216 bytes read)
Error fetching POU5F1: IncompleteRead(108421120 bytes read)
Error fetching CITED4: IncompleteRead(114507776 bytes read)
Error fetching DYRK2: IncompleteRead(41107456 bytes read)
Error fetching MAP1B: IncompleteRead(41226240 bytes read)
Error fetching HDAC9: IncompleteRead(42426368 bytes read)
Error fetching ME1: IncompleteRead(31883264 bytes read)
Error fetching RTN4: IncompleteRead(12463978 bytes read)
Error fetching FOXN3: IncompleteRead(52731904 bytes read)
Error fetching MAGED2: IncompleteRead(30130176 bytes read)
Error fetching CALM3: IncompleteRead(2240512 bytes read)
Error fetching ITM2A: IncompleteRead(70090752 bytes read)
Error fetching DNTTIP2: IncompleteRead(82726912 bytes read)
Error fetching PAQR9: IncompleteRead(32595968 bytes read)
Error fetching STOM: IncompleteRead(14168064 bytes read)
Error fetching MAN1C1: IncompleteRead(42246144 bytes read)
Error fetching UBL3: IncompleteRead(51806208 bytes read)
Error fetching CHCHD4: IncompleteRead(35942400 bytes read)
Error fetching RBM15: IncompleteRead(30986240 bytes read)
Error fetching SPTSSA: IncompleteRead(32428032 bytes read)
Error fetching EEF2K: IncompleteRead(59785216 bytes read)
Error fetching GGH: IncompleteRead(71614464 bytes read)

Error fetching ALOX12: IncompleteRead(5042176 bytes read)
Error fetching CD28: IncompleteRead(64241664 bytes read)
Error fetching SEMA6D: IncompleteRead(30158848 bytes read)
Error fetching RBM3: IncompleteRead(98680832 bytes read)
Error fetching ANXA7: IncompleteRead(120233984 bytes read)
Error fetching GNA13: IncompleteRead(82116608 bytes read)
Error fetching ADH1A: IncompleteRead(84508672 bytes read)
Error fetching GNG11: IncompleteRead(49631232 bytes read)
Error fetching LPIN1: IncompleteRead(95023104 bytes read)
Error fetching PHLDA1: IncompleteRead(10612736 bytes read)
Error fetching SORD: IncompleteRead(43257856 bytes read)
Error fetching ATP5MC3: IncompleteRead(37822464 bytes read)
Error fetching TBX3: IncompleteRead(54087680 bytes read)
Error fetching PGK1: IncompleteRead(14262272 bytes read)
Error fetching TXNRD1: IncompleteRead(17760256 bytes read)







In [17]:
# List of error messages
error_messages = """
Error fetching ZFYVE28: IncompleteRead(35528704 bytes read)
Error fetching NRXN1: IncompleteRead(65572864 bytes read)
Error fetching HNRNPA1: IncompleteRead(56152064 bytes read)
Error fetching SIGLEC12: IncompleteRead(55279616 bytes read)
Error fetching IL1RN: IncompleteRead(82100224 bytes read)
Error fetching ASIC2: IncompleteRead(10686464 bytes read)
Error fetching KCNA5: IncompleteRead(39976960 bytes read)
Error fetching NOTCH4: IncompleteRead(49115136 bytes read)
Error fetching TNFSF8: IncompleteRead(144797696 bytes read)
Error fetching TGFA: IncompleteRead(105271296 bytes read)
Error fetching SVEP1: IncompleteRead(38047744 bytes read)
Error fetching MACF1: IncompleteRead(64745472 bytes read)
Error fetching PYHIN1: IncompleteRead(50380800 bytes read)
Error fetching AHSA2P: IncompleteRead(226369536 bytes read)
Error fetching CXCL2: IncompleteRead(65564672 bytes read)
Error fetching PPIC: IncompleteRead(61505536 bytes read)
Error fetching C1QB: IncompleteRead(11661312 bytes read)
Error fetching MMP3: IncompleteRead(36151296 bytes read)
Error fetching WNT2: IncompleteRead(121958400 bytes read)
Error fetching TRIB2: IncompleteRead(122757120 bytes read)
Error fetching RAP2A: IncompleteRead(91770880 bytes read)
Error fetching POLR3K: IncompleteRead(77524992 bytes read)
Error fetching LGALS1: IncompleteRead(28577792 bytes read)
Error fetching SPATS2L: IncompleteRead(79675392 bytes read)
Error fetching GLRX: IncompleteRead(77508608 bytes read)
Error fetching FKBP5: IncompleteRead(43651072 bytes read)
Error fetching STK24: IncompleteRead(10235904 bytes read)
Error fetching RAP1B: IncompleteRead(27213824 bytes read)
Error fetching TLN2: IncompleteRead(21323776 bytes read)
Error fetching RPTOR: IncompleteRead(17362944 bytes read)
Error fetching TIMP2: IncompleteRead(13307904 bytes read)
Error fetching WNT10B: IncompleteRead(31256576 bytes read)
Error fetching PTGER2: IncompleteRead(25518080 bytes read)
Error fetching FLJ22555: IncompleteRead(5140480 bytes read)
Error fetching CA3: IncompleteRead(28971008 bytes read)
Error fetching RXRA: IncompleteRead(70111232 bytes read)
Error fetching IL6ST: IncompleteRead(149897216 bytes read)
Error fetching POU5F1: IncompleteRead(108421120 bytes read)
Error fetching CITED4: IncompleteRead(114507776 bytes read)
Error fetching DYRK2: IncompleteRead(41107456 bytes read)
Error fetching MAP1B: IncompleteRead(41226240 bytes read)
Error fetching HDAC9: IncompleteRead(42426368 bytes read)
Error fetching ME1: IncompleteRead(31883264 bytes read)
Error fetching RTN4: IncompleteRead(12463978 bytes read)
Error fetching FOXN3: IncompleteRead(52731904 bytes read)
Error fetching MAGED2: IncompleteRead(30130176 bytes read)
Error fetching CALM3: IncompleteRead(2240512 bytes read)
Error fetching ITM2A: IncompleteRead(70090752 bytes read)
Error fetching DNTTIP2: IncompleteRead(82726912 bytes read)
Error fetching PAQR9: IncompleteRead(32595968 bytes read)
Error fetching STOM: IncompleteRead(14168064 bytes read)
Error fetching MAN1C1: IncompleteRead(42246144 bytes read)
Error fetching UBL3: IncompleteRead(51806208 bytes read)
Error fetching CHCHD4: IncompleteRead(35942400 bytes read)
Error fetching RBM15: IncompleteRead(30986240 bytes read)
Error fetching SPTSSA: IncompleteRead(32428032 bytes read)
Error fetching EEF2K: IncompleteRead(59785216 bytes read)
Error fetching GGH: IncompleteRead(71614464 bytes read)
Error fetching ALOX12: IncompleteRead(5042176 bytes read)
Error fetching CD28: IncompleteRead(64241664 bytes read)
Error fetching SEMA6D: IncompleteRead(30158848 bytes read)
Error fetching RBM3: IncompleteRead(98680832 bytes read)
Error fetching ANXA7: IncompleteRead(120233984 bytes read)
Error fetching GNA13: IncompleteRead(82116608 bytes read)
Error fetching ADH1A: IncompleteRead(84508672 bytes read)
Error fetching GNG11: IncompleteRead(49631232 bytes read)
Error fetching LPIN1: IncompleteRead(95023104 bytes read)
Error fetching PHLDA1: IncompleteRead(10612736 bytes read)
Error fetching SORD: IncompleteRead(43257856 bytes read)
Error fetching ATP5MC3: IncompleteRead(37822464 bytes read)
Error fetching TBX3: IncompleteRead(54087680 bytes read)
Error fetching PGK1: IncompleteRead(14262272 bytes read)
Error fetching TXNRD1: IncompleteRead(17760256 bytes read)
"""

# Extract gene names from error messages
import re

# Regular expression to extract gene names
gene_names = re.findall(r"Error fetching (\w+):", error_messages)

# Save gene names to a separate file
output_file_path = "incomplete_genes.txt"
with open(output_file_path, "w") as file:
    file.write("\n".join(gene_names))

output_file_path


'incomplete_genes.txt'

In [18]:
from Bio import Entrez
from http.client import IncompleteRead
import time


# Input file containing gene names
gene_file = "incomplete_genes.txt"

# Output FASTA file for large genes
output_fasta = "large_genes_nucleotide.fasta"

# Function to fetch sequences robustly with retries
def robust_fetch(fetch_function, retries=3, wait_time=5):
    for attempt in range(retries):
        try:
            return fetch_function()
        except IncompleteRead as e:
            print(f"IncompleteRead error: {e}. Retrying {attempt + 1}/{retries}...")
            time.sleep(wait_time)
    raise Exception("Failed to fetch data after multiple retries")

# Fetch sequences in chunks
def fetch_large_gene_sequence(gene_name):
    try:
        # Search for the gene in the NCBI nucleotide database (restricted to Homo sapiens)
        query = f"{gene_name}[Gene] AND Homo sapiens[Organism]"
        handle = Entrez.esearch(db="nucleotide", term=query, retmax=1)
        record = Entrez.read(handle)
        handle.close()

        # Fetch sequence by ID in chunks if available
        if record["IdList"]:
            seq_id = record["IdList"][0]
            print(f"Fetching large gene {gene_name} (ID: {seq_id})...")
            handle = robust_fetch(
                lambda: Entrez.efetch(
                    db="nucleotide", id=seq_id, rettype="fasta", retmode="text", chunk_size=1024
                )
            )
            sequence = handle.read()
            handle.close()
            return sequence
        else:
            print(f"No sequence found for {gene_name}")
            return None
    except Exception as e:
        print(f"Error fetching {gene_name}: {e}")
        return None

# Read gene names and fetch sequences
with open(gene_file, "r") as file:
    genes = [line.strip() for line in file if line.strip()]

with open(output_fasta, "w") as fasta_out:
    for gene in genes:
        sequence = fetch_large_gene_sequence(gene)
        if sequence:
            fasta_out.write(sequence)

print(f"Sequences for large genes saved to: {output_fasta}")


Fetching large gene ZFYVE28 (ID: 2194974292)...
Fetching large gene NRXN1 (ID: 2194974693)...
Fetching large gene HNRNPA1 (ID: 2194973269)...
Fetching large gene SIGLEC12 (ID: 2194972797)...
Fetching large gene IL1RN (ID: 2194974693)...
Fetching large gene ASIC2 (ID: 2194972897)...
Fetching large gene KCNA5 (ID: 2194973269)...
Fetching large gene NOTCH4 (ID: 2194974009)...
Fetching large gene TNFSF8 (ID: 2194973615)...
Fetching large gene TGFA (ID: 2194974693)...
Fetching large gene SVEP1 (ID: 2194973615)...
Fetching large gene MACF1 (ID: 2194974903)...
Fetching large gene PYHIN1 (ID: 2194974903)...
Fetching large gene AHSA2P (ID: 2194974693)...
Fetching large gene CXCL2 (ID: 2194974292)...
Fetching large gene PPIC (ID: 2194974170)...
Fetching large gene C1QB (ID: 2194974903)...
Fetching large gene MMP3 (ID: 2194973393)...
Fetching large gene WNT2 (ID: 2194973865)...
Fetching large gene TRIB2 (ID: 2194974693)...
Fetching large gene RAP2A (ID: 2194973193)...
Fetching large gene POLR3K (

In [19]:
from Bio import SeqIO

input_file = "aging_related_genes_human_nucleotide.fasta"
output_prefix = "chunk"
num_chunks = 30

# Count total number of sequences
with open(input_file) as f:
    total_sequences = sum(1 for _ in SeqIO.parse(f, "fasta"))

# Calculate number of sequences per chunk
sequences_per_chunk = (total_sequences + num_chunks - 1) // num_chunks  # Round up

# Split into chunks
with open(input_file) as f:
    records = list(SeqIO.parse(f, "fasta"))
    for i in range(num_chunks):
        chunk_records = records[i * sequences_per_chunk: (i + 1) * sequences_per_chunk]
        output_file = f"{output_prefix}_{i + 1}.fasta"
        with open(output_file, "w") as out_f:
            SeqIO.write(chunk_records, out_f, "fasta")
        print(f"Written {len(chunk_records)} sequences to {output_file}")


Written 77 sequences to chunk_1.fasta
Written 77 sequences to chunk_2.fasta
Written 77 sequences to chunk_3.fasta
Written 77 sequences to chunk_4.fasta
Written 77 sequences to chunk_5.fasta
Written 77 sequences to chunk_6.fasta
Written 77 sequences to chunk_7.fasta
Written 77 sequences to chunk_8.fasta
Written 77 sequences to chunk_9.fasta
Written 77 sequences to chunk_10.fasta
Written 77 sequences to chunk_11.fasta
Written 77 sequences to chunk_12.fasta
Written 77 sequences to chunk_13.fasta


OSError: [Errno 28] No space left on device

In [None]:
from Bio import Entrez
from http.client import IncompleteRead
import time


# Input file containing gene names
gene_file = "opengenes/1.txt"

# Output FASTA file for large genes
output_fasta = "opengenes1.fasta"

# Function to fetch sequences robustly with retries
def robust_fetch(fetch_function, retries=3, wait_time=5):
    for attempt in range(retries):
        try:
            return fetch_function()
        except IncompleteRead as e:
            print(f"IncompleteRead error: {e}. Retrying {attempt + 1}/{retries}...")
            time.sleep(wait_time)
    raise Exception("Failed to fetch data after multiple retries")

# Fetch sequences in chunks
def fetch_large_gene_sequence(gene_name):
    try:
        # Search for the gene in the NCBI nucleotide database (restricted to Homo sapiens)
        query = f"{gene_name}[Gene] AND Homo sapiens[Organism]"
        handle = Entrez.esearch(db="nucleotide", term=query, retmax=1)
        record = Entrez.read(handle)
        handle.close()

        # Fetch sequence by ID in chunks if available
        if record["IdList"]:
            seq_id = record["IdList"][0]
            print(f"Fetching large gene {gene_name} (ID: {seq_id})...")
            handle = robust_fetch(
                lambda: Entrez.efetch(
                    db="nucleotide", id=seq_id, rettype="fasta", retmode="text", chunk_size=1024
                )
            )
            sequence = handle.read()
            handle.close()
            return sequence
        else:
            print(f"No sequence found for {gene_name}")
            return None
    except Exception as e:
        print(f"Error fetching {gene_name}: {e}")
        return None

# Read gene names and fetch sequences
with open(gene_file, "r") as file:
    genes = [line.strip() for line in file if line.strip()]

with open(output_fasta, "w") as fasta_out:
    for gene in genes:
        sequence = fetch_large_gene_sequence(gene)
        if sequence:
            fasta_out.write(sequence)

print(f"Sequences for large genes saved to: {output_fasta}")

In [1]:
from Bio import Entrez
from http.client import IncompleteRead
import time


# Input file containing gene names
gene_file = "opengenes/1.txt"

# Output FASTA file for large genes (protein sequences)
output_fasta = "opengenes/opengenes1_proteins.fasta"

# Function to fetch sequences robustly with retries
def robust_fetch(fetch_function, retries=3, wait_time=5):
    for attempt in range(retries):
        try:
            return fetch_function()
        except IncompleteRead as e:
            print(f"IncompleteRead error: {e}. Retrying {attempt + 1}/{retries}...")
            time.sleep(wait_time)
    raise Exception("Failed to fetch data after multiple retries")

# Fetch protein sequences from NCBI
def fetch_protein_sequence(gene_name):
    try:
        # Search for the gene in the NCBI protein database (restricted to Homo sapiens)
        query = f"{gene_name}[Gene] AND Homo sapiens[Organism]"
        handle = Entrez.esearch(db="protein", term=query, retmax=1)
        record = Entrez.read(handle)
        handle.close()

        # Fetch the protein sequence by ID
        if record["IdList"]:
            seq_id = record["IdList"][0]
            print(f"Fetching protein sequence for {gene_name} (ID: {seq_id})...")
            handle = robust_fetch(
                lambda: Entrez.efetch(
                    db="protein", id=seq_id, rettype="fasta", retmode="text"
                )
            )
            sequence = handle.read()
            handle.close()
            return sequence
        else:
            print(f"No protein sequence found for {gene_name}")
            return None
    except Exception as e:
        print(f"Error fetching {gene_name}: {e}")
        return None

# Read gene names and fetch protein sequences
with open(gene_file, "r") as file:
    genes = [line.strip() for line in file if line.strip()]

with open(output_fasta, "w") as fasta_out:
    for gene in genes:
        protein_sequence = fetch_protein_sequence(gene)
        if protein_sequence:
            fasta_out.write(protein_sequence)

print(f"Protein sequences saved to: {output_fasta}")


            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.


Fetching protein sequence for GHR (ID: 335057499)...
Fetching protein sequence for GHRH (ID: 134521)...
Fetching protein sequence for SHC1 (ID: 2462512603)...
Fetching protein sequence for POU1F1 (ID: 123408)...
Fetching protein sequence for PROP1 (ID: 1844084115)...
Fetching protein sequence for TP53 (ID: 2246031126)...
No protein sequence found for TERC
Fetching protein sequence for TERT (ID: 301129200)...
Fetching protein sequence for ATM (ID: 1199277043)...
Fetching protein sequence for PLAU (ID: 1788951008)...
Fetching protein sequence for ERCC2 (ID: 195947407)...
Fetching protein sequence for ERCC8 (ID: 590122050)...
Fetching protein sequence for WRN (ID: 110735439)...
Fetching protein sequence for LMNA (ID: 2244986507)...
Fetching protein sequence for IGF1R (ID: 629266062)...
Fetching protein sequence for TXN (ID: 135773)...
Fetching protein sequence for KL (ID: 24497614)...
Fetching protein sequence for E2F1 (ID: 12669911)...
Fetching protein sequence for PTPN11 (ID: 1757649834

[text](command:cellOutput.enableScrolling?ce815a5e-54dd-41e9-93b2-fe8ac5797b51)No protein sequence found for TERC

In [1]:
from Bio import SeqIO

# Input and output file paths
protein_fasta = "realmergedprots.faa"
cds_fasta = "cds\with_tags\merged_cds.fasta"
output_fasta = "mergedprotscds.fasta"

# Function to extract protein ID from a header line
def extract_protein_id(header):
    return header.split()[0][2:]  # Remove the prefix (e.g., "rf") and take the ID

# Load CDS sequences into a dictionary for quick lookup
cds_dict = {}
for record in SeqIO.parse(cds_fasta, "fasta"):
    description = record.description
    # Extract the protein ID from the CDS file (e.g., XP_036991360.2)
    if "[protein_id=" in description:
        protein_id = description.split("[protein_id=")[1].split("]")[0]
        cds_dict[protein_id] = record

# Process the protein file and match IDs
with open(output_fasta, "w") as output_handle:
    for prot_record in SeqIO.parse(protein_fasta, "fasta"):
        # Extract the protein ID from the protein FASTA file
        prot_id = extract_protein_id(prot_record.id)
        
        # Check if the protein ID exists in the CDS dictionary
        if prot_id in cds_dict:
            # Get the matching CDS record
            cds_record = cds_dict[prot_id]
            
            # Modify the header to use the protein FASTA header
            cds_record.id = prot_record.id
            cds_record.description = prot_record.description
            
            # Write the updated CDS record to the output file
            SeqIO.write(cds_record, output_handle, "fasta")
        else:
            print(f"Protein ID {prot_id} not found in CDS file.")

print(f"Matched CDS sequences saved to: {output_fasta}")


  cds_fasta = "cds\with_tags\merged_cds.fasta"


Matched CDS sequences saved to: mergedprotscds.fasta
