In [1]:
! pip install Bio

Collecting Bio
  Downloading bio-1.8.1-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.86-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting pooch (from Bio)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.4.1-py3-none-any.whl.metadata (10 kB)
Downloading bio-1.8.1-py3-none-any.whl (321 kB)
Downloading biopython-1.86-cp312-cp312-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   --------------- ------------------------ 1.0/2.7 MB 25.4 MB/s eta 0:00:01
   --------------- ------------------------ 1.0/2.7 MB 25.4 MB/s eta 0:00:01
   --------------- ------------------------ 1.0/2.7 MB 25.4 MB

In [4]:
import pandas as pd
data = pd.read_csv("singscore_new_code/UK65_clean_gene_tpm.txt", sep="\t")
data

Unnamed: 0,GeneName,TPM,log2_TPM
0,7SK,131488.067500,17.004583
1,A1BG,42.941644,5.457517
2,A1BG-AS1,151.389341,7.251618
3,A1CF,1.277293,1.187320
4,A2M,422.318386,8.725599
...,...,...,...
24505,ZYG11B,15.132757,4.011921
24506,ZYX,239.063682,7.907273
24507,ZZEF1,91.359184,6.529184
24508,ZZZ3,264.379261,8.051912


In [None]:
from Bio import Entrez
Entrez.email = "jnanshi@genesilico.ai"  # NCBI requires this

# List of gene symbols to convert
gene_symbols = data["GeneName"].to_list()
species = "Homo sapiens"

# Store mappings here
entrez_ids = {}

# Query NCBI for each gene symbol
for symbol in gene_symbols:
    print(symbol)
    handle = Entrez.esearch(db="gene", term=f"{symbol}[sym] AND {species}[orgn]")
    record = Entrez.read(handle)
    if record["IdList"]:
        entrez_ids[symbol] = record["IdList"][0]
    else:
        entrez_ids[symbol] = None

# Convert dictionary ‚Üí DataFrame
df = pd.DataFrame(list(entrez_ids.items()), columns=["Symbol", "EntrezID"])

# Display or save
print(df)
# Optionally save to file
df.to_csv("gene_symbol_to_entrez_UK65.csv", index=False)

KeyboardInterrupt: 

In [11]:
from Bio import Entrez
import pandas as pd
from tqdm import tqdm
import time

# Always tell NCBI who you are
Entrez.email = "jnanshi@genesilico.ai"  # NCBI requires this

# Get list of gene symbols from your data
gene_symbols = data["GeneName"].tolist()
species = "Homo sapiens"

entrez_ids = {}

print(f"\nüîç Starting Entrez ID lookup for {len(gene_symbols)} genes...\n")

# tqdm progress bar
for symbol in tqdm(gene_symbols, desc="Processing genes", unit="gene"):
    try:
        # Query NCBI Gene database
        handle = Entrez.esearch(db="gene", term=f"{symbol}[sym] AND {species}[orgn]")
        record = Entrez.read(handle)
        handle.close()

        if record["IdList"]:
            entrez_ids[symbol] = record["IdList"][0]
        else:
            entrez_ids[symbol] = None
            print(f"‚ö†Ô∏è  No Entrez ID found for: {symbol}")

        # Optional: small delay to avoid NCBI rate-limit
        time.sleep(0.03)

    except Exception as e:
        print(f"‚ùå Error fetching {symbol}: {e}")
        entrez_ids[symbol] = None
        time.sleep(0.5)

# Convert dictionary to DataFrame
df = pd.DataFrame(list(entrez_ids.items()), columns=["Symbol", "EntrezID"])

# Summary
n_found = df['EntrezID'].notna().sum()
n_missing = df['EntrezID'].isna().sum()

print(f"\n‚úÖ Done! Found Entrez IDs for {n_found}/{len(df)} genes "
      f"({n_missing} missing).\n")

# Save to file
output_file = "gene_symbol_to_entrez_UK65.csv"
df.to_csv(output_file, index=False)
print(f"üíæ Saved results to: {output_file}")



üîç Starting Entrez ID lookup for 24510 genes...



Processing genes:   0%|          | 20/24510 [00:19<6:46:25,  1.00gene/s]


KeyboardInterrupt: 