# Step 1: Import file

Please change `file_path` to the location of the downloaded file on your machine.

*Ensure the path is in quotes and uses double backslashes (`\\`) between folders, as in the example below.

In [3]:
# gzip is part of the Python standard library, so it does not need a pip install
# !pip install pandas

import gzip
import pandas as pd

# Path to the uploaded file - Change to yours
file_path = "C:\\Users\\anani\\Downloads\\Homo_sapiens.gene_info.gz"

# Reading the .gz file
# The encoding is given explicitly so decoding does not depend on the
# machine's locale default (text mode falls back to it when no encoding is
# passed, which on Windows is usually a legacy code page).
# NOTE(review): assumes the NCBI gene_info dump is UTF-8 - confirm.
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    # Load the tab-separated file into a pandas dataframe
    gene_info_df = pd.read_csv(f, sep='\t')

# Display the first few rows to understand its structure
gene_info_df.head()


Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,9606,1,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20240827,-
1,9606,2,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20240827,-
2,9606,3,A2MP1,-,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe...,12,12p13.31,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1,alpha-2-macroglobulin pseudogene 1,O,pregnancy-zone protein pseudogene,20240827,-
3,9606,9,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20240827,-
4,9606,10,NAT2,-,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156...,8,8p22,N-acetyltransferase 2,protein-coding,NAT2,N-acetyltransferase 2,O,arylamine N-acetyltransferase 2|N-acetyltransf...,20240827,-


# Step 2: Preprocess

Run this to filter the data down to ~40,000 rows

*Ensure the output is 39,933

In [2]:
# Step 2: Preprocess the gene data based on the user's conditions

# Condition 1: Remove rows where 'Feature_type' starts with 'regulatory:silencer' or 'regulatory:enhancer'
condition1 = ~gene_info_df['Feature_type'].str.startswith(('regulatory:silencer', 'regulatory:enhancer'), na=False)

# Condition 2: Keep only rows whose 'type_of_gene' contains 'protein-coding', 'unknown', 'other', or 'pseudo'
# (case-insensitive substring match, not an exact comparison)
condition2 = gene_info_df['type_of_gene'].str.contains('protein-coding|unknown|other|pseudo', case=False, na=False)

# Condition 3: Remove rows that contain 'tRNA' (any case) in any of the columns.
# Each column is scanned as a whole instead of calling a Python lambda once per
# row; the cells are still converted with str() first, so the resulting mask
# is the same but is computed far faster on a large table.
mentions_trna = gene_info_df.apply(
    lambda column: column.astype(str).str.contains('tRNA', case=False, na=False)
)
condition3 = ~mentions_trna.any(axis=1)

# Apply all conditions to filter the DataFrame.
# .copy() makes the result an independent DataFrame, so adding a column to it
# later (Step 3 assigns a 'Summary' column) does not raise SettingWithCopyWarning.
filtered_gene_info_df = gene_info_df[condition1 & condition2 & condition3].copy()

# Get the total number of rows after filtering
total_rows_after_filtering = filtered_gene_info_df.shape[0]

# Display the total number of rows after filtering
total_rows_after_filtering


39933

# Step 3: Scrape Genes

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import random

# Function to scrape gene summary from the NCBI gene page
def scrape_gene_summary(gene_id, timeout=30, summary_dd_index=9):
    """Fetch the NCBI Gene page for ``gene_id`` and return its summary text.

    Args:
        gene_id: NCBI GeneID; it is formatted into the page URL.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the whole scrape indefinitely.
        summary_dd_index: Zero-based position of the <dd> tag that is taken
            as the summary (default 9, i.e. the 10th <dd> tag).

    Returns:
        The stripped text of the selected <dd> tag, or the string "NA" when
        the request fails (network error, timeout, non-200 status) or the
        page has too few <dd> tags.
    """
    url = f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}"

    # A single dropped connection must not abort a run of thousands of genes,
    # so request errors are reported the same way as a bad status code.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return "NA"

    if response.status_code != 200:
        return "NA"  # Return 'NA' if the page request fails

    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): the summary is located purely by position; a page whose
    # layout differs will yield a different field here - confirm against a
    # sample of pages.
    dd_tags = soup.find_all('dd')
    if len(dd_tags) <= summary_dd_index:
        return "NA"  # Not enough <dd> tags to reach the expected position

    return dd_tags[summary_dd_index].text.strip()

# Path to save the final CSV file for 39k+ genes
output_file_path = "C:\\Users\\anani\\Downloads\\ncbi_summaries.csv"

# Process in batches to avoid being blocked
def process_in_batches(gene_data, batch_size=500, sleep_between_requests=1.5, sleep_between_batches=30):
    """Scrape a summary for every gene and append the results to the CSV batch by batch.

    Each finished batch is appended to the module-level ``output_file_path``,
    so the work done so far is already on disk if the run is interrupted.
    The header row is written with the first batch only. Because the file is
    opened in append mode, re-running against an existing file adds to it
    rather than replacing it.

    Args:
        gene_data: DataFrame with 'GeneID', 'Symbol' and 'description' columns.
        batch_size: Number of genes scraped between two CSV writes.
        sleep_between_requests: Minimum pause in seconds after each request;
            up to 0.5 s of random jitter is added on top.
        sleep_between_batches: Pause in seconds between consecutive batches.
    """
    total_rows = len(gene_data)

    for start in range(0, total_rows, batch_size):
        # Clamp the end so the last (possibly shorter) batch is reported correctly
        end = min(start + batch_size, total_rows)
        batch = gene_data.iloc[start:end].copy()  # Copy batch data to avoid SettingWithCopyWarning
        batch_summaries = []

        for gene_id in batch['GeneID']:
            batch_summaries.append(scrape_gene_summary(str(gene_id)))

            # Sleep between each request to avoid hitting API rate limits
            time.sleep(random.uniform(sleep_between_requests, sleep_between_requests + 0.5))

        # Add summaries to the batch and append to CSV
        batch['Summary'] = batch_summaries
        batch[['GeneID', 'Symbol', 'description', 'Summary']].to_csv(output_file_path, mode='a', header=(start == 0), index=False)

        # Print progress; only pause when another batch follows
        if end < total_rows:
            print(f"Processed batch from {start} to {end}. Taking a break.")
            time.sleep(sleep_between_batches)
        else:
            print(f"Processed batch from {start} to {end}.")

# Entry point for the full 39k+ gene scrape
def main():
    """Scrape summaries for every gene in ``filtered_gene_info_df``.

    Relies on ``filtered_gene_info_df`` having been built by the
    preprocessing step, and prints the output path when all batches are done.
    """
    # Seed the Summary column with the placeholder value
    filtered_gene_info_df['Summary'] = 'NA'

    # Throttling settings: genes per batch, seconds between requests,
    # and seconds between batches
    throttle = {
        'batch_size': 500,
        'sleep_between_requests': 1.5,
        'sleep_between_batches': 30,
    }

    # Only these three columns are handed to the scraper
    gene_columns = filtered_gene_info_df[['GeneID', 'Symbol', 'description']]
    process_in_batches(gene_columns, **throttle)

    # Final message when all batches are done
    print(f"Final summaries saved to {output_file_path}")

if __name__ == "__main__":
    main()

# Do Not Run Cell Below

This was a trial run that scrapes only the first 10 genes.

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

# Function to scrape gene summary from the NCBI gene page
def scrape_gene_summary(gene_id, timeout=30, summary_dd_index=9):
    """Fetch the NCBI Gene page for ``gene_id`` and return its summary text.

    Args:
        gene_id: NCBI GeneID; it is formatted into the page URL.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the run indefinitely.
        summary_dd_index: Zero-based position of the <dd> tag that is taken
            as the summary (default 9, i.e. the 10th <dd> tag).

    Returns:
        The stripped text of the selected <dd> tag, or the string "NA" when
        the request fails (network error, timeout, non-200 status) or the
        page has too few <dd> tags.
    """
    url = f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}"

    # Request errors are reported the same way as a bad status code so that
    # one failed gene does not abort the remaining ones.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return "NA"

    if response.status_code != 200:
        return "NA"  # Return 'NA' if the page request fails

    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): the summary is located purely by position; a page whose
    # layout differs will yield a different field here - confirm against a
    # sample of pages.
    dd_tags = soup.find_all('dd')
    if len(dd_tags) <= summary_dd_index:
        return "NA"  # Not enough <dd> tags to reach the expected position

    return dd_tags[summary_dd_index].text.strip()

# Path to save the CSV file for the trial run (the file name says "first3", but the code below takes the first 10 genes)
output_file_path = "C:\\Users\\anani\\Downloads\\ncbi_summaries_first3.csv"

# Process only the first few genes (trial run)
def process_first_three_genes(gene_data, n_genes=10):
    """Scrape summaries for the first ``n_genes`` genes and save them to CSV.

    The function name is historical: the number of genes is set by
    ``n_genes``, whose default of 10 matches the value that used to be
    hard-coded.

    Args:
        gene_data: DataFrame with 'GeneID', 'Symbol' and 'description' columns.
        n_genes: How many leading rows of ``gene_data`` to scrape.
    """
    # Select the leading genes and only the necessary columns
    trial_genes = gene_data[['GeneID', 'Symbol', 'description']].head(n_genes).copy()

    summaries = []

    for gene_id in trial_genes['GeneID']:
        summaries.append(scrape_gene_summary(str(gene_id)))

        # Sleep between requests to avoid hitting rate limits
        time.sleep(1.5)  # 1.5 seconds between requests

    # Add the summaries to the dataframe and save the results
    trial_genes['Summary'] = summaries
    trial_genes.to_csv(output_file_path, index=False)

    # Report the number of genes actually written rather than a fixed count
    print(f"Summaries for first {len(trial_genes)} genes saved to {output_file_path}")

# Entry point for the small trial run
def main():
    """Run the trial scrape on the already-filtered gene table."""
    # filtered_gene_info_df must already exist (built by the preprocessing step)
    genes = filtered_gene_info_df
    process_first_three_genes(genes)

if __name__ == "__main__":
    main()
