In [53]:
import polars as pl
import requests
import time

In [67]:
# Load the curation summary workbook and keep a small subset of rows so the
# rest of the notebook can be exercised quickly.

# Path to the Excel export, relative to the notebook's working directory.
file_path = "Clingen-Curation-Activity-Summary-Report-2025-03-01.xlsx"

# Worksheet to analyse and the number of leading rows kept for test runs.
SHEET_NAME = "Clingen-Curation-Activity-Summa"
MAX_ROWS = 50

# With sheet_id=0 polars reads every worksheet and returns them as a
# {sheet name: DataFrame} mapping rather than a single frame.
sheets = pl.read_excel(file_path, sheet_id=0)

# Select the worksheet of interest. The name `df_dict` is kept so that any
# other cell referring to it still finds the sheet's DataFrame, but it is no
# longer bound to the intermediate dict first.
df_dict = sheets[SHEET_NAME]

# Show the first few rows as a sanity check.
print(df_dict.head())

# Restrict to a manageable number of rows.
df_filtered = df_dict[:MAX_ROWS]

Could not determine dtype for column 4, falling back to string


shape: (5, 17)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ gene_symb ┆ hgnc_id   ┆ gene_url  ┆ disease_l ┆ … ┆ gene_dise ┆ actionabi ┆ actionabi ┆ actionab │
│ ol        ┆ ---       ┆ ---       ┆ abel      ┆   ┆ ase_valid ┆ lity_asse ┆ lity_asse ┆ ility_gr │
│ ---       ┆ str       ┆ str       ┆ ---       ┆   ┆ ity_gceps ┆ rtion_cla ┆ rtion_rep ┆ oups     │
│ str       ┆           ┆           ┆ str       ┆   ┆ ---       ┆ ssi…      ┆ ort…      ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ str       ┆ ---       ┆ ---       ┆ str      │
│           ┆           ┆           ┆           ┆   ┆           ┆ str       ┆ str       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ A2ML1     ┆ HGNC:2333 ┆ https://s ┆ Noonan    ┆ … ┆ RASopathy ┆ null      ┆ null      ┆ null     │
│           ┆ 6         ┆ earch.cli ┆ syndrome  ┆   ┆           ┆           

In [59]:
def fetch_abstracts_semanticscholar(gene, disease, hgnc_id, mondo_id, max_results=5, retries=3, delay=5):
    """Fetch paper abstracts from Semantic Scholar for a gene-disease pair.

    The search query is the gene and disease joined by a single space.

    Args:
        gene: Gene symbol; used in the query and copied into every record.
        disease: Disease label; used in the query and copied into every record.
        hgnc_id: Identifier copied unchanged into every record.
        mondo_id: Identifier copied unchanged into every record.
        max_results: Value sent as the API's ``limit`` parameter.
        retries: Maximum number of request attempts.
        delay: Base wait in seconds before the second attempt; the wait
            doubles after each further failure.

    Returns:
        A list of dicts with the keys ``gene_symbol``, ``hgnc_id``,
        ``disease_label``, ``mondo_id``, ``title``, ``abstract`` and ``DOI``.
        Missing or null ``title``/``abstract``/``DOI`` values become "N/A".
        An empty list is returned when every attempt fails or when
        ``retries`` is 0.
    """
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    # Passing the query through `params` lets requests percent-encode it, so
    # characters such as "&" or "#" in a disease label cannot break the URL.
    params = {
        "query": f"{gene} {disease}",
        "fields": "title,abstract,year,journal,externalIds",
        "limit": max_results,
    }

    for attempt in range(retries):
        try:
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()  # Turn HTTP error statuses into exceptions
            data = response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for {gene}-{disease}: {e}")
            if attempt < retries - 1:
                # Back off exponentially: delay, 2*delay, 4*delay, ...
                time.sleep(delay * 2 ** attempt)
            continue

        articles = []
        # `or` fallbacks are used instead of dict.get defaults because a key
        # that is present with a null value would bypass the default.
        for result in data.get("data") or []:
            external_ids = result.get("externalIds") or {}
            articles.append({
                "gene_symbol": gene,
                "hgnc_id": hgnc_id,
                "disease_label": disease,
                "mondo_id": mondo_id,
                "title": result.get("title") or "N/A",
                "abstract": result.get("abstract") or "N/A",
                "DOI": external_ids.get("DOI") or "N/A",
            })
        return articles

    # All attempts failed, or retries was 0 and no request was made.
    return []

In [69]:
# Collect abstracts for every gene-disease pair in the filtered sheet.

# Seconds to wait between consecutive API calls. Firing the requests
# back-to-back produced HTTP 429 (rate limit) responses in earlier runs.
REQUEST_PAUSE_SECONDS = 1

all_articles = []

for index, row in enumerate(df_filtered.iter_rows(named=True)):  # rows as dictionaries
    if index:
        # Pause before every call except the first one.
        time.sleep(REQUEST_PAUSE_SECONDS)

    gene_symbol = row["gene_symbol"]
    disease_label = row["disease_label"]
    hgnc_id = row["hgnc_id"]
    mondo_id = row["mondo_id"]

    articles = fetch_abstracts_semanticscholar(gene_symbol, disease_label, hgnc_id, mondo_id, max_results=10)
    all_articles.extend(articles)

# Build one Polars DataFrame from all collected records.
df_abstracts = pl.DataFrame(all_articles)

# Report what was collected. Nothing is written to disk in this cell, so the
# message no longer claims the results were saved to a file.
print(f"Collected {len(df_abstracts)} abstracts for {len(df_filtered)} gene-disease pairs")
print(df_abstracts.head())

Error fetching data for A2ML1-Noonan syndrome: 429 Client Error:  for url: https://api.semanticscholar.org/graph/v1/paper/search?query=A2ML1%20Noonan%20syndrome&fields=title,abstract,year,journal,externalIds&limit=10
Error fetching data for AARS1-Charcot-Marie-Tooth disease axonal type 2N: HTTPSConnectionPool(host='api.semanticscholar.org', port=443): Read timed out.
Error fetching data for AARS1-Charcot-Marie-Tooth disease axonal type 2N: 429 Client Error:  for url: https://api.semanticscholar.org/graph/v1/paper/search?query=AARS1%20Charcot-Marie-Tooth%20disease%20axonal%20type%202N&fields=title,abstract,year,journal,externalIds&limit=10
Error fetching data for AARS2-leukoencephalopathy, progressive, with ovarian failure: 429 Client Error:  for url: https://api.semanticscholar.org/graph/v1/paper/search?query=AARS2%20leukoencephalopathy,%20progressive,%20with%20ovarian%20failure&fields=title,abstract,year,journal,externalIds&limit=10
Error fetching data for AASS-hyperlysinemia: 429 Cli

In [73]:
# Write the collected abstracts to a CSV file (path is relative to the
# notebook's working directory).
output_csv = "Semantic_Scholar_Gene_Disease_Abstracts.csv"
df_abstracts.write_csv(output_csv)