In [None]:
import os
import json
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
from Bio import Entrez
import openai

# Read the API key from a local text file; surrounding whitespace (including
# the trailing newline) is stripped.
# NOTE(review): the file name mentions lens.org, but the value is used as the
# OpenAI key below — confirm this is the intended key file.
with open("/Users/julia_patsiukova/Downloads/api_keys/api_lens_org.txt", "r") as file:
    openai_api_key = file.read().strip()

# Credentials / identification for the two remote services used by this script.
openai.api_key = openai_api_key
Entrez.email = "julia.patsyukova@gmail.com"
# Model name passed to openai.chat.completions.create in ask_openai_for_table.
MODEL = "gpt-4o"

# Search terms; each one is sent to the PMC search in turn.
queries = ["antisense oligonucleotides", "ASO"]
# Passed as `retmax` to Entrez.esearch, i.e. the number of IDs requested per query.
MAX_RESULTS = 10

# Output schema of the final table, in column order.
columns = [
    # Filled in by the script itself for every processed article.
    "source_type",
    "name",
    "id",
    # Filled, in this order, from the values parsed out of the model's answer.
    "target_gene",
    "exon_or_intron",
    "number_exon_intron",
    "target_variant",
    "species",
    "cell_line",
    "delivery_approach",
    "aso_type",
    "oligo_sequence",
    "modification",
    "modification_mask",
    "concentration",
    "concentration_unit",
    "type_of_efficiency",
    "efficiency",
    "efficiency_units",
    "efficiency_text",
    "comment",
    "DNA.RNA",
]


def search_pmc(query):
    """Search the PMC database for ``query`` and return the matching IDs.

    Args:
        query: Search term, passed to ``Entrez.esearch`` as ``term``.

    Returns:
        The ``IdList`` entry of the parsed search result. At most
        ``MAX_RESULTS`` IDs are requested (``retmax``).
    """
    handle = Entrez.esearch(db="pmc", term=query, retmax=MAX_RESULTS)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the handle even when parsing raises.
        handle.close()
    return record["IdList"]

def fetch_full_article(pmcid, timeout=30):
    """Download one PMC record from the OAI endpoint and extract its text.

    Args:
        pmcid: Record ID that is interpolated into the OAI identifier; the
            returned ``pmcid`` value is this ID prefixed with ``"PMC"``
            (so presumably the bare numeric ID is expected here).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``None`` when the request fails or the HTTP status is not 200.
        Otherwise a dict with the keys

        * ``pmcid`` - ``"PMC"`` followed by ``pmcid``;
        * ``title`` - text of the first ``article-title`` element, or ``""``;
        * ``date`` - ``year-month-day`` built from the first ``pub-date``
          element, with leading/trailing dashes removed, or ``""``;
        * ``full_text`` - abstract text, a blank line, then the body text.
    """
    url = (
        "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord"
        f"&identifier=oai:pubmedcentral.nih.gov:{pmcid}&metadataPrefix=pmc"
    )
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network failures are reported the same way as a non-200 status, so
        # one unreachable record does not abort the whole batch.
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, "xml")

    def text_of(tag, separator):
        # Text content of ``tag`` or "" when the element is absent.
        return tag.get_text(separator=separator) if tag is not None else ""

    title_text = text_of(soup.find("article-title"), " ")
    abstract_text = text_of(soup.find("abstract"), " ")
    body_text = text_of(soup.find("body"), "\n")

    pub_date = ""
    date = soup.find("pub-date")
    if date is not None:
        parts = []
        for part_name in ("year", "month", "day"):
            part = date.find(part_name)
            parts.append(part.text if part is not None else "")
        # Same layout as f"{year}-{month}-{day}"; only the outer dashes left
        # behind by missing parts are removed.
        pub_date = "-".join(parts).strip("-")

    return {
        "pmcid": f"PMC{pmcid}",
        "title": title_text,
        "date": pub_date,
        "full_text": f"{abstract_text}\n\n{body_text}",
    }

def ask_openai_for_table(description):
    """Ask the chat model to extract the ASO fields from ``description``.

    Args:
        description: Article text; it is embedded verbatim in the user message.

    Returns:
        The text of the first choice of the model's reply (a markdown table is
        requested), or a string starting with ``"ERROR: "`` when the request
        raises or the reply carries no text. Request failures are never
        propagated as exceptions.
    """
    system_msg = (
        "You are an expert bioinformatician. Extract the following fields from the given scientific text "
        "about antisense oligonucleotides: target_gene, exon_or_intron, number_exon_intron, target_variant, "
        "species, cell_line, delivery_approach, aso_type, oligo_sequence, modification, modification_mask, "
        "concentration, concentration_unit, type_of_efficiency, efficiency, efficiency_units, efficiency_text, "
        "comment, DNA.RNA."
    )

    user_msg = (
        f"Text: {description}\n\n"
        "If no information for a field, write 'N/A'. Format your answer as a markdown table matching the column names."
    )

    try:
        response = openai.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": user_msg}
            ]
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Deliberately broad: any failure of the request is turned into an
        # error string so the caller can record it and move on.
        return f"ERROR: {str(e)}"

    if content is None:
        # The message content is optional in the API response; keep the
        # "always returns a string" contract instead of handing back None.
        return "ERROR: empty response from model"
    return content

def parse_markdown_table(md_text, n_fields=None):
    """Return the cells of the first data row of a markdown table.

    The table is located by its separator row (e.g. ``|---|---|``) rather than
    by a fixed line number, so introductory text, code fences or blank lines
    around the table do not shift the result. Outer pipes are optional.

    Args:
        md_text: Text expected to contain a markdown table.
        n_fields: Number of values to return. Defaults to the length of the
            module-level ``columns`` list minus its three leading entries.

    Returns:
        A list of exactly ``n_fields`` stripped cell strings. A row with too
        few cells is padded with ``"N/A"``; surplus cells are dropped. When no
        data row can be found (or ``md_text`` is not a string) every value is
        ``"N/A"``.
    """
    if n_fields is None:
        n_fields = len(columns) - 3
    fallback = ["N/A"] * n_fields
    if not isinstance(md_text, str):
        return fallback

    def split_row(line):
        # Split one table line into stripped cells, ignoring optional outer pipes.
        line = line.strip()
        if line.startswith("|"):
            line = line[1:]
        if line.endswith("|"):
            line = line[:-1]
        return [cell.strip() for cell in line.split("|")]

    def is_separator(cells):
        # A separator row consists only of cells such as "---", ":--" or "--:".
        return bool(cells) and all(
            "-" in cell and not cell.strip("-: ") for cell in cells
        )

    rows = [split_row(line) for line in md_text.splitlines() if "|" in line]
    for index, cells in enumerate(rows):
        if is_separator(cells):
            data_rows = rows[index + 1:]
            break
    else:
        return fallback
    if not data_rows:
        return fallback

    values = data_rows[0][:n_fields]
    return values + ["N/A"] * (n_fields - len(values))

# Collected output rows (one dict per processed article) and the IDs already
# processed, so an article returned by several queries is handled only once.
final_records = []
seen_pmcs = set()
# Columns whose values come from the model's answer (everything after the
# three columns the script fills in itself).
extracted_columns = columns[3:]

for query in queries:
    pmc_ids = search_pmc(query)
    for pmcid in tqdm(pmc_ids):
        if pmcid in seen_pmcs:
            continue
        article = fetch_full_article(pmcid)
        if not article:
            # Not marked as seen, so a later query returning the same ID
            # will try the download again.
            continue

        description = article["full_text"].strip()
        if not description:
            values = ["No text"] * len(extracted_columns)
        else:
            md_response = ask_openai_for_table(description)
            # Failures are reported as a string that starts with "ERROR:".
            # A plain substring test would also discard valid answers that
            # merely contain the word "ERROR" somewhere in the table.
            if md_response is None or md_response.startswith("ERROR:"):
                values = ["Error"] * len(extracted_columns)
            else:
                values = parse_markdown_table(md_response)

        row = {
            "source_type": "article",
            "name": article["title"],
            "id": article["pmcid"],
        }
        row.update(zip(extracted_columns, values))

        final_records.append(row)
        seen_pmcs.add(pmcid)
        # Pause between processed articles (presumably to go easy on the
        # remote APIs — confirm the intended rate).
        time.sleep(1)

# Passing the schema explicitly keeps every declared column, in the declared
# order, even when a row has fewer values or no article was processed at all.
df_final = pd.DataFrame(final_records, columns=columns)
print("All done")
