<a href="https://colab.research.google.com/github/MohanVarmaDhana/Dhanamo-AI/blob/main/Semantic%20search%20engine%20using%20Langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests

# Function to query Europe PMC API
def query_europe_pmc(query, max_results=5):
    base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    params = {
        "query": query,
        "resultType": "core",  # 'core' gives the main information like title, authors, etc.
        "pageSize": max_results,
        "format": "json"  # Get data in JSON format
    }

    # Send GET request
    response = requests.get(base_url, params=params)
    data = response.json()
    print(data)

    # Check if results are returned
    if data['hitCount'] > 0:
        for i, doc in enumerate(data['resultList']['result']):
            print(f"Paper {i+1}:")
            print(f"Title: {doc.get('title', 'N/A')}")
           # print(f"Authors: {', '.join([author['name'] for author in doc.get('authorList', {}).get('author', [])]) if 'authorList' in doc else 'N/A'}")
            print(f"Abstract: {doc.get('abstractText', 'N/A')}")

            # Check for full-text URL
            full_text_url = doc.get('fullTextUrl', None)
            if full_text_url:
                print(f"Full Text URL: {full_text_url}")
                download_paper(full_text_url)
            else:
                print("Full text not available.")

            print("-" * 80)
    else:
        print("No results found.")

# Function to download the full paper
def download_paper(url):
    try:
        # Check if the full text URL is a PDF or HTML link
        if url.endswith('.pdf'):
            response = requests.get(url)
            filename = url.split('/')[-1]  # Get filename from URL
            with open(filename, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded paper as {filename}")
        elif url.endswith('.html'):
            response = requests.get(url)
            filename = "paper.html"
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(response.text)
            print(f"Downloaded paper as {filename}")
        else:
            print("Unsupported file type.")
    except Exception as e:
        print(f"Failed to download the paper: {e}")

# Query example
query_europe_pmc("large language models in pharma", max_results=1)


{'version': '6.9', 'hitCount': 7756, 'nextCursorMark': 'AoIIQJn5fSg1MjY0MjM2Ng==', 'nextPageUrl': 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=large language models in pharma&cursorMark=AoIIQJn5fSg1MjY0MjM2Ng==&resultType=core&pageSize=1&format=json', 'request': {'queryString': 'large language models in pharma', 'resultType': 'core', 'cursorMark': '*', 'pageSize': 1, 'sort': '', 'synonym': False}, 'resultList': {'result': [{'id': '40112233', 'source': 'MED', 'pmid': '40112233', 'pmcid': 'PMC11949217', 'fullTextIdList': {'fullTextId': ['PMC11949217']}, 'doi': '10.1200/cci-24-00230', 'title': 'Large Language Models as Decision-Making Tools in Oncology: Comparing Artificial Intelligence Suggestions and Expert Recommendations.', 'authorString': 'Ah-Thiane L, Heudel PE, Campone M, Robert M, Brillaud-Meflah V, Rousseau C, Le Blanc-Onfroy M, Tomaszewski F, Supiot S, Perennec T, Mervoyer A, Frenel JS.', 'authorList': {'author': [{'fullName': 'Ah-Thiane L', 'firstName': 'Loic'

In [None]:
!nslookup api.europepmc.org

Server:		127.0.0.11
Address:	127.0.0.11#53

** server can't find api.europepmc.org: NXDOMAIN



In [7]:
import requests

# Function to query Europe PMC API
def query_europe_pmc(query, max_results=5):
    base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    params = {
        "query": query,
        "resultType": "core",  # 'core' gives the main information like title, authors, etc.
        "pageSize": max_results,
        "format": "json"  # Get data in JSON format
    }

    # Send GET request
    response = requests.get(base_url, params=params)
    data = response.json()

    # Check if results are returned
    if data['hitCount'] > 0:
        for i, doc in enumerate(data['resultList']['result']):
            print(f"Paper {i+1}:")
            print(f"Title: {doc.get('title', 'N/A')}")
            #print(f"Authors: {', '.join([author['name'] for author in doc.get('authorList', {}).get('author', [])]) if 'authorList' in doc else 'N/A'}")
            #print(f"Abstract: {doc.get('abstractText', 'N/A')}")

            # Extract DOI or PMID
            pmid = doc.get('pmcid', None)
            doi = doc.get('doi', None)

            # Use PMID or DOI to get full text
            if pmid:
                print(f"PMID: {pmid}")
                fetch_full_text(pmid)
            elif doi:
                print(f"DOI: {doi}")
                fetch_full_text_by_doi(doi)
            else:
                print("No full text ID available.")
            print("-" * 80)
    else:
        print("No results found.")

# Function to fetch full text using PMID
def fetch_full_text(pmid):
    # Construct the full-text URL using PMID
    pmid = 'PMC'+pmid
    full_text_url = f"https://europepmc.org/articles/{pmid}?pdf=render"
    download_paper(full_text_url)

# Function to fetch full text using DOI
def fetch_full_text_by_doi(doi):
    # Construct the full-text URL using DOI (Europe PMC often redirects to publisher)
    full_text_url = f"https://doi.org/{doi}"
    download_paper(full_text_url)

# Function to download the paper
def download_paper(article_url):
    save_as= article_url.split('/')[-1].split('?')[0]+'.pdf'
    try:
        # Set headers to mimic a browser (sometimes necessary)
        headers = {
            "User-Agent": "Mozilla/5.0"
        }

        # Send request
        response = requests.get(article_url, headers=headers)

        # Check if successful
        if response.status_code == 200:
            with open(save_as, "wb") as f:
                f.write(response.content)
            print(f"PDF downloaded successfully and saved as '{save_as}'")
        else:
            print(f"Failed to download PDF. Status code: {response.status_code}")
    except Exception as e:
        print(f"Error: {e}")

# Query example
query_europe_pmc("Marketing in Pharma or Healthcare", max_results=3)


{'version': '6.9', 'hitCount': 201808, 'nextCursorMark': 'AoIIQJIuzyg1MjIxMjU1Ng==', 'nextPageUrl': 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=Marketing in Pharma or Healthcare&cursorMark=AoIIQJIuzyg1MjIxMjU1Ng==&resultType=core&pageSize=3&format=json', 'request': {'queryString': 'Marketing in Pharma or Healthcare', 'resultType': 'core', 'cursorMark': '*', 'pageSize': 3, 'sort': '', 'synonym': False}, 'resultList': {'result': [{'id': '40056372', 'source': 'MED', 'pmid': '40056372', 'pmcid': 'PMC12006214', 'fullTextIdList': {'fullTextId': ['PMC12006214']}, 'doi': '10.1007/s12325-025-03135-5', 'title': 'Safety of iGlarLixi in Japanese People with Type 2 Diabetes: A Post-marketing Database Study.', 'authorString': 'Kaneto H, Hatanaka M, Morimoto Y, Takahashi Y, Terauchi Y.', 'authorList': {'author': [{'fullName': 'Kaneto H', 'firstName': 'Hideaki', 'lastName': 'Kaneto', 'initials': 'H', 'authorId': {'type': 'ORCID', 'value': '0000-0001-7898-1943'}, 'authorAffiliationDe

In [None]:
https://europepmc.org/articles/40112233

In [5]:
def download_paper(article_url, save_as="paper.pdf"):
    try:
        # Set headers to mimic a browser (sometimes necessary)
        headers = {
            "User-Agent": "Mozilla/5.0"
        }

        # Send request
        response = requests.get(article_url, headers=headers)

        # Check if successful
        if response.status_code == 200:
            with open(save_as, "wb") as f:
                f.write(response.content)
            print(f"PDF downloaded successfully and saved as '{save_as}'")
        else:
            print(f"Failed to download PDF. Status code: {response.status_code}")
    except Exception as e:
        print(f"Error: {e}")

# Example usage
download_paper("https://europepmc.org/articles/PMC11949217?pdf=render", "PMC11949217.pdf")

PDF downloaded successfully and saved as 'PMC11949217.pdf'


In [None]:
"https://europepmc.org/articles/{pmid}?pdf=render"