In [None]:
#pip install langchain langchain_chroma langchain_openai langchain_core biopython fuzzywuzzy

In [1]:
import pandas as pd
from Bio import Entrez
import json
import requests
import xml.etree.ElementTree as ET
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
import os

from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import time

from langchain_core.vectorstores import VectorStoreRetriever
from typing import List, Tuple
from langchain_core.runnables import chain
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA



In [2]:
from google.colab import userdata
os.environ['OPENAI_API_KEY']= userdata.get('OPENAI_API_KEY')

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
os.chdir('/content/drive/MyDrive/Colab Notebooks/Youtube claims project')
!ls

 PubMed_API.ipynb   pubmed_temp   scimagojr_2023.csv  'Supply Chain Risk.ipynb'


In [5]:
topics= ["Green tea", "liver", "cancer"]
claim= "Green tea cures liver cancer"
queries= []

Entrez.email= "nithinpradeep38@gmail.com"
# Define date range
date_range = '("2010/03/01"[Date - Create] : "2024/07/31"[Date - Create])'
if topics:
    topic_queries = ['{}[Title/Abstract]'.format(topic) for topic in topics]
    queries.append('(' + ' AND '.join(topic_queries) + ')')

full_query = ' AND '.join(queries) + ' AND ' + date_range

In [6]:


def fetch_pmc_conclusions(pmcid):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    efetch_url = f"{base_url}efetch.fcgi?db=pmc&id={pmcid}&retmode=xml"
    response = requests.get(efetch_url)

    if response.status_code != 200:
        print(f"Error fetching article: HTTP {response.status_code}")
        return None

    root = ET.fromstring(response.content)

    def extract_text(element):
        text = element.text or ""
        for child in element:
            if child.tag == 'xref':
                text += child.text or ""
            else:
                text += extract_text(child)
            text += child.tail or ""
        return text

    # Look for 'Conclusions' section
    conclusions = ""
    for section in root.findall(".//sec"):
        section_title = section.find("title")
        if section_title is not None and section_title.text and "conclusion" in section_title.text.lower():
            for p in section.findall(".//p"):
                conclusions += extract_text(p) + "\n"
            break

    if not conclusions:
        conclusions = "Conclusions section not found in the article."

    return conclusions.strip()

In [7]:
# Search PubMed for relevant records
handle = Entrez.esearch(db='pubmed', retmax=11, term=full_query)
record = Entrez.read(handle)
id_list = record['IdList']

# DataFrame to store the extracted data
df = pd.DataFrame(columns=['PMID','Title', 'Abstract', 'Journal', 'URL', 'PMCID', 'Conclusions'])

In [8]:
# Fetch information for each record in the id_list
for pmid in id_list:
    handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
    records = Entrez.read(handle)

    # Fetch PMCID
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    elink_url = f"{base_url}elink.fcgi?dbfrom=pubmed&db=pmc&id={pmid}&retmode=xml"
    pmc_response = requests.get(elink_url)
    pmcid = None
    conclusions= "No PMC article available"
    if pmc_response.status_code == 200:
        root = ET.fromstring(pmc_response.content)
        pmcid_element = root.find(".//LinkSetDb/Link/Id")
        if pmcid_element is not None:
            pmcid = f"PMC{pmcid_element.text}"
            conclusions= fetch_pmc_conclusions(pmcid)
    # Process each PubMed article in the response
    for record in records['PubmedArticle']:
        # Print the record in a formatted JSON style
        #print(json.dumps(record, indent=4, default=str))  # default=str handles types JSON can't serialize like datetime
        title = record['MedlineCitation']['Article']['ArticleTitle']
        abstract = ' '.join(record['MedlineCitation']['Article']['Abstract']['AbstractText']) if 'Abstract' in record['MedlineCitation']['Article'] and 'AbstractText' in record['MedlineCitation']['Article']['Abstract'] else ''
        journal = record['MedlineCitation']['Article']['Journal']['Title']
        url = f"https://www.ncbi.nlm.nih.gov/pubmed/{pmid}"

        new_row = pd.DataFrame({
            'PMID': [pmid],
            'Title': [title],
            'Abstract': [abstract],
            'Journal': [journal],
            'URL': [url],
            'PMCID' : [pmcid],
            'Conclusions': [conclusions]
        })

        df = pd.concat([df, new_row], ignore_index=True)
        time.sleep(2)

In [9]:
df

Unnamed: 0,PMID,Title,Abstract,Journal,URL,PMCID,Conclusions
0,38878905,Clinical effects of Hibiscus sabdariffa Linn. ...,Obesity is associated with many chronic non-co...,Complementary therapies in medicine,https://www.ncbi.nlm.nih.gov/pubmed/38878905,,No PMC article available
1,38810332,Serum untargeted metabolomics analysis of the ...,Pu-erh tea belongs to the six tea categories o...,Journal of pharmaceutical and biomedical analysis,https://www.ncbi.nlm.nih.gov/pubmed/38810332,,No PMC article available
2,38703282,A comprehensive review of phytoconstituents in...,Primary liver cancer is a type of cancer that ...,"Medical oncology (Northwood, London, England)",https://www.ncbi.nlm.nih.gov/pubmed/38703282,,No PMC article available
3,38675480,The Epidemiology of Newly Recognized Causes of...,The incidence and prevalence of drug-induced l...,"Pharmaceuticals (Basel, Switzerland)",https://www.ncbi.nlm.nih.gov/pubmed/38675480,PMC11053599,Conclusions section not found in the article.
4,38319384,Association between green tea intake and diges...,Previous observational studies have shown that...,European journal of nutrition,https://www.ncbi.nlm.nih.gov/pubmed/38319384,,No PMC article available
5,38161126,"Association between socio-demographic factors,...",Hypertension is a global health issue with inc...,"Nutrition, metabolism, and cardiovascular dise...",https://www.ncbi.nlm.nih.gov/pubmed/38161126,PMC11290630,This study identifies key factors affecting dy...
6,38031409,The effects of green tea extract supplementati...,Research indicates that green tea extract (GTE...,The British journal of nutrition,https://www.ncbi.nlm.nih.gov/pubmed/38031409,PMC11288358,This study suggests that JQP might reduce infl...
7,37724674,Medicinal Plant-Rich Diet: A Potential Therape...,Colorectal cancer is estimated to become the l...,Cardiovascular & hematological agents in medic...,https://www.ncbi.nlm.nih.gov/pubmed/37724674,,No PMC article available
8,37521645,Preparation and <i>in Vitro</i> Evaluation of ...,Tamoxifen is the drug of choice as hormonal th...,ACS omega,https://www.ncbi.nlm.nih.gov/pubmed/37521645,PMC10372931,"TMX-AG-INPs were produced\nin this study, and ..."
9,37066273,Inhibition of miR-214 expression by small mole...,"Predominantly, head and neck cancer (HNC) is c...",bioRxiv : the preprint server for biology,https://www.ncbi.nlm.nih.gov/pubmed/37066273,PMC10104035,Conclusions section not found in the article.


In [10]:
df1= pd.read_csv('scimagojr_2023.csv')
df1.head()

Unnamed: 0,Rank,Journal
0,1,Ca-A Cancer Journal for Clinicians
1,2,Foundations and Trends in Machine Learning
2,3,Nature Reviews Molecular Cell Biology
3,4,Quarterly Journal of Economics
4,5,Nature Reviews Cancer


In [11]:
def get_best_match(row, choices, scorer):
    best_match, score = process.extractOne(row, choices, scorer=scorer)
    return pd.Series([best_match, score])


# Create a list of names from df_b for matching
choices = df1['Journal'].tolist()

# Apply fuzzy matching to df_a
df[['best_match', 'score']] = df['Journal'].apply(lambda x: get_best_match(x, choices, scorer=fuzz.ratio))

# Merge based on the best match
merged_df = pd.merge(df, df1, left_on='best_match', right_on='Journal', how='left')

merged_df.drop(columns= ['score', 'best_match', 'Journal_y','Journal_x', 'PMCID'], inplace=True)

# Calculate the minimum and maximum values of the rank column
min_rank = merged_df['Rank'].min()
max_rank = merged_df['Rank'].max()

# Apply min-max normalization,
merged_df['normalized_rank'] = 1- (merged_df['Rank'] - min_rank) / (max_rank - min_rank)

merged_df.drop(columns= ['Rank'], inplace= True)

merged_df.head()

Unnamed: 0,PMID,Title,Abstract,URL,Conclusions,normalized_rank
0,38878905,Clinical effects of Hibiscus sabdariffa Linn. ...,Obesity is associated with many chronic non-co...,https://www.ncbi.nlm.nih.gov/pubmed/38878905,No PMC article available,0.876268
1,38810332,Serum untargeted metabolomics analysis of the ...,Pu-erh tea belongs to the six tea categories o...,https://www.ncbi.nlm.nih.gov/pubmed/38810332,No PMC article available,0.67813
2,38703282,A comprehensive review of phytoconstituents in...,Primary liver cancer is a type of cancer that ...,https://www.ncbi.nlm.nih.gov/pubmed/38703282,No PMC article available,0.0
3,38675480,The Epidemiology of Newly Recognized Causes of...,The incidence and prevalence of drug-induced l...,https://www.ncbi.nlm.nih.gov/pubmed/38675480,Conclusions section not found in the article.,0.264296
4,38319384,Association between green tea intake and diges...,Previous observational studies have shown that...,https://www.ncbi.nlm.nih.gov/pubmed/38319384,No PMC article available,1.0


In [12]:
# Create Documents with rank in metadata
documents = []
for index, row in merged_df.iterrows():
    content = f"Title: {row['Title']}\n\nAbstract: {row['Abstract']}\n\nConclusions: {row['Conclusions']}"
    doc = Document(
        page_content=content,
        metadata={
            "PMID": row['PMID'],
            "URL": row['URL'],
            "normalized_rank": row['normalized_rank']  # Include the rank in metadata
        }
    )
    documents.append(doc)

In [13]:
# openai embedding
openai_embed_model = OpenAIEmbeddings(model='text-embedding-3-small')

# create vector DB of docs and embeddings
vectorstore = Chroma.from_documents(documents=documents,
                                  collection_name='pubmed_temp',
                                  embedding=openai_embed_model,
                                  collection_metadata={"hnsw:space": "cosine"},
                                  persist_directory="./pubmed_temp")

In [25]:
vectorstore= Chroma(collection_name='pubmed_temp', embedding_function=openai_embed_model, persist_directory="./pubmed_temp")

class CustomRetriever(VectorStoreRetriever):
  vectorstore: Chroma
  topk: int = 5
  similarity_weight: float = 0.8
  search_type: str = "similarity"
  search_kwargs: dict = Field(default_factory=dict)


  def _get_relevant_documents(self, query: str) -> List[Document]:
      # Get similarity conversion function
      cosine_sim = self.vectorstore._select_relevance_score_fn()

      # Get top k documents with lowest cosine distance
      docs, scores = zip(*self.vectorstore.similarity_search_with_score(query, k=self.topk))
      scores= [cosine_sim(score) for score in scores]
      min_score= min(scores)
      max_score= max(scores)
      normalized_scores= [(score-min_score)/(max_score-min_score) for score in scores]
      # Calculate final score considering both similarity and rank
      for doc, score, normalized_score in zip(docs, scores, normalized_scores):
          # Normalize the cosine similarity scores
          doc.metadata['normalized_score'] = normalized_score
          rank = doc.metadata['normalized_rank']
          final_score = self.similarity_weight * normalized_score + (1 - self.similarity_weight) * rank
          doc.metadata['final_score'] = final_score

      # Sort documents by final score
      sorted_docs = sorted(docs, key=lambda x: x.metadata['final_score'], reverse=True)

      return sorted_docs[:self.topk]

custom_retriever = CustomRetriever(vectorstore=vectorstore)

In [43]:

# Define your desired data structure - like a python data class.
scientific_validation_summary_task="""Provide scientific Validation summary in less than 25 words:**
   - Conduct a thorough review of the retrieved context for studies related to the provided claim.
   - Prioritize peer-reviewed journals, with special emphasis on systematic reviews, cohort studies, meta-analyses and randomized controlled trials (RCTs), if available in the context as they are high quality scientific evidence.
   - Do not consider case reports, case series, opinion pieces or observational studies and do not make up research papers as they are low quality evidence.
   - Evaluate the strength of evidence supporting the claim, as well as any contradictory or inconclusive findings.
   - If no relevant content available in the provided only then conduct a thorough review of reputable medical research databases like PubMed.
   """

classification_task= """Based on the context used to summarize in the above task, classify the claim as one of the following:
**Scientific**: Supported by substantial, high-quality scientific evidence.
**Pseudo-science/Inconclusive**: Not supported by strong and credible evidence OR supported only by inconclusive scientific evidence, or contradicted by substantial evidence.
**Partially correct**: Supported by substantial scientific evidence but with significant caveats."""

research_summary_task= """Research Summary in less than 25 words:Provide a concise summary of the research findings that support your classification."""

contradictory_claims_task= """Contradictory Claims in less than 25 words: Identify if there are any scientifically supported evidence that contradicts the original claim or pose any health risks.
If such evidence is found, explain why the contradicting claim is scientifically valid."""


class QueryResponse(BaseModel):
    scientific_validation_summary: str = Field(description=scientific_validation_summary_task)
    classification: str = Field(description=classification_task)
    research_summary: str = Field(description=research_summary_task)
    contradictory_claims: str = Field(description=contradictory_claims_task)


# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=QueryResponse)

gpt_prompt_txt= """
You are a medical researcher.Given the following health-related claim, generate the response based on the tasks specified in the following instructions:
claim= {claim}
context= {context}
Format Instructions: {format_instructions}
"""
gpt_prompt = PromptTemplate(
    template=gpt_prompt_txt,
    input_variables=["claim"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [44]:
chatgpt = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
qa_rag_chain = RetrievalQA.from_chain_type(llm=chatgpt,
                                           chain_type="stuff",
                                           retriever=custom_retriever,
                                           chain_type_kwargs={"prompt": gpt_prompt})

In [52]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(documents):
    return "\n\n".join(doc.page_content for doc in documents)

qa_rag_chain = (
    {
        "context": (custom_retriever
                      |
                    format_docs),
        "claim": RunnablePassthrough()

    }
      |
    gpt_prompt
      |
    chatgpt
)

In [59]:
claim= "Green tea cures liver cancer"
result= qa_rag_chain.invoke(claim)

In [60]:
result

AIMessage(content='```json\n{\n  "scientific_validation_summary": "Green tea catechins show potential anticancer effects, but no conclusive evidence supports green tea as a cure for liver cancer.",\n  "classification": "Pseudo-science/Inconclusive",\n  "research_summary": "Studies indicate green tea catechins may inhibit cancer cell growth, but no causal link to curing liver cancer is established.",\n  "contradictory_claims": "No significant association found between green tea intake and liver cancer risk in large population studies."\n}\n```', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 109, 'prompt_tokens': 2685, 'total_tokens': 2794}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_48196bc67a', 'finish_reason': 'stop', 'logprobs': None}, id='run-aa930728-08d6-4275-9e89-f9408854f203-0', usage_metadata={'input_tokens': 2685, 'output_tokens': 109, 'total_tokens': 2794})

Get Youtube URL--> Check whether health domain or not-->  Summarization--> Claim extraction--> Key word extraction--> Pub-med article, abstract, conclusions etc.,--> Convert to LangChain document format--> Store in vector database--> Retriever--> RAG Chain--> Output