# Extract insights using an LLM

In [1]:
import pandas as pd 
import re 
import numpy as np 
import json
from ast import literal_eval
pd.set_option('display.max_colwidth', 200)
from time import time

In [2]:
# Read the filtered-articles CSV and show its (rows, columns) shape.
filtered_csv_path = "../outputs/filtered_articles_df.csv"
df_filtered = pd.read_csv(filtered_csv_path)
df_filtered.shape

(31, 6)

In [3]:
# Show one randomly chosen row (no seed is set, so the row differs between runs).
df_filtered.sample(1)

Unnamed: 0,pubmed_id,title,journal,keywords,abstract,llm_class
16,40166174,High-throughput single cell -omics using semi-permeable capsules.,bioRxiv : the preprint server for biology (bioRxiv),,Biological systems are inherently complex and heterogeneous. Deciphering this complexity increasingly relies on high-throughput analytical methods and tools that efficiently probe the cellular ph...,relevant


In [8]:
def write_full_text(pubmed_id,title,abstract):
    """Render one article as a labelled plain-text block.

    Args:
        pubmed_id: Article identifier; converted with ``str()``.
        title: Article title (must be a ``str``).
        abstract: Article abstract (must be a ``str``).

    Returns:
        The ``PUBMED_ID``/``Title``/``Abstract`` block with surrounding
        whitespace stripped.

    Raises:
        TypeError: If ``title`` or ``abstract`` is not a string.
    """
    id_line = "PUBMED_ID :" + str(pubmed_id)
    pieces = [
        id_line, "\n",
        "Title :", title, "\n\n",
        "Abstract :\n", abstract, "\n",
    ]
    # str.join rejects non-string items, so bad inputs still fail with TypeError.
    return "".join(pieces).strip()

In [9]:
def _row_to_full_text(row):
    """Format a single article row with write_full_text."""
    return write_full_text(row["pubmed_id"], row["title"], row["abstract"])

# Row-wise apply: one formatted text block per article, stored in a new column.
df_filtered["full_text"] = df_filtered.apply(_row_to_full_text, axis=1)

In [10]:
# Spot-check one random row now that the full_text column has been added.
df_filtered.sample(1)

Unnamed: 0,pubmed_id,title,journal,keywords,abstract,llm_class,full_text
11,40157909,A novel clinically relevant antagonistic interplay between prolactin and oncogenic YAP-CCN2 pathways as a differentiation therapeutic target in breast cancer.,Cell death & disease (Cell Death Dis),,"Cellular differentiation limits cellular plasticity allowing cells to attain their specialized functional characteristics and phenotypes, whereas loss of differentiation is a hallmark of cancer. ...",relevant,PUBMED_ID :40157909\nTitle :A novel clinically relevant antagonistic interplay between prolactin and oncogenic YAP-CCN2 pathways as a differentiation therapeutic target in breast cancer.\n\nAbstra...


In [11]:
# print() (rather than rich display) so the embedded "\n" characters render as line breaks.
# [0] selects the row whose index label is 0.
print(df_filtered["full_text"][0])

PUBMED_ID :40207234
Title :The distinctive signature of regulatory CD4 T cells committed in the human thymus.

Abstract :
 Thymically committed regulatory CD4 T cells (tTregs) are essential for immune homeostasis and self- tolerance. We established the human tTreg Expression Signature by comparing genome-wide transcriptomic profiles between tTregs and their conventional counterparts (tTconvs). We further exploited the high sequencing depth of our bulk RNA-seq data to identify a subset of 250 genes significantly expressed in human tTregs and with neglectable expression in tTconvs, defined as below the levels of expression of <i>IL2RA</i>, that we named thymic Treg "private" genes. Notably, pathways related to cell motility, inflammation, and T-cell effector specification were overrepresented within the tTreg private genes. We found that 163 of these genes were significantly less expressed in circulating naïve and memory Tregs when compared to peripheral data generated in parallel. This 

In [12]:
# Plain Python list of the article texts, in DataFrame row order.
full_text_column = df_filtered["full_text"]
text_list = full_text_column.to_list()
len(text_list)

31

In [24]:
# Extraction prompt template, filled in with str.format().
# `{article}` is the only placeholder; the doubled braces `{{` / `}}` are format
# escapes that come out as single literal braces around the JSON skeleton.
prompt = """
You are a biomedical research assistant. Given the article below, extract the following:

1. Diseases mentioned
2. Genes or Proteins discussed
3. Biological Pathways referenced
4. Experimental Methods used
5. A 1–2 sentence summary of the article's key scientific finding

### ARTICLE STARTS ###
{article}
### ARTICLE ENDS ###

Respond in the following JSON format:

{{
  "pubmed_id": "",
  "diseases": [...],
  "genes_proteins": [...],
  "pathways": [...],
  "methods": [...],
  "summary": ""
}}

Do not add any additional texts or explanation.
""".strip()

In [25]:
# One fully rendered prompt per article, in the same order as text_list.
prompt_batched = [
    prompt.format(article=article_text)
    for article_text in text_list
]
len(prompt_batched)

31

### Loading LLM

In [16]:
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

In [None]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment; without this call the import above has no effect.
load_dotenv()

# The key is read from the environment so it never appears in the notebook.
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    # Fail here with a clear message rather than later inside the client.
    raise RuntimeError("GROQ_API_KEY is not set; export it or add it to a .env file.")

In [17]:
# Chat model client used for all extraction calls in this notebook.
# Sampling temperature is fixed at 0 and the API key comes from `groq_api_key`.
llm = ChatGroq(
    temperature = 0,
    groq_api_key = groq_api_key,
    model_name = "llama-3.1-8b-instant"
)

In [30]:
# Smoke test: send only the first prompt before running the whole batch.
res = llm.invoke(prompt_batched[0])

In [31]:
# Show the exact text that was sent for the first article.
print(prompt_batched[0])

You are a biomedical research assistant. Given the article below, extract the following:

1. Diseases mentioned
2. Genes or Proteins discussed
3. Biological Pathways referenced
4. Experimental Methods used
5. A 1–2 sentence summary of the article's key scientific finding

### ARTICLE STARTS ###
PUBMED_ID :40207234
Title :The distinctive signature of regulatory CD4 T cells committed in the human thymus.

Abstract :
 Thymically committed regulatory CD4 T cells (tTregs) are essential for immune homeostasis and self- tolerance. We established the human tTreg Expression Signature by comparing genome-wide transcriptomic profiles between tTregs and their conventional counterparts (tTconvs). We further exploited the high sequencing depth of our bulk RNA-seq data to identify a subset of 250 genes significantly expressed in human tTregs and with neglectable expression in tTconvs, defined as below the levels of expression of <i>IL2RA</i>, that we named thymic Treg "private" genes. Notably, pathwa

In [32]:
# Raw reply text for the first article, before any parsing.
print(res.content)

{
  "pubmed_id": "40207234",
  "diseases": ["immune homeostasis", "self-tolerance"],
  "genes_proteins": ["IL2RA"],
  "pathways": ["cell motility", "inflammation", "T-cell effector specification"],
  "methods": ["genome-wide transcriptomic profiles", "bulk RNA-seq data"],
  "summary": "The study established a unique expression signature for regulatory CD4 T cells committed in the human thymus, identifying a subset of 'private' genes with higher activity in the thymus compared to peripheral compartments."
}


In [34]:
from time import time 

In [35]:
def _parse_llm_json(raw):
    """Parse one model reply into a dict.

    Strict JSON is tried first so that ``null``/``true``/``false`` are
    understood; ``literal_eval`` is the fallback for replies written as Python
    literals (for example with single-quoted keys).

    Args:
        raw: The reply content returned by the model.

    Returns:
        The parsed reply as a ``dict``.

    Raises:
        TypeError: If ``raw`` is not a string.
        ValueError: If the parsed value is not a dict, or the text is not a
            valid literal.
        SyntaxError: If neither parser can read the text.
    """
    if not isinstance(raw, str):
        raise TypeError(f"expected a string reply, got {type(raw).__name__}")
    cleaned = raw.strip()
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        parsed = literal_eval(cleaned)
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


li_out = []
# (position in prompt_batched, error) for every reply that could not be parsed,
# so one bad reply neither aborts the run nor re-appends the previous record.
li_failed = []
start_time = time()
for idx, p in enumerate(prompt_batched):
    out = llm.invoke(p)
    try:
        li_out.append(_parse_llm_json(out.content))
    except (TypeError, ValueError, SyntaxError) as err:
        li_failed.append((idx, repr(err)))
end_time = time()
print(f"Length of output: {len(li_out)}, Time Taken : {end_time - start_time} seconds")
if li_failed:
    print(f"Unparsed responses (index, error): {li_failed}")

Length of output: 31, Time Taken : 175.707022190094 seconds


In [38]:
# Preview the first three parsed records.
li_out[:3]

[{'pubmed_id': '40207234',
  'diseases': ['immune homeostasis', 'self-tolerance'],
  'genes_proteins': ['IL2RA'],
  'pathways': ['cell motility',
   'inflammation',
   'T-cell effector specification'],
  'methods': ['genome-wide transcriptomic profiles', 'bulk RNA-seq data'],
  'summary': "The study established a unique expression signature for regulatory CD4 T cells committed in the human thymus, identifying a subset of 'private' genes with higher activity in the thymus compared to peripheral compartments."},
 {'pubmed_id': '40223063',
  'diseases': ['Acute Myeloid Leukemia (AML)'],
  'genes_proteins': ['FGFBP2',
   'GZMB',
   'GZMH',
   'IKZF3',
   'IL2RB',
   'KLRB1',
   'KLRC2',
   'RHOF',
   'RUNX3',
   'STAT4'],
  'pathways': ['Immune microenvironment', 'NKT cell pathway'],
  'methods': ['Single-sample gene set enrichment analysis (ssGSEA)',
   'Weighted gene co-expression network analysis (WGCNA)',
   'Cox Lasso regression model',
   'Cox random forest',
   'Kaplan-Meier surviva

In [39]:
# One row per parsed record; the dict keys become the columns.
df_insights = pd.DataFrame(li_out)

In [41]:
# (rows, columns) of the insights table.
df_insights.shape

(31, 6)

In [42]:
# Eyeball two random extraction results (unseeded, so the rows differ between runs).
df_insights.sample(2)

Unnamed: 0,pubmed_id,diseases,genes_proteins,pathways,methods,summary
16,40166174,"[Hematopoietic disorders, Acute myeloid leukemia (AML)]",[],[],"[Digital PCR, Genome sequencing, Single-cell RNA-sequencing (scRNA-Seq), FACS-based isolation, Semi-permeable capsules (SPCs)]","The article presents a versatile technology using semi-permeable capsules for high-throughput single-cell -omics assays, demonstrating its application in uncovering changes in transcriptomes assoc..."
10,39908652,[Neuroblastoma],"[Midkine (MDK), Macrophage Migration Inhibitory Factor (MIF), GPC2, B7-H3]",[Immunosuppressive tumor microenvironments (TME)],"[Single-cell RNA sequencing (scRNA-seq), Bulk-RNA sequencing, Mass-spectrometry, PROTAC technology, In vitro and in vivo functional assays]",Blocking MIF secretion enhances CAR T-cell efficacy against neuroblastoma by significantly improving activation of CAR T-cells.


In [44]:
# Number of distinct article IDs on each side. Equal counts alone do not prove
# that both frames contain the same set of IDs.
df_insights.pubmed_id.nunique(),df_filtered.pubmed_id.nunique()

(31, 31)

### Merge filtered df with insights to create a metadata file

In [49]:
# Compare the element type of the join key in both frames before merging.
# The trailing comma is harmless: the expression is already a tuple.
type(df_filtered["pubmed_id"][0]),type(df_insights["pubmed_id"][0]),

(numpy.int64, str)

In [50]:
# Convert every article ID to str so the join key matches the string IDs in df_insights.
pubmed_ids_as_text = df_filtered["pubmed_id"].map(str)
df_filtered["pubmed_id"] = pubmed_ids_as_text

In [51]:
# Inner join of article metadata with the extracted insights on the article ID.
# validate="one_to_one" makes pandas raise MergeError if an ID is duplicated on
# either side, instead of silently multiplying rows. The key on the insights
# side is taken from the model's reply, so uniqueness is worth enforcing.
df_insights_meta = pd.merge(
    df_filtered,
    df_insights,
    on="pubmed_id",
    how="inner",
    validate="one_to_one",
)

In [54]:
# (rows, columns) after the merge; an inner join drops rows whose ID has no match.
df_insights_meta.shape

(31, 12)

In [55]:
# Per-column count of missing values in the merged frame.
df_insights_meta.isna().sum()

pubmed_id         0
title             0
journal           0
keywords          0
abstract          0
llm_class         0
full_text         0
diseases          0
genes_proteins    0
pathways          0
methods           0
summary           0
dtype: int64

In [56]:
# Write both tables to CSV without the index column.
# NOTE: the list-valued columns (diseases, genes_proteins, pathways, methods)
# are stored as their text representation, so a reader has to parse them back
# into lists after read_csv.
df_insights_meta.to_csv("../outputs/insights_meta.csv", index = False)
df_insights.to_csv("../outputs/insights.csv", index = False)

In [57]:
# orient="records" writes a JSON array with one object per article.
df_insights.to_json("../outputs/extracted_insights.json", orient= "records")

In [59]:
# Read the exported JSON back in as a round-trip check.
with open("../outputs/extracted_insights.json",'r') as insights_file:
    raw_json = insights_file.read()
    data = json.loads(raw_json)

In [61]:
# First three records as reloaded from the JSON file.
data[:3]

[{'pubmed_id': '40207234',
  'diseases': ['immune homeostasis', 'self-tolerance'],
  'genes_proteins': ['IL2RA'],
  'pathways': ['cell motility',
   'inflammation',
   'T-cell effector specification'],
  'methods': ['genome-wide transcriptomic profiles', 'bulk RNA-seq data'],
  'summary': "The study established a unique expression signature for regulatory CD4 T cells committed in the human thymus, identifying a subset of 'private' genes with higher activity in the thymus compared to peripheral compartments."},
 {'pubmed_id': '40223063',
  'diseases': ['Acute Myeloid Leukemia (AML)'],
  'genes_proteins': ['FGFBP2',
   'GZMB',
   'GZMH',
   'IKZF3',
   'IL2RB',
   'KLRB1',
   'KLRC2',
   'RHOF',
   'RUNX3',
   'STAT4'],
  'pathways': ['Immune microenvironment', 'NKT cell pathway'],
  'methods': ['Single-sample gene set enrichment analysis (ssGSEA)',
   'Weighted gene co-expression network analysis (WGCNA)',
   'Cox Lasso regression model',
   'Cox random forest',
   'Kaplan-Meier surviva

In [62]:
# Look up a single article in the merged table by its (string) PubMed ID.
is_target_article = df_insights_meta["pubmed_id"] == '40223063'
df_insights_meta[is_target_article]

Unnamed: 0,pubmed_id,title,journal,keywords,abstract,llm_class,full_text,diseases,genes_proteins,pathways,methods,summary
1,40223063,Prognostic value of natural killer T cell related genes in acute myeloid leukemia.,Cancer cell international (Cancer Cell Int),"Acute myeloid leukemia, NKT cell, Prognosis, WGCNA",Acute myeloid leukemia (AML) is a hematological malignancy characterized by complex immune microenvironment. This study aims to identify immune-related prognostic biomarkers in AML. Multiple pub...,relevant,PUBMED_ID :40223063\nTitle :Prognostic value of natural killer T cell related genes in acute myeloid leukemia.\n\nAbstract :\n Acute myeloid leukemia (AML) is a hematological malignancy characteri...,[Acute Myeloid Leukemia (AML)],"[FGFBP2, GZMB, GZMH, IKZF3, IL2RB, KLRB1, KLRC2, RHOF, RUNX3, STAT4]","[Immune microenvironment, NKT cell pathway]","[Single-sample gene set enrichment analysis (ssGSEA), Weighted gene co-expression network analysis (WGCNA), Cox Lasso regression model, Cox random forest, Kaplan-Meier survival analyses, RT-qPCR, ...","This study identified key prognostic genes in Acute Myeloid Leukemia (AML) and highlighted the critical role of NKT cells in AML pathogenesis, providing new insights and potential biomarkers for u..."
