In [25]:
import json
import os
import re
from ast import literal_eval
from time import time

import numpy as np
import pandas as pd

pd.set_option('display.max_colwidth', 200)

In [10]:
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate

In [2]:
# Read the previously parsed CSV into a DataFrame and show its (rows, columns) shape.
parsed_csv_path = "../outputs/parsed_md_files.csv"
df_parsed = pd.read_csv(parsed_csv_path)
df_parsed.shape

(50, 5)

In [4]:
# Eyeball four randomly drawn rows (unseeded, so the rows differ on every run).
df_parsed.sample(n=4)

Unnamed: 0,pubmed_id,title,journal,keywords,abstract
46,40095971,Electroporation-Based CRISPR-Cas9-Mediated Gene Knockout in THP-1 Cells and Single-Cell Clone Isolation.,Journal of visualized experiments : JoVE (J Vis Exp),,"The human acute monocytic leukemia (AML) THP-1 cell line is widely used as a model to study the functions of human monocyte-derived macrophages, including their interplay with significant human p..."
14,40201592,Modulation of cell fate by shock wave therapy in ischaemic heart disease.,European heart journal open (Eur Heart J Open),"Cardiac surgery, Heart regeneration, Shockwave therapy, Therapeutic transdifferentiation","Cardiac shockwave therapy (SWT) improves left ventricular (LV) function in patients with ischaemic cardiomyopathy. Shockwave therapy activates Toll-like receptor 3 (TLR3), a receptor-inducing chr..."
38,40214805,The effect of TIGIT and PD1 expression on T cell function and prognosis in adult patients with acute myeloid leukemia at diagnosis.,"Cancer immunology, immunotherapy : CII (Cancer Immunol Immunother)","Newly-diagnosed acute myeloid leukemia, PD1, Relapse-free survival, TIGIT, scRNA-seq","T cell immunoreceptor with immunoglobulin and ITIM domain (TIGIT) is a recently-identified immune checkpoint molecule, and no study ever explores the prognostic significance of TIGIT on bone marr..."
20,40186195,CXCL12/CXCR4 axis mediates CD8 <sup>+</sup> T cell overactivation in the progression of viral myocarditis.,Journal of translational medicine (J Transl Med),"CD8+T cell, CXCR4, Myocarditis, Single-cell RNA sequencing","Myocarditis is a common inflammatory heart disease in children and young adults, with fulminant myocarditis (FM) being the most severe form due to its rapid onset and high mortality rate. However..."


In [13]:
# Build the Groq chat client used for the classification calls below.
#
# The API key is read from the GROQ_API_KEY environment variable instead of
# being embedded in the notebook: a key stored in a notebook leaks to anyone
# who can read the file or its version history. Any key that was previously
# committed in plain text should be treated as compromised and rotated.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "Set the GROQ_API_KEY environment variable before running this cell."
    )

llm = ChatGroq(
    temperature = 0,  # lowest sampling temperature, to keep labels as repeatable as possible
    groq_api_key = groq_api_key,
    model_name = "llama-3.1-8b-instant"
)

In [11]:
# Classification prompt template; it is filled in with str.format.
# - `{paragraph}` is the only placeholder and receives one abstract per call.
# - The doubled braces on the last line are str.format escapes, so the rendered
#   prompt shows the literal JSON skeleton {"class":" "}.
# - .strip() drops the leading/trailing newline introduced by the triple quotes.
prompt = """
you are a classification assistant that reads an article and classifies if it as relevant "cancer" or "immunology" related articles.

Classify the below paragraph as "relevant" if it is related to cancer or immunology, otherwise classify it as "irrelevant".
## Paragraph: 
{paragraph}

Return the output  as a json like {{"class":" "}}.
""".strip()

In [12]:
# Extract the abstract column as a plain Python list and show how many entries it has.
abstracts = df_parsed["abstract"].tolist()
len(abstracts)

50

In [14]:
# Render one fully formatted prompt per abstract, preserving the order of `abstracts`.
prompt_batched = list(
    map(lambda abstract: prompt.format(paragraph=abstract), abstracts)
)

In [26]:
def _parse_llm_json(raw, _depth=0):
    """Parse one LLM reply into a Python object.

    The prompt asks for JSON, so ``json.loads`` is tried first; it accepts
    ``null``/``true``/``false``, which ``ast.literal_eval`` rejects.
    ``literal_eval`` is kept as a fallback so single-quoted, Python-style
    dicts still parse. If the reply as a whole does not parse, the outermost
    ``{...}`` span is tried, which skips code fences or prose around the object.

    Args:
        raw: The text content of the model reply.
        _depth: Internal recursion guard; callers should not pass it.

    Returns:
        The parsed object (a dict for a well-formed reply).

    Raises:
        ValueError: If no candidate span can be parsed.
    """
    text = raw.strip()
    candidates = [text]
    braced = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if braced and braced.group(0) != text:
        candidates.append(braced.group(0))

    for candidate in candidates:
        for parser in (json.loads, literal_eval):
            try:
                parsed = parser(candidate)
            except (ValueError, SyntaxError, TypeError):
                continue
            # A reply that is itself a quoted string wrapping the JSON object
            # gets unwrapped once, so the result is the inner object.
            if isinstance(parsed, str) and _depth == 0:
                return _parse_llm_json(parsed, _depth=1)
            return parsed

    raise ValueError(f"Could not parse LLM reply as JSON: {raw!r}")


# Classify every prompt sequentially (one API call each) and time the whole run.
li_out = []
start_time = time()
for idx, p in enumerate(prompt_batched):
    out = llm.invoke(p)
    try:
        output = _parse_llm_json(out.content)
    except ValueError as err:
        # Name the offending item; replies collected so far remain in li_out.
        raise ValueError(f"Unparseable LLM reply for prompt #{idx}") from err
    li_out.append(output)

end_time = time()
print(f"Time_taken : {end_time - start_time}")

Time_taken : 185.9268500804901


In [27]:
# Number of parsed replies collected in li_out.
len(li_out)

50

In [31]:
# Pull the "class" label out of every reply. A reply that is still a string
# is evaluated into a Python object first, then indexed like the others.
cls = [
    (literal_eval(reply) if isinstance(reply, str) else reply)['class']
    for reply in li_out
]

In [33]:
# Number of extracted class labels.
len(cls)

50

In [34]:
# Attach the labels as a new column; `cls` must have one entry per row.
df_parsed.loc[:, "llm_class"] = cls

In [37]:
# Count the rows whose label is exactly "relevant".
int((df_parsed["llm_class"] == "relevant").sum())

31

In [39]:
# Spot-check seven randomly drawn rows together with their labels (unseeded).
df_parsed.sample(n=7)

Unnamed: 0,pubmed_id,title,journal,keywords,abstract,llm_class
35,40121237,MCL‑1 safeguards activated hair follicle stem cells to enable adult hair regeneration.,Nature communications (Nat Commun),,"Hair follicles cycle through expansion, regression and quiescence. To investigate the role of MCL‑1, a BCL‑2 family protein with anti‑apoptotic and apoptosis‑unrelated functions, we delete Mcl‑1 ...",relevant
36,40240239,Cystic Fibrosis-related neurodegenerative disease associated with tauopathy and cognitive decline in aged CF mice.,Journal of cystic fibrosis : official journal of the European Cystic Fibrosis Society (J Cyst Fibros),"Cognition, Cystic fibrosis, Dementia, Tauopathy","Highly effective modulator therapies (HEMT) are increasing the lifespan for many people with cystic fibrosis (pwCF), making it necessary to identify and understand CF specific age-related consequ...",irrelevant
11,40240564,"Characterization and genome analysis of the novel virulent Burkholderia phage Bm1, which is active against pan-drug-resistant Burkholderia multivorans.",Archives of virology (Arch Virol),,"The escalating challenges of antibiotic resistance in bacterial pathogens have necessitated the exploration of alternative therapeutic strategies. Among these, bacteriophage therapy has regained ...",irrelevant
10,40230848,The role of transketolase in the immunotherapy and prognosis of hepatocellular carcinoma: a multi-omics approach.,Frontiers in immunology (Front Immunol),"Hep-G2, TKT, hepatocellular carcinoma, pentose phosphate pathway, transketolase",To explore the role of transketolase (TKT) in the immunotherapy and prognosis of hepatocellular carcinoma (HCC). TKT expression across various cancers and its associations with tumor immunity an...,relevant
3,40202825,From Fat Providers to Cancer Therapy: Adipocytes as Unexpected Allies.,Cancer research (Cancer Res),,Adipocytes from white adipose tissue support cancer progression by supplying fatty acids to tumor cells while cold-activated brown adipose tissue has been shown to inhibit tumor growth by disrupt...,relevant
0,40207234,The distinctive signature of regulatory CD4 T cells committed in the human thymus.,Frontiers in immunology (Front Immunol),"CD4 T cells, FOXP3, RNA-seq, human T-cell development, human thymus, regulatory T cells",Thymically committed regulatory CD4 T cells (tTregs) are essential for immune homeostasis and self- tolerance. We established the human tTreg Expression Signature by comparing genome-wide transcr...,relevant
14,40201592,Modulation of cell fate by shock wave therapy in ischaemic heart disease.,European heart journal open (Eur Heart J Open),"Cardiac surgery, Heart regeneration, Shockwave therapy, Therapeutic transdifferentiation","Cardiac shockwave therapy (SWT) improves left ventricular (LV) function in patients with ischaemic cardiomyopathy. Shockwave therapy activates Toll-like receptor 3 (TLR3), a receptor-inducing chr...",irrelevant


In [40]:
# Missing-value count per column.
df_parsed.isnull().sum()

pubmed_id     0
title         0
journal       0
keywords     11
abstract      0
llm_class     0
dtype: int64

In [38]:
# Write the labelled table to disk without the index column.
classified_csv_path = "../outputs/llm_classified_dataset.csv"
df_parsed.to_csv(classified_csv_path, index=False)