<a href="https://colab.research.google.com/github/Judykimani1/Judykimani1/blob/main/Judy_kimani.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# STEP 1: Install required libraries
!pip install llama-index llama-index-embeddings-huggingface llama-index-llms-huggingface bitsandbytes --quiet

In [23]:
# STEP 2: Import libraries
import os
import json
import pandas as pd
from tqdm import tqdm
from llama_index.core import Document, VectorStoreIndex, StorageContext, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core import StorageContext, load_index_from_storage



In [24]:
# prompt: how to mount my dataset from google drive

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
# Load metadata
df = pd.read_csv("/content/drive/MyDrive/cord19dataset/metadata.csv", low_memory=False)

# Fill missing titles and abstracts with empty strings
df["title"] = df["title"].fillna("").str.lower()
df["abstract"] = df["abstract"].fillna("").str.lower()

# Define keywords
smoking_keywords = ["smoking", "tobacco", "nicotine", "cigarette", "vaping"]
covid_keywords = ["covid", "covid-19", "sars-cov-2", "coronavirus", "novel coronavirus"]

# Filter for papers containing both smoking-related and COVID-19-related terms
is_smoking_related = df["title"].str.contains("|".join(smoking_keywords)) | df["abstract"].str.contains("|".join(smoking_keywords))
is_covid_related = df["title"].str.contains("|".join(covid_keywords)) | df["abstract"].str.contains("|".join(covid_keywords))

filtered_df = df[is_smoking_related & is_covid_related]

# Save results
filtered_df.to_csv("filtered_metadata_smoking_covid.csv", index=False)

# Show how many papers matched
print(f"Found {len(filtered_df)} papers related to smoking and COVID-19.")

Found 63 papers related to smoking and COVID-19.


In [26]:
print(df.columns.tolist())


['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license', 'abstract', 'publish_time', 'authors', 'journal', 'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_pdf_parse', 'has_pmc_xml_parse', 'full_text_file', 'url']


In [27]:
df_filtered = pd.read_csv("filtered_metadata_smoking_covid.csv")


In [28]:
# STEP 3: Load filtered metadata
df_filtered = pd.read_csv("filtered_metadata_smoking_covid.csv")  # Your keyword-filtered metadata
df_filtered = df_filtered[df_filtered['full_text_file'].notnull()]

In [29]:
# STEP 4: Set up the LLM agent
llm = HuggingFaceLLM(
    model_name="colesmcintosh/Llama-3.2-1B-Instruct-Mango",
    tokenizer_name="colesmcintosh/Llama-3.2-1B-Instruct-Mango",
    context_window=2048,
    max_new_tokens=256,
    device_map="cuda",
    generate_kwargs={"temperature": 0.7, "do_sample": True},
)
Settings.llm = llm

# Use index_storage_dir instead of persist_dir
storage_context = StorageContext.from_defaults(persist_dir=index_storage_dir)
index = load_index_from_storage(storage_context)

chat_engine = index.as_chat_engine(
    chat_mode="context",
    memory=ChatMemoryBuffer.from_defaults(token_limit=3000),
    system_prompt="You are a helpful medical assistant. You only answer based on CORD-19 papers related to COVID-19 and smoking."
)



In [30]:
# STEP 5: Create the vector store index
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2", device="cuda")
Settings.embed_model = embed_model
Settings.llm = None
documents = [Document(text=row["abstract"]) for _, row in filtered_df.iterrows()]

index_storage_dir = "/content/drive/MyDrive/cord19dataset/cord19_index"
if not os.path.exists(index_storage_dir):
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=index_storage_dir)
else:
    print("Index already exists.")

LLM is explicitly disabled. Using MockLLM.
Index already exists.


In [31]:
# STEP 6: Query loop
while True:
    query = input("You: ")
    if query.lower() == "quit":
        break
    response = chat_engine.chat(query)
    print(f"Agent: {response.response}")

You: what is smoking
Agent: Smoking refers to the act of burning tobacco or tobacco products, such as cigarettes, cigars, and pipes. Smoking can also refer to the act of inhaling the smoke from these products. Smoking is a major risk factor for many diseases, including lung cancer, heart disease, and other respiratory problems.
You: difference between nicotine and tobacco
Agent: Nicotine is the addictive substance found in tobacco products, while tobacco is the plant from which it is derived. Tobacco refers to the dried leaves of the tobacco plant, while nicotine is the chemical compound that is present in tobacco.


KeyboardInterrupt: Interrupted by user