------------------

In [None]:
!pip install langchain langchain-community jq langchain-huggingface faiss-cpu

Collecting langchain-community
  Downloading langchain_community-0.3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting jq
  Downloading jq-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.1-

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import JSONLoader
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import gc
import os
import time
import psutil
import pickle
import torch
import json

In [6]:
# Define file paths
json_file = "/content/drive/MyDrive/cleaned_wiki_articles.json"
output_pickle = "/content/drive/MyDrive/split_documents.pkl"

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)

# Load JSON data
with open(json_file, 'r') as f:
    data = json.load(f)

# Initialize storage for split documents
split_documents = []

# Process topics and chunk the documents
print("Processing and chunking documents...")
for topic, articles in tqdm(data.items(), desc="Processing topics"):
    for article in tqdm(articles, desc=f"Processing topic: {topic}"):
        # Extract relevant fields
        title = article.get("title", "Unknown Title")
        url = article.get("url", "No URL")
        summary = article.get("summary", "")
        metadata = {
            "topic": topic,
            "title": title,
            "url": url,
        }

        # Chunk the text
        chunks = text_splitter.split_text(summary)
        split_documents.extend([
            Document(
                page_content=chunk,
                metadata=metadata
            ) for chunk in chunks
        ])

    # Free memory after each topic
    gc.collect()

# Save split documents to a pickle file
with open(output_pickle, 'wb') as f:
    pickle.dump(split_documents, f)

print(f"Total chunks created: {len(split_documents)}")
print(f"Chunked documents saved to: {output_pickle}")

Processing and chunking documents...


Processing topics:   0%|          | 0/10 [00:00<?, ?it/s]
Processing topic: Health:   0%|          | 0/5559 [00:00<?, ?it/s][A
Processing topic: Health:   0%|          | 19/5559 [00:00<00:29, 188.77it/s][A
Processing topic: Health:   1%|          | 38/5559 [00:00<00:30, 182.12it/s][A
Processing topic: Health:   1%|          | 57/5559 [00:00<00:31, 174.62it/s][A
Processing topic: Health:   1%|▏         | 75/5559 [00:00<00:31, 174.29it/s][A
Processing topic: Health:   2%|▏         | 94/5559 [00:00<00:31, 173.16it/s][A
Processing topic: Health:   2%|▏         | 119/5559 [00:00<00:28, 193.10it/s][A
Processing topic: Health:   3%|▎         | 146/5559 [00:00<00:25, 212.34it/s][A
Processing topic: Health:   3%|▎         | 168/5559 [00:00<00:25, 213.28it/s][A
Processing topic: Health:   3%|▎         | 190/5559 [00:01<00:36, 148.47it/s][A
Processing topic: Health:   4%|▍         | 222/5559 [00:01<00:28, 185.25it/s][A
Processing topic: Health:   4%|▍         | 244/5559 [00:01<00:30, 1

Total chunks created: 3437657
Chunked documents saved to: /content/drive/MyDrive/split_documents.pkl


In [8]:
# Function to monitor memory usage
def monitor_memory():
    process = psutil.Process(os.getpid())
    mem = process.memory_info().rss / 1024 ** 2  # Convert to MB
    print(f"Memory usage: {mem:.2f} MB")

# Load saved chunks
with open('/content/drive/MyDrive/split_documents.pkl', 'rb') as f:
    split_documents = pickle.load(f)

print(f"Loaded {len(split_documents)} document chunks for embedding.")

# Initialize HuggingFace embeddings with GPU support (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-MiniLM-L6-v2",
    model_kwargs={"device": device}  # Specify the device
)

# Process embeddings in batches with progress tracking
batch_size = 50000  # Adjust as needed
print("Creating embeddings...")
start_time = time.time()

faiss_index = None
for batch_num, start_idx in enumerate(tqdm(range(0, len(split_documents), batch_size), desc="Processing embeddings")):
    end_idx = min(start_idx + batch_size, len(split_documents))
    batch_docs = split_documents[start_idx:end_idx]

    # Extract text and metadata
    batch_texts = [doc.page_content for doc in batch_docs]
    batch_metadatas = [doc.metadata for doc in batch_docs]  # Includes topic, title, and URL

    # If it's the first batch, initialize FAISS
    if faiss_index is None:
        faiss_index = FAISS.from_texts(
            texts=batch_texts,
            embedding=embedding_model,  # Pass the full embedding model here
            metadatas=batch_metadatas,  # Pass metadata
        )
    else:
        # Add embeddings and metadata incrementally
        faiss_index.add_texts(batch_texts, batch_metadatas)

    # Print progress and elapsed time
    elapsed_time = time.time() - start_time
    print(f"Batch {batch_num + 1} processed in {elapsed_time:.2f} seconds.")

    # Free memory after each batch
    del batch_docs, batch_texts, batch_metadatas
    gc.collect()
    monitor_memory()

# Save FAISS index
faiss_index.save_local("/content/drive/MyDrive/faiss_index")


end_time = time.time()
print(f"Embeddings created and saved to FAISS index in {end_time - start_time:.2f} seconds.")

Loaded 3437657 document chunks for embedding.
Using device: cuda


  embedding_model = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating embeddings...


Processing embeddings:   1%|▏         | 1/69 [00:19<22:04, 19.47s/it]

Batch 1 processed in 16.61 seconds.
Memory usage: 6959.98 MB


Processing embeddings:   3%|▎         | 2/69 [00:37<20:51, 18.68s/it]

Batch 2 processed in 34.73 seconds.
Memory usage: 7065.04 MB


Processing embeddings:   4%|▍         | 3/69 [00:55<20:18, 18.47s/it]

Batch 3 processed in 52.87 seconds.
Memory usage: 7167.35 MB


Processing embeddings:   6%|▌         | 4/69 [01:13<19:49, 18.31s/it]

Batch 4 processed in 70.82 seconds.
Memory usage: 7271.51 MB


Processing embeddings:   7%|▋         | 5/69 [01:32<19:34, 18.36s/it]

Batch 5 processed in 89.23 seconds.
Memory usage: 7377.30 MB


Processing embeddings:   9%|▊         | 6/69 [01:50<19:11, 18.28s/it]

Batch 6 processed in 107.37 seconds.
Memory usage: 7473.63 MB


Processing embeddings:  10%|█         | 7/69 [02:08<18:53, 18.28s/it]

Batch 7 processed in 125.55 seconds.
Memory usage: 7584.78 MB


Processing embeddings:  12%|█▏        | 8/69 [02:27<18:35, 18.28s/it]

Batch 8 processed in 143.78 seconds.
Memory usage: 7702.35 MB


Processing embeddings:  13%|█▎        | 9/69 [02:46<18:31, 18.52s/it]

Batch 9 processed in 162.70 seconds.
Memory usage: 7797.53 MB


Processing embeddings:  14%|█▍        | 10/69 [03:04<18:16, 18.58s/it]

Batch 10 processed in 181.40 seconds.
Memory usage: 7892.29 MB


Processing embeddings:  16%|█▌        | 11/69 [03:23<18:01, 18.65s/it]

Batch 11 processed in 200.08 seconds.
Memory usage: 7987.78 MB


Processing embeddings:  17%|█▋        | 12/69 [03:42<17:45, 18.69s/it]

Batch 12 processed in 218.82 seconds.
Memory usage: 8084.89 MB
Batch 13 processed in 237.62 seconds.


Processing embeddings:  19%|█▉        | 13/69 [04:01<17:30, 18.75s/it]

Memory usage: 8194.99 MB


Processing embeddings:  20%|██        | 14/69 [04:20<17:16, 18.84s/it]

Batch 14 processed in 256.68 seconds.
Memory usage: 8313.64 MB


Processing embeddings:  22%|██▏       | 15/69 [04:39<17:02, 18.94s/it]

Batch 15 processed in 275.64 seconds.
Memory usage: 8420.07 MB
Batch 16 processed in 294.98 seconds.


Processing embeddings:  23%|██▎       | 16/69 [04:58<16:49, 19.05s/it]

Memory usage: 8514.14 MB


Processing embeddings:  25%|██▍       | 17/69 [05:18<16:47, 19.37s/it]

Batch 17 processed in 314.81 seconds.
Memory usage: 8838.54 MB


Processing embeddings:  26%|██▌       | 18/69 [05:38<16:29, 19.41s/it]

Batch 18 processed in 334.45 seconds.
Memory usage: 8701.07 MB


Processing embeddings:  28%|██▊       | 19/69 [05:57<16:08, 19.36s/it]

Batch 19 processed in 353.67 seconds.
Memory usage: 8800.10 MB


Processing embeddings:  29%|██▉       | 20/69 [06:16<15:47, 19.35s/it]

Batch 20 processed in 372.81 seconds.
Memory usage: 8895.18 MB


Processing embeddings:  30%|███       | 21/69 [06:36<15:30, 19.38s/it]

Batch 21 processed in 392.26 seconds.
Memory usage: 8988.34 MB


Processing embeddings:  32%|███▏      | 22/69 [06:55<15:11, 19.40s/it]

Batch 22 processed in 411.73 seconds.
Memory usage: 9091.45 MB


Processing embeddings:  33%|███▎      | 23/69 [07:15<14:51, 19.37s/it]

Batch 23 processed in 430.96 seconds.
Memory usage: 9185.92 MB


Processing embeddings:  35%|███▍      | 24/69 [07:34<14:31, 19.38s/it]

Batch 24 processed in 450.27 seconds.
Memory usage: 9278.75 MB


Processing embeddings:  36%|███▌      | 25/69 [07:54<14:14, 19.42s/it]

Batch 25 processed in 469.70 seconds.
Memory usage: 9374.88 MB


Processing embeddings:  38%|███▊      | 26/69 [08:13<13:57, 19.47s/it]

Batch 26 processed in 489.26 seconds.
Memory usage: 9469.69 MB


Processing embeddings:  39%|███▉      | 27/69 [08:33<13:42, 19.57s/it]

Batch 27 processed in 508.88 seconds.
Memory usage: 9565.07 MB
Batch 28 processed in 528.76 seconds.


Processing embeddings:  41%|████      | 28/69 [08:53<13:25, 19.64s/it]

Memory usage: 9756.01 MB


Processing embeddings:  42%|████▏     | 29/69 [09:13<13:07, 19.69s/it]

Batch 29 processed in 548.39 seconds.
Memory usage: 9851.21 MB


Processing embeddings:  43%|████▎     | 30/69 [09:32<12:49, 19.74s/it]

Batch 30 processed in 568.30 seconds.
Memory usage: 9945.33 MB


Processing embeddings:  45%|████▍     | 31/69 [09:52<12:31, 19.78s/it]

Batch 31 processed in 588.08 seconds.
Memory usage: 10046.01 MB


Processing embeddings:  46%|████▋     | 32/69 [10:12<12:13, 19.83s/it]

Batch 32 processed in 607.99 seconds.
Memory usage: 10148.19 MB


Processing embeddings:  48%|████▊     | 33/69 [10:34<12:15, 20.44s/it]

Batch 33 processed in 629.77 seconds.
Memory usage: 10258.62 MB


Processing embeddings:  49%|████▉     | 34/69 [10:54<11:51, 20.31s/it]

Batch 34 processed in 649.82 seconds.
Memory usage: 10374.04 MB


Processing embeddings:  51%|█████     | 35/69 [11:14<11:28, 20.26s/it]

Batch 35 processed in 669.87 seconds.
Memory usage: 10492.14 MB


Processing embeddings:  52%|█████▏    | 36/69 [11:35<11:09, 20.28s/it]

Batch 36 processed in 690.09 seconds.
Memory usage: 10611.31 MB
Batch 37 processed in 710.28 seconds.


Processing embeddings:  54%|█████▎    | 37/69 [11:55<10:47, 20.24s/it]

Memory usage: 10727.49 MB


Processing embeddings:  55%|█████▌    | 38/69 [12:15<10:27, 20.23s/it]

Batch 38 processed in 730.46 seconds.
Memory usage: 10845.09 MB


Processing embeddings:  57%|█████▋    | 39/69 [12:35<10:08, 20.28s/it]

Batch 39 processed in 750.79 seconds.
Memory usage: 10962.30 MB


Processing embeddings:  58%|█████▊    | 40/69 [12:56<09:49, 20.32s/it]

Batch 40 processed in 771.12 seconds.
Memory usage: 11080.83 MB


Processing embeddings:  59%|█████▉    | 41/69 [13:16<09:30, 20.37s/it]

Batch 41 processed in 791.58 seconds.
Memory usage: 11201.11 MB


Processing embeddings:  61%|██████    | 42/69 [13:37<09:11, 20.43s/it]

Batch 42 processed in 812.08 seconds.
Memory usage: 11319.27 MB


Processing embeddings:  62%|██████▏   | 43/69 [13:57<08:51, 20.45s/it]

Batch 43 processed in 832.59 seconds.
Memory usage: 11435.40 MB


Processing embeddings:  64%|██████▍   | 44/69 [14:18<08:30, 20.42s/it]

Batch 44 processed in 852.98 seconds.
Memory usage: 11553.57 MB


Processing embeddings:  65%|██████▌   | 45/69 [14:38<08:12, 20.51s/it]

Batch 45 processed in 873.57 seconds.
Memory usage: 11679.71 MB


Processing embeddings:  67%|██████▋   | 46/69 [14:59<07:51, 20.50s/it]

Batch 46 processed in 894.10 seconds.
Memory usage: 11795.89 MB


Processing embeddings:  68%|██████▊   | 47/69 [15:19<07:31, 20.52s/it]

Batch 47 processed in 914.63 seconds.
Memory usage: 11913.00 MB


Processing embeddings:  70%|██████▉   | 48/69 [15:40<07:12, 20.57s/it]

Batch 48 processed in 935.24 seconds.
Memory usage: 12030.25 MB


Processing embeddings:  71%|███████   | 49/69 [16:01<06:51, 20.60s/it]

Batch 49 processed in 955.89 seconds.
Memory usage: 12160.45 MB


Processing embeddings:  72%|███████▏  | 50/69 [16:21<06:31, 20.61s/it]

Batch 50 processed in 976.51 seconds.
Memory usage: 12284.60 MB


Processing embeddings:  74%|███████▍  | 51/69 [16:42<06:11, 20.65s/it]

Batch 51 processed in 997.08 seconds.
Memory usage: 12402.73 MB


Processing embeddings:  75%|███████▌  | 52/69 [17:03<05:50, 20.61s/it]

Batch 52 processed in 1017.69 seconds.
Memory usage: 12521.66 MB


Processing embeddings:  77%|███████▋  | 53/69 [17:23<05:30, 20.63s/it]

Batch 53 processed in 1038.27 seconds.
Memory usage: 12638.79 MB


Processing embeddings:  78%|███████▊  | 54/69 [17:44<05:09, 20.66s/it]

Batch 54 processed in 1059.02 seconds.
Memory usage: 12757.93 MB
Batch 55 processed in 1079.77 seconds.


Processing embeddings:  80%|███████▉  | 55/69 [18:05<04:49, 20.71s/it]

Memory usage: 12885.11 MB


Processing embeddings:  81%|████████  | 56/69 [18:26<04:31, 20.91s/it]

Batch 56 processed in 1100.77 seconds.
Memory usage: 13162.33 MB


Processing embeddings:  83%|████████▎ | 57/69 [18:47<04:10, 20.91s/it]

Batch 57 processed in 1122.00 seconds.
Memory usage: 13278.48 MB


Processing embeddings:  84%|████████▍ | 58/69 [19:08<03:50, 20.94s/it]

Batch 58 processed in 1142.88 seconds.
Memory usage: 13395.62 MB


Processing embeddings:  86%|████████▌ | 59/69 [19:29<03:29, 20.96s/it]

Batch 59 processed in 1163.94 seconds.
Memory usage: 13512.74 MB
Batch 60 processed in 1185.05 seconds.


Processing embeddings:  87%|████████▋ | 60/69 [19:50<03:09, 21.02s/it]

Memory usage: 13629.88 MB


Processing embeddings:  88%|████████▊ | 61/69 [20:11<02:48, 21.03s/it]

Batch 61 processed in 1206.10 seconds.
Memory usage: 13749.03 MB


Processing embeddings:  90%|████████▉ | 62/69 [20:33<02:27, 21.08s/it]

Batch 62 processed in 1227.27 seconds.
Memory usage: 13866.18 MB


Processing embeddings:  91%|█████████▏| 63/69 [20:54<02:06, 21.12s/it]

Batch 63 processed in 1248.44 seconds.
Memory usage: 13983.36 MB


Processing embeddings:  93%|█████████▎| 64/69 [21:15<01:45, 21.14s/it]

Batch 64 processed in 1269.57 seconds.
Memory usage: 14100.51 MB


Processing embeddings:  94%|█████████▍| 65/69 [21:40<01:28, 22.24s/it]

Batch 65 processed in 1294.12 seconds.
Memory usage: 14226.70 MB


Processing embeddings:  96%|█████████▌| 66/69 [22:01<01:06, 22.02s/it]

Batch 66 processed in 1315.76 seconds.
Memory usage: 14344.04 MB


Processing embeddings:  97%|█████████▋| 67/69 [22:23<00:43, 21.84s/it]

Batch 67 processed in 1337.13 seconds.
Memory usage: 14463.17 MB


Processing embeddings:  99%|█████████▊| 68/69 [22:44<00:21, 21.72s/it]

Batch 68 processed in 1358.53 seconds.
Memory usage: 14579.33 MB


Processing embeddings: 100%|██████████| 69/69 [23:02<00:00, 20.04s/it]

Batch 69 processed in 1376.32 seconds.
Memory usage: 14635.23 MB





Embeddings created and saved to FAISS index in 1420.77 seconds.


In [9]:
# Load FAISS index from disk
db_folder = "/content/drive/MyDrive/faiss_index"
vectorstore = FAISS.load_local(db_folder, embedding_model, allow_dangerous_deserialization=True)
print("Loaded FAISS index from disk.")

# User query


Loaded FAISS index from disk.


In [12]:
# User query
user_query = input("Enter your question: ")

# Retrieve the most relevant context using Maximum Marginal Relevance
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3, "fetch_k": 4, "lambda_mult": 0.4})

start_time_query = time.time()
results = retriever.get_relevant_documents(user_query)
end_time_query = time.time()

print(f"\nQuery processed in {end_time_query - start_time_query:.4f} seconds.")

# Extract and display context with metadata
if results:
    print("\nRetrieved Results:")
    for idx, result in enumerate(results, start=1):
        print(f"\nResult {idx}:")
        print(f"Context: {result.page_content}")
        print(f"Title: {result.metadata.get('title', 'Unknown Title')}")
        print(f"URL: {result.metadata.get('url', 'No URL')}")
        print(f"Topic: {result.metadata.get('topic', 'Unknown Topic')}")
else:
    print("\nNo relevant context found.")

# Free memory after query processing
del results, retriever
gc.collect()

Enter your question: Yellow fever

Query processed in 0.4555 seconds.

Retrieved Results:

Result 1:
Context: yellow fever is a viral disease of typically short duration in most cases symptoms include fever chills loss of appetite nausea muscle pains particularly in the back and headaches symptoms typically
Title: Yellow fever
URL: https://en.wikipedia.org/wiki/Yellow_fever
Topic: Health

Result 2:
Context: yellow fever is common in tropical and subtropical areas of south america and africa worldwide about 600 million people live in endemic areas the who estimates 200 000 cases of yellow fever worldwide
Title: Yellow fever
URL: https://en.wikipedia.org/wiki/Yellow_fever
Topic: Health

Result 3:
Context: to the yellow fever vaccine is known as yellow fever vaccine associated acute neurotropic disease yel and the canadian medical association published a 2001 cmaj article entitled yellow fever
Title: Yellow fever vaccine
URL: https://en.wikipedia.org/wiki/Yellow_fever_vaccine
Topic: Healt

0