In [2]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# pymongo and python-dotenv are pinned to the versions resolved in the recorded
# output of this cell; the remaining versions are not visible there.
%pip install langchain sentence-transformers torch transformers pymongo==4.13.0 python-dotenv==1.1.0

Collecting pymongo
  Downloading pymongo-4.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia

In [7]:
# Pinned to the version this cell's recorded output shows was installed, so a
# fresh runtime reproduces the same HuggingFaceEmbeddings behaviour.
%pip install langchain-huggingface==0.2.0

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.2.0-py3-none-any.whl.metadata (941 bytes)
Downloading langchain_huggingface-0.2.0-py3-none-any.whl (27 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.2.0


In [40]:
import json
from pymongo import MongoClient
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain.schema import Document
from google.colab import userdata
import torch
import logging
from typing import List, Dict, Any

# Set up logging.
# basicConfig does nothing when the root logger already has a handler, and the
# INFO progress messages below would then be dropped. force=True (Python 3.8+)
# replaces any existing root handlers so the level requested here takes effect.
# NOTE(review): assumes the hosted notebook runtime pre-installs a root handler
# — confirm on the target runtime.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

def connect_to_mongodb() -> MongoClient:
    """Connect to MongoDB using the URI stored in Colab secrets.

    The connection string is read from the Colab secret named ``mongodb`` and
    the connection is verified with a ``ping`` command before the client is
    returned.

    Returns:
        The connected ``MongoClient``. The caller is responsible for closing it.

    Raises:
        ValueError: If the ``mongodb`` secret is empty or missing a value.
        Exception: Any other error raised while reading the secret, creating
            the client or pinging the server is logged and re-raised unchanged.
    """
    client = None
    try:
        mongo_uri = userdata.get('mongodb')
        if not mongo_uri:
            # Name the secret that is actually looked up so the message is actionable.
            raise ValueError("Secret 'mongodb' (MongoDB URI) not found in Colab secrets.")

        client = MongoClient(mongo_uri)
        # Test the connection with a round trip so a bad URI fails here.
        client.admin.command('ping')
        logger.info("Successfully connected to MongoDB")
        return client
    except Exception as e:
        logger.error(f"Error connecting to MongoDB: {str(e)}")
        if client is not None:
            # The client was created but is unusable; release it instead of leaking it.
            client.close()
        raise

def ingest_raw_chunks(
    client: MongoClient,
    file_path: str,
    db_name: str = 'HCMIU_Data',
    collection_name: str = 'Data',
    batch_size: int = 100,
):
    """Ingest raw chunks into MongoDB without embeddings.

    The JSON file must hold an object with a ``chunks`` list. From every chunk
    the fields ``title``, ``content``, ``document_id``, ``document_type``,
    ``chunk_id``, ``semantic_id`` and ``type`` are copied (any other keys are
    ignored) and ``has_embedding`` is set to ``False`` so a later step can find
    the documents that still need a vector. All documents are built before the
    first insert, so a malformed chunk aborts the run before anything is written.

    Args:
        client: Connected MongoDB client.
        file_path: Path to the UTF-8 encoded JSON file.
        db_name: Target database name.
        collection_name: Target collection name.
        batch_size: Maximum number of documents per ``insert_many`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If ``chunks`` or one of the copied fields is missing.
        Exception: Any file, JSON or database error is logged and re-raised.
    """
    chunk_fields = (
        'title',
        'content',
        'document_id',
        'document_type',
        'chunk_id',
        'semantic_id',
        'type',
    )
    try:
        if batch_size < 1:
            # A negative step would yield an empty range and silently insert nothing.
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        # Get database and collection
        collection = client[db_name][collection_name]

        # Load and process the JSON file
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Prepare chunks for insertion
        chunks_to_insert = []
        for chunk in data['chunks']:
            chunk_doc = {field: chunk[field] for field in chunk_fields}
            chunk_doc['has_embedding'] = False  # Flag to track embedding status
            chunks_to_insert.append(chunk_doc)

        # Insert chunks in batches
        starts = range(0, len(chunks_to_insert), batch_size)
        for batch_number, start in enumerate(starts, start=1):
            batch = chunks_to_insert[start:start + batch_size]
            collection.insert_many(batch)
            logger.info(f"Inserted {len(batch)} chunks (batch {batch_number})")

        logger.info(f"Successfully ingested {len(chunks_to_insert)} raw chunks")

    except Exception as e:
        logger.error(f"Error ingesting raw chunks: {str(e)}")
        raise

def add_embeddings_to_chunks(
    client: MongoClient,
    db_name: str = 'HCMIU_Data',
    collection_name: str = 'Data',
    batch_size: int = 100,
):
    """Add embeddings to existing chunks in MongoDB.

    Every document whose ``has_embedding`` flag is ``False`` gets an
    ``embedding`` computed from ``"<title>\\n<content>"`` and has the flag set
    to ``True``. Documents are embedded ``batch_size`` at a time with
    ``embed_documents`` and then updated one by one, so an interrupted run can
    simply be started again and continues with the still-unflagged documents.

    Args:
        client: Connected MongoDB client.
        db_name: Database holding the chunks.
        collection_name: Collection holding the chunks.
        batch_size: Number of chunks embedded per model call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Any model or database error is logged and re-raised.
    """
    try:
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        # Get database and collection
        collection = client[db_name][collection_name]

        # Snapshot the pending chunks first (only the fields needed here), so no
        # server-side cursor is held open while the model is busy and the
        # updates below cannot interfere with the query being iterated.
        pending = list(collection.find(
            {'has_embedding': False},
            {'title': 1, 'content': 1},
        ))

        if not pending:
            # Nothing to do: skip the (expensive) model initialisation entirely.
            logger.info("Successfully added embeddings to all chunks")
            return

        # Initialize embedding model
        embeddings = HuggingFaceEmbeddings(
            model_name="keepitreal/vietnamese-sbert",
            model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )

        # Process chunks in batches
        for start in range(0, len(pending), batch_size):
            batch = pending[start:start + batch_size]

            # Create texts for embedding and encode the whole batch in one call
            texts = [f"{chunk['title']}\n{chunk['content']}" for chunk in batch]
            vectors = embeddings.embed_documents(texts)

            # Update each document with its embedding
            for chunk, vector in zip(batch, vectors):
                collection.update_one(
                    {'_id': chunk['_id']},
                    {
                        '$set': {
                            'embedding': vector,
                            'has_embedding': True
                        }
                    }
                )

            if len(batch) == batch_size:
                logger.info(f"Processed {len(batch)} chunks with embeddings")
            else:
                logger.info(f"Processed {len(batch)} remaining chunks with embeddings")

        logger.info("Successfully added embeddings to all chunks")

    except Exception as e:
        logger.error(f"Error adding embeddings: {str(e)}")
        raise

def main(file_path: str = '/content/combined_chunks.json'):
    """Run the full pipeline: connect, ingest raw chunks, then add embeddings.

    Args:
        file_path: JSON file with the chunks to ingest.

    Raises:
        Exception: Any failure in a step is logged and re-raised. The MongoDB
            client is closed in every case in which it was opened.
    """
    # Bound before the try so the finally clause is safe even when the
    # connection attempt itself fails (otherwise the real error would be
    # masked by an UnboundLocalError).
    client = None
    try:
        # Connect to MongoDB
        client = connect_to_mongodb()

        # Step 1: Ingest raw chunks
        logger.info("Starting raw chunk ingestion...")
        ingest_raw_chunks(client, file_path)

        # Step 2: Add embeddings
        logger.info("Starting embedding generation...")
        add_embeddings_to_chunks(client)

        logger.info("Process completed successfully!")

    except Exception as e:
        logger.error(f"Error in main process: {str(e)}")
        raise
    finally:
        if client is not None:
            client.close()

# Entry point: run the ingest + embedding pipeline when this code is executed
# as the main module.
if __name__ == "__main__":
    main()