In [None]:
import os
%pwd

In [2]:
os.chdir("../")

In [None]:
%pwd

In [4]:
from dataclasses import dataclass
from pathlib import Path

In [5]:
@dataclass(frozen=True)
class DataEmbeddingConfig:
    root_dir: Path
    data_path: Path
    model_name: str
    text_column: str

@dataclass(frozen=True)
class VectorStorageConfig:
    data_path: Path
    embedding_dim: int

In [None]:
from dialogue_rag_chatbot.constants import *
from dialogue_rag_chatbot.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
    
    def get_data_embedding_config(self) -> DataEmbeddingConfig:
        config = self.config.data_embedding

        create_directories([config.root_dir])
        data_embedding_config = DataEmbeddingConfig(
            root_dir = config.root_dir,
            data_path = config.data_path,
            model_name = config.model_name,
            text_column= config.text_column
        )

        return data_embedding_config

    def get_vector_storage_config(self)-> VectorStorageConfig:

        config = self.config.vector_storage
        vector_storage_config = VectorStorageConfig(
            data_path=config.data_path,
            embedding_dim=config.embedding_dim
        )

        return vector_storage_config
    


In [None]:
from dialogue_rag_chatbot.logging import logger
from sentence_transformers import SentenceTransformer
from datasets import load_from_disk
from typing import List, Dict, Any, Tuple
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
class EmbeddingModel:
    """Granite embedding model using SentenceTransformers"""
    
    def __init__(self, config: DataEmbeddingConfig):
        self.model_name = config.model_name
        try:
            self.model = SentenceTransformer(self.model_name)
            logger.info(f"Granite embedding model '{self.model_name}' loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load model {self.model_name}: {str(e)}")
            raise
    
    def encode(self, texts):
        """Encode texts into embeddings"""
        try:
            # Granite embedding models return 768-dimensional vectors
            embeddings = self.model.encode(texts, convert_to_numpy=True)
            logger.info(f"Encoded {len(texts)} texts into embeddings of shape {embeddings.shape}")
            return embeddings
        except Exception as e:
            logger.error(f"Error encoding texts: {str(e)}")
            raise


class VectorStorage:
    """Store and retrieve document embeddings"""
    
    def __init__(self, config: VectorStorageConfig):
        self.config = config
        self.embedding_dim = self.config.embedding_dim
        self.embeddings = []
        self.documents = []
        logger.info(f"VectorStore initialized with embedding_dim={self.embedding_dim}")
    
    def add_documents(self, documents: List[Dict[str, Any]], embeddings: np.ndarray):
        """Add documents and their embeddings to the store"""
        if embeddings.shape[1] != self.embedding_dim:
            raise ValueError(f"Embedding dimension mismatch: expected {self.embedding_dim}, got {embeddings.shape[1]}")
        
        self.embeddings.extend(embeddings.tolist())
        self.documents.extend(documents)
        logger.info(f"Added {len(documents)} documents to vector store")
    
    def similarity_search(self, query_embedding: np.ndarray, top_k: int = 5) -> List[Tuple[Dict[str, Any], float]]:
        """Perform similarity search"""
        if not self.embeddings:
            return []
        
        query_embedding = query_embedding.flatten()

        embeddings_matrix = np.array(self.embeddings)
        similarities = np.dot(embeddings_matrix, query_embedding.T) / (
            np.linalg.norm(embeddings_matrix, axis=1) * np.linalg.norm(query_embedding) + 1e-10
        )
        
        # top_indices = np.argsort(similarities)[-top_k:][::-1]
        
        top_indices = np.argpartition(-similarities, top_k)[:top_k]
        top_indices = top_indices[np.argsort(-similarities[top_indices])]

        results = [(self.documents[idx], float(similarities[idx])) for idx in top_indices]
        logger.info(f"Retrieved {len(results)} documents from vector store")
        return results

In [10]:
try:
    config = ConfigurationManager()
    vector_storage_config = config.get_vector_storage_config()
    vector_store = VectorStorage(config = vector_storage_config)

    dataset_with_embeddings = load_from_disk(vector_storage_config.data_path)
    
    documents = [{"id": item["id"], "dialogue": item["dialogue"], "summary": item["summary"]} for item in dataset_with_embeddings]
    embeddings = np.array(dataset_with_embeddings["embedding"])
    vector_store.add_documents(documents, embeddings)



except Exception as e:
    raise e

[2025-09-27 20:16:22,915: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-09-27 20:16:22,918: INFO: common: yaml file: params.yaml loaded successfully]
[2025-09-27 20:16:22,920: INFO: common: created directory at: artifacts]
[2025-09-27 20:16:22,921: INFO: 1311732731: VectorStore initialized with embedding_dim=768]
[2025-09-27 20:16:41,061: INFO: 1311732731: Added 14732 documents to vector store]


In [11]:

try:
    # query_text = "Does anyone who feel tired during works?"
    query_text = "我想知道有誰在討論旅遊很開心，把他們名字都列出來"
    # 產生查詢語句的 embedding
    embedding_model_config = config.get_data_embedding_config()
    embedding_model = EmbeddingModel(config= embedding_model_config)
    query_embedding = embedding_model.encode([query_text])[0]

    search_results = vector_store.similarity_search(query_embedding, top_k=10)
    
    print(f"\nQuery: '{query_text}'")
    print("Top 10 similar dialogues:")
    for doc, score in search_results:
        print(doc)
        print(f"  Score: {score:.4f}")
        print(f"  Dialogue: {doc['dialogue'][:150]}...") # 只顯示部分對話內容
        print("-" * 20)

except Exception as e:
    raise e

[2025-09-27 20:16:41,084: INFO: common: created directory at: artifacts/data_embedding]
[2025-09-27 20:16:41,090: INFO: SentenceTransformer: Use pytorch device_name: cpu]
[2025-09-27 20:16:41,092: INFO: SentenceTransformer: Load pretrained SentenceTransformer: ibm-granite/granite-embedding-278m-multilingual]
[2025-09-27 20:16:51,560: INFO: 1311732731: Granite embedding model 'ibm-granite/granite-embedding-278m-multilingual' loaded successfully]


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.65it/s]

[2025-09-27 20:16:51,857: INFO: 1311732731: Encoded 1 texts into embeddings of shape (1, 768)]





[2025-09-27 20:16:52,463: INFO: 1311732731: Retrieved 10 documents from vector store]

Query: '我想知道有誰在討論旅遊很開心，把他們名字都列出來'
Top 10 similar dialogues:
{'id': '13730805', 'dialogue': "Adam: any idea for holiday this year?\r\nTheo: dunno. i guess ann and the boys would like to fly swh\r\nAdam: in summer?\r\nTheo: dunno. it might be sort of expensive in peak season. \r\nAdam: you mean it's better to go before or after?\r\nTheo: dfntly. we were actually considering october. could be 30% off \r\nAdam: sounds attractive enough. but we can't both go at teh same time right\r\nTheo: yeah, the boss wouldn't be very happy\r\nAdam: perhaps i'll look for sth in June then?\r\nTheo: thats' a thought. any ideas where to?\r\nAdam: in my case it's more of a question who with haha\r\nTheo: you really argued with Layla?\r\nAdam: she's still pretty angry about that party you know\r\nTheo: yeah but June is still a couple of months to go\r\nAdam: we'll need to talk with her about everythihng\r\nTheo: you better 