In [1]:
!pip install numpy
!pip install chromadb
!pip install sentence-transformers
!pip install langchain
!pip install openai
!pip install python-dotenv
!pip install tiktoken
!pip install langchain-community
!pip install datasets
!pip install optimum
!pip install bitsandbytes
!pip install bert_score
!pip install transformers
!pip install trl

# Step 1 - Importing The Libraries

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.vectorstores.utils import maximal_marginal_relevance

from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from datasets import load_dataset
import chromadb

from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.chains import RetrievalQA
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.schema import BaseRetriever, Document

from dataclasses import dataclass
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

import os
import numpy as np
import json
import pandas as pd
import time

# Step 2 - Loading Data

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.vectorstores.utils import maximal_marginal_relevance

from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from datasets import load_dataset
import chromadb

from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.chains import RetrievalQA
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.schema import BaseRetriever, Document

from dataclasses import dataclass
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

import os
import numpy as np
import json
import pandas as pd
import time

In [None]:
def load_the_data(file_path):
    """Read a UTF-8 JSON file and return its parsed content.

    Failures are reported on stdout instead of being raised: a missing file,
    malformed JSON, or any other error all result in an empty list.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)

        # Kept inside the try so a payload without a length is reported below too
        print(f"Successfully loaded {len(parsed)} documents")
    except FileNotFoundError:
        print(f"File {file_path} not found!")
    except json.JSONDecodeError as err:
        print(f"JSON decode error: {err}")
    except Exception as err:
        print(f"Unexpected error: {err}")
    else:
        return parsed

    # Every failure path falls through to the same empty result
    return []

def _text_length(value):
    """Return len(value) for sized values and 0 for missing ones (None / NaN)."""
    return len(value) if hasattr(value, "__len__") else 0


def analyze_the_data(data):
    """Print summary statistics for the loaded records and return them as a DataFrame.

    Args:
        data: Sequence of record dicts carrying 'Title' and 'Context' entries.

    Returns:
        A pandas DataFrame with the original columns plus 'title_length' and
        'context_length' (character counts), or None when `data` is empty.

    Raises:
        KeyError: if no record at all provides a 'Title' or a 'Context' key.
    """
    if not data:
        return None

    # Create DataFrame
    df = pd.DataFrame(data)

    print("\nData Analysis:")
    print(f"Total documents: {len(df)}")
    print(f"Columns: {df.columns.tolist()}")

    # Calculate text lengths. A record with a missing or null field shows up as
    # None/NaN in the frame; count it as length 0 instead of failing in len().
    df['title_length'] = df['Title'].map(_text_length)
    df['context_length'] = df['Context'].map(_text_length)

    print("\nText Length Statistics:")
    print(f"Average title length: {df['title_length'].mean():.1f} characters")
    print(f"Average context length: {df['context_length'].mean():.1f} characters")
    print(f"Total context characters: {df['context_length'].sum():,}")

    return df

def display_documents(data, max_display=5):
    """Print a short preview of the first `max_display` records.

    Each record shows its 1-based position and 'Title', the first 100
    characters of its 'Context', and the full 'Context' length.
    """
    print(f"\nDocuments (showing first {max_display}):")
    print("-" * 50)

    preview = data[:max_display]
    for position in range(len(preview)):
        record = preview[position]
        print(f"\n{position + 1}. {record['Title']}")

        body = record['Context']
        print(f"   {body[:100]}...")
        print(f"   Length: {len(body)} characters")
        print("-" * 30)

In [None]:
# Defaults to the Kaggle dataset location; set the LAWS_DATA_PATH environment
# variable to point the notebook at a copy of laws.json stored elsewhere.
file_path = os.environ.get("LAWS_DATA_PATH", "/kaggle/input/laws-data/laws.json")

# Load the data (an empty list comes back if the file cannot be read)
whole_data = load_the_data(file_path)

In [None]:
# Nothing to preview when loading failed or the file was empty
if whole_data:
    display_documents(whole_data)   # first few records
    analyze_the_data(whole_data)    # length statistics

# Step 3 - Splitting Data

In [None]:
# NOTE(review): the train/test split below is disabled, so the following cells work on all of `whole_data`.
# train_data , test_data = train_test_split(whole_data, test_size=0.1, random_state=1)

# Step 4 - Document Creation

In [None]:
from langchain.schema import Document

def Create_documents(whole_data):
    """
    Create documents suitable for embedding models.

    Args:
        whole_data: Iterable of records. A record is a dict with "Title" and
            "Context" entries, or a list whose first element is such a dict.

    Returns:
        List of Document objects. `page_content` is "<title>\\n\\n<context>";
        `metadata` holds the title, the context, a fixed source tag and the
        length of the combined text. Records that are empty lists, are not
        dicts, or lack a non-empty title or context are skipped.
    """
    documents = []

    for item in whole_data:
        if isinstance(item, list):
            # An empty wrapper has nothing to unwrap; item[0] would raise IndexError
            if not item:
                continue
            item = item[0]

        # Anything that is not a dict has no .get() and cannot carry the fields
        if not isinstance(item, dict):
            continue

        title = item.get("Title", "")
        context = item.get("Context", "")

        if title and context:
            # Combine title and context for better embedding quality
            combined_text = f"{title}\n\n{context}"

            documents.append(Document(
                page_content=combined_text,  # This is what gets embedded
                metadata={
                    "title": title,
                    "context": context,
                    "source": "egypt_data",
                    "text_length": len(combined_text)
                }
            ))

    return documents

In [None]:
# Turn the loaded records into Document objects
documents = Create_documents(whole_data=whole_data)

In [None]:
print(f"Documents lenght : {len(documents)}")

# Preview the first three documents: leading 100 characters plus metadata
for i, doc in enumerate(documents[:3]):
    print(
        f"\nDocument {i+1}:",
        f"Title : {doc.page_content[:100]}...",
        f"Meta Data : {doc.metadata}",
        "-" * 50,
        sep="\n",
    )

In [None]:
# Plain strings (title + context) that the embedding model will consume
texts = [doc.page_content for doc in documents]

print(f"Created {len(documents)} documents for embedding")

# Show the fifth text when there is one, otherwise the last available text;
# indexing texts[4] unconditionally fails on corpora with fewer than five documents.
if texts:
    sample_index = min(4, len(texts) - 1)
    print(f"Sample text for embedding: {texts[sample_index]}...")

In [None]:
# `whole_documents` is never defined in this notebook; show a slice of the
# `documents` list built earlier rather than dumping every document.
documents[:5]

# Step 5 - Embedding Model

In [None]:
model_id = "Alibaba-NLP/gte-multilingual-base"
# presumably the embedding width of `model_id`; not referenced by the visible cells
dim = 768

# Use the first GPU when one is available, otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# gte-multilingual-base ships its own modelling code on the Hugging Face Hub,
# so both loaders have to opt in with trust_remote_code=True.
# NOTE: this executes code downloaded from the model repository.
model = SentenceTransformer(model_id, device=device, trust_remote_code=True)

# LangChain wrapper around the same model; model_kwargs are forwarded to the
# underlying SentenceTransformer constructor.
embeddings = SentenceTransformerEmbeddings(
    model_name=model_id,
    model_kwargs={'device': device, 'trust_remote_code': True}
)

In [None]:
from sentence_transformers.util import batch_to_device
  
def batch_encode(texts, model, batch_size=16):
    """Encode `texts` in consecutive slices of `batch_size` and return one flat list.

    `model.encode` is called once per slice with the progress bar disabled;
    the items of every returned batch are collected in input order.
    """
    return [
        vector
        for start in range(0, len(texts), batch_size)
        for vector in model.encode(texts[start:start + batch_size], show_progress_bar=False)
    ]
 
# `doc_texts` is not defined anywhere above; encode the `texts` list
# (one string per document) built right after document creation.
encoded_docs = batch_encode(texts, model, batch_size=16)

In [None]:
Data_docs = list(documents)

Data_texts = []

for doc in Data_docs:
    Title = doc.metadata.get("title")
    Context = doc.metadata.get("context")
    # The loop used to read both fields and then drop them, leaving Data_texts
    # empty. NOTE(review): joining them the way Create_documents does is an
    # assumption about the intended embedding text — confirm.
    Data_texts.append(f"{Title}\n\n{Context}")

encoded_data = batch_encode(Data_texts, model, batch_size=16)

# Chroma expects plain Python lists rather than numpy arrays
encoded_data = [e.tolist() for e in encoded_data]

# Step 6 - Create The Vector Database - ChromaDB

In [None]:
# On-disk Chroma client; the same directory is later passed to the LangChain
# `Chroma` wrapper as `persist_directory`.
chroma_client = chromadb.PersistentClient(path="./chromadb-ar-docs")

In [None]:
# create_collection raises when "ar_docs" already exists in the persistent
# store, i.e. on every re-run of the notebook; get_or_create is idempotent.
collection = chroma_client.get_or_create_collection(
    name="ar_docs",
    metadata={"hnsw:space": "cosine"}
)

In [None]:
# The names originally passed here (doc_text, embedding_questions, QA_metadata,
# ids_to_add) are not defined by any earlier cell, so the payload is built
# directly from `documents` to keep texts, vectors, metadata and ids aligned.
doc_text = [doc.page_content for doc in documents]
doc_embeddings = [vector.tolist() for vector in batch_encode(doc_text, model, batch_size=16)]
ids_to_add = [f"doc-{index}" for index in range(len(documents))]
# Store the id in the metadata as well so a hit can be traced back to its record
doc_metadatas = [
    {**doc.metadata, "id": doc_id}
    for doc, doc_id in zip(documents, ids_to_add)
]

# upsert instead of add: re-running the cell refreshes the same ids
collection.upsert(
    documents=doc_text,
    embeddings=doc_embeddings,
    metadatas=doc_metadatas,
    ids=ids_to_add
)

In [None]:
# Re-fetch the "ar_docs" collection handle (created here if it does not exist yet)
collection = chroma_client.get_or_create_collection(name="ar_docs")

In [None]:
# LangChain view of the persisted "ar_docs" collection, using the wrapped
# embedding model for any text queries
vectordb = Chroma(
    persist_directory="./chromadb-ar-docs",
    collection_name="ar_docs",
    embedding_function=embeddings,
)

In [None]:
question = "ما هي مظاهر الحياة السياسية بالدولة المصرية ؟"

# encode() takes a batch: wrap the question, then unwrap its single vector
question_embed = model.encode([question])[0].tolist()

# Three nearest entries for the question vector
results = collection.query(query_embeddings=[question_embed], n_results=3)

print(results)

In [None]:
# Chroma returns one inner list per query, so results["documents"] is [[]]
# (truthy) even when nothing matched; inspect the inner list of the first query.
top_hits = results["metadatas"][0] if results["metadatas"] else []

if top_hits:
    # A stored entry may have no metadata at all
    top_match_metadata = top_hits[0] or {}
    # Create_documents stores lower-case keys while this cell originally read
    # capitalised ones; accept either spelling.
    Matched_Title = top_match_metadata.get("Title", top_match_metadata.get("title", ""))
    Context = top_match_metadata.get("Context", top_match_metadata.get("context"))

    print("Nearest Matched Titles: ", Matched_Title)
    print("Context Of The Nearest Titles:", Context)

else:
    print("No Find Any !!")

# Step 7 - Evaluating the Embedding Model

### Similarity search

In [None]:
# Collect the top-3 vector-store hits for every pre-computed question embedding.
# NOTE(review): `encoded_questions` is not defined in any cell visible above —
# confirm where the evaluation questions are loaded and encoded.
chroma_results = []
qa_embeddings = []  

for embedding in encoded_questions:
    # NOTE: this rebinds the module-level `results` used by the query cells above
    results = vectordb.similarity_search_by_vector(
        embedding=embedding, 
        k=3
    )
    chroma_results.append(results)
    # kept alongside the hits, in the same order
    qa_embeddings.append(embedding)

### Accuracy

In [None]:
# Tally how the top hit of every evaluation question compares with its ground truth:
#   valid   - same id as the expected entry
#   similar - different id but same source
#   invalid - neither matches, or the query returned no hit
# NOTE(review): `qa_texts` and `QA_metadata` are not defined in any cell visible
# above — confirm where the evaluation set is loaded.
chroma_insights = {
    "valid": 0,
    "similar": 0,
    "invalid": 0
}

for i in range(len(qa_texts)):
    true_metadata = QA_metadata[i]

    hits = chroma_results[i]
    if not hits:
        # No hit to compare against; count it instead of failing on hits[0]
        chroma_insights["invalid"] += 1
        continue

    pred_metadata = hits[0].metadata

    true_id = str(true_metadata.get("id", ""))
    pred_id = str(pred_metadata.get("id", ""))

    true_source = true_metadata.get("source", "")
    pred_source = pred_metadata.get("source", "")

    # Classify the result
    if true_id == pred_id:
        chroma_insights["valid"] += 1
    elif true_source == pred_source:
        chroma_insights["similar"] += 1
    else:
        chroma_insights["invalid"] += 1

# Compute the ratios (fractions of the total; 0.0 when there are no questions,
# which avoids a ZeroDivisionError)
total = len(qa_texts)
for label in ("valid", "similar", "invalid"):
    chroma_insights[f"{label}_percentage"] = chroma_insights[label] / total if total else 0.0

# Print the results
print("Model ID:", model_id)
print("----")
print("Valid:", chroma_insights["valid"])
print("Valid%:", chroma_insights["valid_percentage"])
print("----")
print("Similar:", chroma_insights["similar"])
print("Similar%:", chroma_insights["similar_percentage"])
print("----")
print("Invalid:", chroma_insights["invalid"])
print("Invalid%:", chroma_insights["invalid_percentage"])
print("----")
print("----")