In [None]:
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
import numpy as np, json, os

# 1Ô∏è‚É£ Load your model
model = SentenceTransformer("intfloat/multilingual-e5-large")

# 2Ô∏è‚É£ Load article-based dataset
with open(r"C:\Users\PC\OneDrive\Desktop\chatbot\data\processed\penal_code_articles.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

In [2]:

# 3Ô∏è‚É£ Encode all articles into vectors
embeddings = model.encode(articles, show_progress_bar=True)
embeddings = np.array(embeddings)
print("‚úÖ Embeddings shape:", embeddings.shape)

# 4Ô∏è‚É£ Save them for future use
os.makedirs(r"C:\Users\PC\OneDrive\Desktop\chatbot\data\embeddings\articles", exist_ok=True)
np.save(r"C:\Users\PC\OneDrive\Desktop\chatbot\data\embeddings\articles\penal_code_articles_e5.npy", embeddings)

print("‚úÖ Saved article embeddings successfully!")

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

‚úÖ Embeddings shape: (691, 1024)
‚úÖ Saved article embeddings successfully!


In [3]:
import re
# build new chroma database
# 1Ô∏è‚É£ Load the embeddings and articles
embeddings = np.load(r"C:\Users\PC\OneDrive\Desktop\chatbot\data\embeddings\articles\penal_code_articles_e5.npy")
with open(r"C:\Users\PC\OneDrive\Desktop\chatbot\data\processed\penal_code_articles.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

# 2Ô∏è‚É£ Extract article numbers as metadata
article_re = re.compile(r"ÿßŸÑŸÖÿßÿØÿ©\s*(\d+)")
metadatas = []
for art in articles:
    match = article_re.search(art)
    number = match.group(1) if match else None
    metadatas.append({"article": number})

print("‚úÖ Example metadata:", metadatas[:5])

‚úÖ Example metadata: [{'article': '1'}, {'article': '2'}, {'article': '3'}, {'article': '4'}, {'article': '5'}]


In [4]:
# 3Ô∏è‚É£ Initialize Chroma client & embedding function
embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="intfloat/multilingual-e5-large"
)

chroma_client = chromadb.PersistentClient(
    path=r"C:\Users\PC\OneDrive\Desktop\chatbot\data\chroma_db_articles"
)

# Delete old collection if re-running
try:
    chroma_client.delete_collection("penal_code_articles")
except Exception:
    pass

# Create new collection
collection = chroma_client.create_collection(
    name="penal_code_articles",
    embedding_function=embed_func
)

# 4Ô∏è‚É£ Add all articles + embeddings + metadata
ids = [f"article_{i}" for i in range(len(articles))]
collection.add(
    ids=ids,
    documents=articles,
    embeddings=embeddings.tolist(),
    metadatas=metadatas
)

print(f"‚úÖ Added {len(articles)} articles to Chroma (with metadata).")

‚úÖ Added 691 articles to Chroma (with metadata).


In [5]:
query = "ŸÖÿß ŸáŸä ÿßŸÑÿπŸÇŸàÿ®ÿ© ÿπŸÑŸâ ÿßŸÑÿ≥ÿ±ŸÇÿ©ÿü"
results = collection.query(
    query_texts=[query],
    n_results=3,
    include=["documents", "metadatas", "distances"]
)

for i, (doc, meta, dist) in enumerate(zip(results["documents"][0], results["metadatas"][0], results["distances"][0]), 1):
    sim = 1 - dist
    print(f"\nüîπ ÿßŸÑŸÜÿ™Ÿäÿ¨ÿ© {i} | ÿßŸÑŸÖÿßÿØÿ© {meta.get('article')} | ÿ¥ÿ®Ÿá ‚âà {sim:.3f}")
    print(doc[:600])



üîπ ÿßŸÑŸÜÿ™Ÿäÿ¨ÿ© 1 | ÿßŸÑŸÖÿßÿØÿ© 636 | ÿ¥ÿ®Ÿá ‚âà 0.847
ÿßŸÑŸÖÿßÿØÿ© 636- ŸÖÿπÿØŸÑÿ© ŸàŸÅŸÇÿß ŸÑŸÑŸÖÿ±ÿ≥ŸàŸÖ ÿßŸÑÿßÿ¥ÿ™ÿ±ÿßÿπŸä112 ÿ™ÿßÿ±ŸäÿÆ16/9/ 1983 ŸàÿßŸÑŸÇÿßŸÜŸàŸÜ239 ÿ™ÿßÿ±ŸäÿÆ
27/5/ 1993
- ÿßŸÑÿ≥ÿ±ŸÇÿ©ÿå ÿßŸÑÿ™Ÿä ŸÑŸÖ ÿ™ÿ≠ÿØÿØ ŸÑŸáÿß ÿπŸÇŸàÿ®ÿ© ÿÆÿßÿµÿ© ÿ®ŸÖŸàÿ¨ÿ® ÿ£ÿ≠ÿØ ŸÜÿµŸàÿµ Ÿáÿ∞ÿß ÿßŸÑŸÇÿßŸÜŸàŸÜÿå ŸäÿπÿßŸÇÿ® ÿπŸÑŸäŸáÿß ÿ®ÿßŸÑÿ≠ÿ®ÿ≥ ŸÖŸÜ
ÿ¥Ÿáÿ±ŸäŸÜ ÿ•ŸÑŸâ ÿ´ŸÑÿßÿ´ ÿ≥ŸÜŸàÿßÿ™ Ÿàÿ®ÿßŸÑÿ∫ÿ±ÿßŸÖÿ© ŸÖŸÜ ŸÖÿ¶ÿ© ÿ£ŸÑŸÅ ÿ•ŸÑŸâ ÿ£ÿ±ÿ®ÿπŸÖÿßÿ¶ÿ© ÿ£ŸÑŸÅ ŸÑŸäÿ±ÿ©.
Ÿàÿ™ÿ¥ÿØÿØ Ÿáÿ∞Ÿá ÿßŸÑÿπŸÇŸàÿ®ÿ© ŸàŸÅŸÇÿß ŸÑŸÑŸÖÿßÿØÿ© 257 ÿ•ÿ∞ÿß ÿßÿ±ÿ™ŸÉÿ®ÿ™ ÿßŸÑÿ≥ÿ±ŸÇÿ© ŸÅŸä ÿ•ÿ≠ÿØŸâ ÿßŸÑÿ≠ÿßŸÑÿßÿ™ ÿßŸÑÿ™ÿßŸÑŸäÿ©:
1- ŸÅŸä ÿßŸÑŸÖÿπÿßÿ®ÿØ ŸàÿßŸÑÿ£ÿ®ŸÜŸäÿ© ÿßŸÑŸÖÿ£ŸáŸàŸÑÿ©.

2- ÿÆÿ±Ÿâ ÿ£Ÿà ŸÅŸä ÿßŸÑŸÇÿ∑ÿßÿ±ÿßÿ™ ÿ£Ÿà ŸÅŸä ÿßŸÑÿ≥ŸÅŸÜ ÿ£Ÿà ÿ®ŸÜÿ¥ŸÑ ÿßŸÑŸÖÿßÿ±ÿ© ÿ£ŸÉÿßŸÜ ÿ∞ŸÑŸÉ ŸÅŸä ÿßŸÑÿ∑ÿ±ŸÇ ÿ£Ÿà ŸÅŸä ÿßŸÑÿ£ŸÖÿßŸÉŸÜ ÿßŸÑÿπÿßŸÖÿ© ÿßŸÑÿ£
ÿßŸÑÿ∑ÿßÿ¶ÿ±ÿßÿ™ ÿ£Ÿà ÿ∫Ÿäÿ±Ÿáÿß ŸÖŸÜ Ÿàÿ≥ÿßÿ¶ŸÑ ÿßŸÑŸÜŸÇŸÑ.
3- ÿ®ŸÅÿπŸÑ ŸÖŸàÿ∏ŸÅ ÿ£ŸÜŸäÿ∑ ÿ®Ÿá ÿ≠ŸÅÿ∏ ÿßŸÑÿ£ŸÖŸÜ ÿ£Ÿà ÿßŸÑÿ≠ÿ±ÿßÿ≥ÿ© ÿ≠ÿ™Ÿâ Ÿàÿ•ŸÜ ÿßÿ±ÿ™ŸÉÿ®ÿ™ ÿßŸÑÿ≥ÿ