In [8]:
pip install trectools

Defaulting to user installation because normal site-packages is not writeable
Collecting trectools
  Downloading trectools-0.0.50.tar.gz (29 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting sarge>=0.1.1 (from trectools)
  Downloading sarge-0.1.7.post1-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting bs4>=0.0.0.1 (from trectools)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading sarge-0.1.7.post1-py2.py3-none-any.whl (18 kB)
Building wheels for collected packages: trectools
  Building wheel for trectools (setup.py): started
  Building wheel for trectools (setup.py): finished with status 'done'
  Created wheel for trectools: filename=trectools-0.0.50-py3-none-any.whl size=28590 sha256=06f27cbf2cc8fb68ce55966f4e5f7656226b853aaca450c1ae18e909e970a3ec
  Stored in directory: c:\users\mohammad mihdi\appdata\local\pip\cache\wheels\67\23\68\7e98dcd

In [2]:
!pip install symspellpy

Defaulting to user installation because normal site-packages is not writeable
Collecting symspellpy
  Downloading symspellpy-6.9.0-py3-none-any.whl.metadata (3.9 kB)
Collecting editdistpy>=0.1.3 (from symspellpy)
  Downloading editdistpy-0.1.6-cp312-cp312-win_amd64.whl.metadata (6.8 kB)
Downloading symspellpy-6.9.0-py3-none-any.whl (2.6 MB)
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.6 MB ? eta -:--:--
   ---- ----------------------------

In [3]:
from symspellpy.symspellpy import SymSpell

In [None]:
# =============================================
# 📦 المكتبات المطلوبة
# =============================================
import ir_datasets
import numpy as np
import os
import joblib
import json
import re
import html
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from trectools import TrecQrel, TrecRun, TrecEval 
from tabulate import tabulate
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from joblib import Memory
import sys
sys.path.append("..")

# =============================================
# ⚙️ تهيئة الكاش
# =============================================
# joblib Memory object whose cache location is ./cache (relative to the working directory).
memory = Memory(location='./cache', verbose=0)

# =============================================
# ⚙️ Load ir_datasets data (MSMARCO)
# =============================================
dataset = ir_datasets.load("msmarco-passage/train")
# Query texts are read directly from a TSV under the user's home directory
# instead of through the dataset object.
# NOTE(review): presumably this file only exists after ir_datasets has
# downloaded the collection — confirm.
queries_path = os.path.expanduser("~/.ir_datasets/msmarco-passage/train/queries.tsv")

# query_id -> query text. Lines with fewer than two tab-separated fields are skipped.
queries = {}
# errors='ignore' silently drops bytes that are not valid UTF-8.
with open(queries_path, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        parts = line.strip().split("\t")
        if len(parts) >= 2:
            queries[parts[0]] = parts[1]

# query_id -> set of doc_ids; only judgments with relevance > 0 are kept.
qrels = {}
for qrel in dataset.qrels_iter():
    if qrel.relevance > 0:
        qrels.setdefault(qrel.query_id, set()).add(qrel.doc_id)

# =============================================
# 🧼 Text-cleaning function
# =============================================
# Module-level resources used by advanced_preprocess.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

def advanced_preprocess(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: HTML-entity unescaping, replacement of non-printable
    characters by spaces, lower-casing, removal of URLs, e-mail-like tokens
    and ``<...>`` tags, replacement of everything outside ``a-z`` and
    whitespace by a space, collapsing of runs of three or more identical
    characters to a single one, whitespace normalisation, and finally
    stop-word / short-token filtering plus stemming.

    Args:
        text: Input string.

    Returns:
        The surviving tokens (longer than two characters and not in
        ``stop_words``), each passed through ``stemmer.stem``, joined by
        single spaces. May be the empty string.
    """
    text = html.unescape(text)
    # "\n", "\t" and "\r" are not printable, so merely dropping non-printable
    # characters would glue the words on either side together
    # ("new\nyork" -> "newyork"). Substitute a space to keep token boundaries.
    # NOTE(review): if the index was built with a cleaner that drops these
    # characters instead, documents containing them were tokenised differently
    # — confirm whether a re-index is wanted.
    text = ''.join(c if c.isprintable() else ' ' for c in text)
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Runs of 3+ identical characters collapse to one occurrence.
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    words = [stemmer.stem(w) for w in text.split() if w not in stop_words and len(w) > 2]
    return ' '.join(words)

# =============================================
# تحميل ملفات التمثيلات
# =============================================
# NOTE: joblib.load unpickles arbitrary objects — only load index artifacts
# that come from a trusted source.
tfidf_doc_ids = joblib.load(r"../data/msmarco_train/index/TFIDF/doc_ids_msmarco_train.joblib")
tfidf_matrix = joblib.load(r"../data/msmarco_train/index/TFIDF/tfidf_matrix_msmarco_train.joblib")
tfidf_vectorizer = joblib.load(r"../data/msmarco_train/index/TFIDF/tfidf_vectorizer_msmarco_train.joblib")
inverted_index_data = joblib.load(r"../data/msmarco_train/index/TFIDF/tfidf_inverted_index.joblib")

bert_embeddings = np.load(r"../data/msmarco_train/index/bert/bert_embeddings.npy")
bert_doc_ids = joblib.load(r"../data/msmarco_train/index/bert/doc_ids.joblib")
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# doc_id (as str) -> raw text, read from a file with one JSON object per line.
docs_dict = {}
skipped_doc_lines = 0
with open(r"../data/msmarco_train/raw/raw_msmarco_train.json", "r", encoding="utf-8") as f:
    for line in f:
        try:
            j = json.loads(line)
            docs_dict[str(j["id"])] = j["text"]
        except (json.JSONDecodeError, KeyError, TypeError):
            # Best-effort load: skip lines that are not valid JSON or lack the
            # "id"/"text" fields, but no longer swallow unrelated errors
            # (KeyboardInterrupt, MemoryError, ...) as the bare except did.
            skipped_doc_lines += 1
            continue
if skipped_doc_lines:
    print(f"⚠️ Skipped {skipped_doc_lines} malformed line(s) while loading the raw corpus")

# =============================================
# ✳️ فلترة qrels و queries
# =============================================
# Keep only judgments whose document exists in the local corpus and whose
# query text was actually loaded (a judged query missing from the TSV used to
# raise KeyError below); then drop queries left with no relevant document.
available_doc_ids = set(docs_dict.keys())
filtered_qrels = {
    qid: {docid for docid in docids if docid in available_doc_ids}
    for qid, docids in qrels.items()
    if qid in queries
}
filtered_qrels = {qid: docids for qid, docids in filtered_qrels.items() if docids}
filtered_queries = {qid: queries[qid] for qid in filtered_qrels}

# From here on, qrels/queries refer to the filtered versions.
qrels = filtered_qrels
queries = filtered_queries

# Evaluate on the first MAX_EVAL_QUERIES queries only (dict insertion order).
MAX_EVAL_QUERIES = 10000
sample_queries = dict(list(queries.items())[:MAX_EVAL_QUERIES])

# =============================================
# 🔍 دوال البحث الأصلية
# =============================================
# Single-entry memo for the doc_id -> row mapping of the most recently used
# doc_ids sequence, so the mapping is not rebuilt for every query.
_DOC_ROW_MEMO = {}


def _doc_id_to_row(doc_ids):
    """Return ``{doc_id: position}`` for ``doc_ids``, reusing the last mapping when the same object is passed again."""
    memo = _DOC_ROW_MEMO.get("last")
    # Identity + length check: an in-place edit that keeps the length is not
    # detected, so doc_ids is expected to be treated as read-only.
    if memo is not None and memo[0] is doc_ids and memo[1] == len(doc_ids):
        return memo[2]
    mapping = {doc_id: idx for idx, doc_id in enumerate(doc_ids)}
    _DOC_ROW_MEMO["last"] = (doc_ids, len(doc_ids), mapping)
    return mapping


def search_tfidf_with_inverted_index(query, inverted_index_data, tfidf_vectorizer, tfidf_matrix, doc_ids, docs_dict, top_k=10, candidate_size=100):
    """Two-stage TF-IDF search: inverted-index candidate selection, then cosine re-ranking.

    Args:
        query: Raw query string; it is cleaned with ``advanced_preprocess``.
        inverted_index_data: Mapping whose ``"inverted_index"`` entry maps a
            term to an iterable of ``(doc_id, score)`` postings.
        tfidf_vectorizer: Object whose ``transform`` turns the cleaned query
            into a vector comparable with rows of ``tfidf_matrix``.
        tfidf_matrix: Document matrix; row ``i`` belongs to ``doc_ids[i]``.
        doc_ids: Sequence of document ids aligned with ``tfidf_matrix`` rows.
        docs_dict: Mapping doc_id -> document text.
        top_k: Maximum number of results returned.
        candidate_size: Number of best-scoring candidates kept for re-ranking.

    Returns:
        A list of ``(doc_id, doc_text, cosine_score)`` tuples, best first.
        Empty when the cleaned query has no terms or no candidate is found.
    """
    cleaned_query = advanced_preprocess(query)
    query_terms = cleaned_query.split()
    if not query_terms:
        return []

    # Stage 1: accumulate posting scores per document for every query term.
    inverted_index = inverted_index_data["inverted_index"]
    doc_scores = {}
    for term in query_terms:
        if term in inverted_index:
            for doc_id, score in inverted_index[term]:
                doc_scores[doc_id] = doc_scores.get(doc_id, 0) + score

    candidate_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:candidate_size]
    # Previously rebuilt from scratch on every call (O(len(doc_ids)) per query).
    doc_id_to_index = _doc_id_to_row(doc_ids)
    candidate_indices = [doc_id_to_index[doc_id] for doc_id, _ in candidate_docs if doc_id in doc_id_to_index]

    if not candidate_indices:
        return []

    # Stage 2: cosine similarity between the query and the candidate rows only.
    candidate_tfidf_matrix = tfidf_matrix[candidate_indices]
    query_vector = tfidf_vectorizer.transform([cleaned_query])
    cosine_scores = cosine_similarity(query_vector, candidate_tfidf_matrix).flatten()
    top_indices = cosine_scores.argsort()[::-1][:top_k]

    results = []
    for idx in top_indices:
        # idx indexes the candidate subset; map it back to the full matrix row.
        doc_idx = candidate_indices[idx]
        doc_id = doc_ids[doc_idx]
        doc_text = docs_dict.get(doc_id, "")
        score = cosine_scores[idx]
        results.append((doc_id, doc_text, score))
    return results

def search_bert(query, top_k=10):
    """Rank all documents by cosine similarity between the query embedding and ``bert_embeddings``.

    Args:
        query: Raw query string, encoded as-is with ``bert_model``.
        top_k: Maximum number of results returned.

    Returns:
        A list of ``(doc_id, doc_text, score)`` tuples, best first; the text is
        ``""`` when the id is not a key of ``docs_dict``.
    """
    encoded_query = bert_model.encode([query])
    similarities = cosine_similarity(encoded_query, bert_embeddings).flatten()
    ranked_positions = np.argsort(similarities)[::-1][:top_k]

    hits = []
    for pos in ranked_positions:
        doc_id = bert_doc_ids[pos]
        hits.append((doc_id, docs_dict.get(doc_id, ""), similarities[pos]))
    return hits

def search_hybrid(query, tfidf_weight=0.5, bert_weight=0.5, top_k=10):
    """Rank all documents by a weighted sum of TF-IDF and BERT cosine similarities.

    Args:
        query: Raw query string (cleaned for TF-IDF, used as-is for BERT).
        tfidf_weight: Multiplier applied to the TF-IDF similarity.
        bert_weight: Multiplier applied to the BERT similarity.
        top_k: Maximum number of results returned.

    Returns:
        A list of ``(doc_id, doc_text, combined_score)`` tuples, best first.
    """
    # NOTE(review): the two score vectors are added position by position and
    # ids are taken from tfidf_doc_ids, so this assumes the TF-IDF and BERT
    # indexes list documents in the same order — confirm.
    cleaned = advanced_preprocess(query)
    lexical = cosine_similarity(tfidf_vectorizer.transform([cleaned]), tfidf_matrix).flatten()
    semantic = cosine_similarity(bert_model.encode([query]), bert_embeddings).flatten()
    blended = tfidf_weight * lexical + bert_weight * semantic

    hits = []
    for pos in np.argsort(blended)[::-1][:top_k]:
        doc_id = tfidf_doc_ids[pos]
        hits.append((doc_id, docs_dict.get(doc_id, ""), blended[pos]))
    return hits

# =============================================
# 🧠 تغليف بالكاش
# =============================================
@memory.cache
def cached_search_tfidf(query, top_k=10, candidate_size=100):
    """Cached TF-IDF search bound to the module-level index artifacts.

    Forwards to ``search_tfidf_with_inverted_index`` with the globally loaded
    inverted index, vectorizer, matrix, doc ids and corpus.

    NOTE(review): the cache presumably keys on the arguments and this
    function's source only, not on the index files — clear ./cache after
    rebuilding the index; confirm.
    """
    return search_tfidf_with_inverted_index(
        query=query,
        inverted_index_data=inverted_index_data,
        tfidf_vectorizer=tfidf_vectorizer,
        tfidf_matrix=tfidf_matrix,
        doc_ids=tfidf_doc_ids,
        docs_dict=docs_dict,
        top_k=top_k,
        candidate_size=candidate_size,
    )

@memory.cache
def cached_search_bert(query, top_k=10):
    """Cached wrapper around ``search_bert``; same arguments and return value."""
    hits = search_bert(query, top_k=top_k)
    return hits

@memory.cache
def cached_search_hybrid(query, tfidf_weight=0.5, bert_weight=0.5, top_k=10):
    """Cached wrapper around ``search_hybrid``; same arguments and return value."""
    return search_hybrid(
        query,
        tfidf_weight=tfidf_weight,
        bert_weight=bert_weight,
        top_k=top_k,
    )

# =============================================
# 📁 دوال كتابة run و qrel
# =============================================
def write_qrel_file(qrels, filepath):
    """Write relevance judgments to ``filepath``, one ``"<qid> 0 <docid> 1"`` line per pair.

    Args:
        qrels: Mapping of query id -> iterable of relevant doc ids.
        filepath: Destination path; the file is opened in write mode.
    """
    with open(filepath, "w") as out:
        out.writelines(
            f"{qid} 0 {docid} 1\n"
            for qid, relevant_ids in qrels.items()
            for docid in relevant_ids
        )

def write_run_file_threaded(search_fn, queries, run_name, filepath, top_k=10, max_workers=8):
    """Run ``search_fn`` for every query on a thread pool and write a TREC-style run file.

    Args:
        search_fn: Callable invoked as ``search_fn(query, top_k=top_k)``; it
            must return an iterable of ``(doc_id, text, score)`` triples.
        queries: Mapping of query id -> query text.
        run_name: Tag written in the last column and shown in the progress bar.
        filepath: Destination path; the file is opened in write mode.
        top_k: Forwarded to ``search_fn``.
        max_workers: Size of the thread pool.

    Lines have the form ``"<qid> Q0 <doc_id> <rank> <score> <run_name>"`` and
    are written in the order in which searches complete. A query whose search
    or write raises is reported on stdout and otherwise skipped.
    """
    with open(filepath, "w") as run_file, ThreadPoolExecutor(max_workers=max_workers) as pool:
        # future -> query id, so a finished future can be attributed to its query.
        pending = {}
        for qid, query in queries.items():
            pending[pool.submit(search_fn, query, top_k=top_k)] = qid

        progress = tqdm(as_completed(pending), total=len(pending), desc=f"Running {run_name}")
        for finished in progress:
            qid = pending[finished]
            try:
                for rank, (doc_id, _, score) in enumerate(finished.result(), start=1):
                    run_file.write(f"{qid} Q0 {doc_id} {rank} {score} {run_name}\n")
            except Exception as e:
                print(f"⚠️ Error in query {qid}: {e}")

# =============================================
# 📈 التقييم
# =============================================
# Temporary files (relative to the working directory) holding the judgments
# and one run per retrieval model; they are deleted at the end of this cell.
qrel_path = "filtered_msmarco.qrel"
run_tfidf_path = "run_tfidf.txt"
run_bert_path = "run_bert.txt"
run_hybrid_path = "run_hybrid.txt"

# The qrel file covers every query in `qrels`; the runs only cover `sample_queries`.
write_qrel_file(qrels, qrel_path)
write_run_file_threaded(cached_search_tfidf, sample_queries, "TFIDF", run_tfidf_path, top_k=10)
write_run_file_threaded(cached_search_bert, sample_queries, "BERT", run_bert_path, top_k=10)
# The lambda pins the hybrid weights to 0.4 (TF-IDF) / 0.6 (BERT) while keeping
# the (query, top_k=...) call shape that write_run_file_threaded uses.
write_run_file_threaded(lambda q, top_k=10: cached_search_hybrid(q, tfidf_weight=0.4, bert_weight=0.6, top_k=top_k), sample_queries, "Hybrid", run_hybrid_path, top_k=10)

# NOTE: this rebinds `qrel`, which was the loop variable of the qrels-loading loop.
qrel = TrecQrel(qrel_path)
runs = {
    "TFIDF": TrecRun(run_tfidf_path),
    "BERT": TrecRun(run_bert_path),
    "Hybrid": TrecRun(run_hybrid_path),
}

# One dict per model; tabulate prints the dict keys as column headers.
results_table = []

for model_name, run in runs.items():
    evaluation = TrecEval(run, qrel)
    # NOTE(review): each run holds at most 10 results per query (top_k=10), so
    # get_recall(1000) presumably measures recall over those 10 — confirm.
    model_results = {
        "Model": model_name,
        "MAP": evaluation.get_map(),
        "MRR": evaluation.get_reciprocal_rank(),
        "P@10": evaluation.get_precision(10),
        "Recall": evaluation.get_recall(1000)
    }
    results_table.append(model_results)

print("\n📊 Evaluation Results:")
print(tabulate(results_table, headers="keys", tablefmt="fancy_grid", floatfmt=".4f"))

# Best-effort cleanup of the temporary files; a failed deletion is only reported.
for path in [qrel_path, run_tfidf_path, run_bert_path, run_hybrid_path]:
    try:
        os.remove(path)
    except OSError as e:
        print(f"⚠️ فشل حذف {path}: {e}")


Running TFIDF: 100%|████████████████████████████████████████████████████████████| 10000/10000 [01:09<00:00, 143.88it/s]
Running BERT: 100%|█████████████████████████████████████████████████████████████| 10000/10000 [01:34<00:00, 105.27it/s]
Running Hybrid: 100%|███████████████████████████████████████████████████████████| 10000/10000 [01:16<00:00, 131.19it/s]
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()



📊 Evaluation Results:
╒═════════╤════════╤════════╤════════╤══════════╕
│ Model   │    MAP │    MRR │   P@10 │   Recall │
╞═════════╪════════╪════════╪════════╪══════════╡
│ TFIDF   │ 0.2682 │ 0.2734 │ 0.0599 │   0.5729 │
├─────────┼────────┼────────┼────────┼──────────┤
│ BERT    │ 0.2644 │ 0.2702 │ 0.0567 │   0.5394 │
├─────────┼────────┼────────┼────────┼──────────┤
│ Hybrid  │ 0.3438 │ 0.3501 │ 0.0733 │   0.6983 │
╘═════════╧════════╧════════╧════════╧══════════╛


In [None]:
# =============================================
# 📦 المكتبات المطلوبة
# =============================================
import ir_datasets
import numpy as np
import os
import joblib
import json
import re
import html
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from trectools import TrecQrel, TrecRun, TrecEval 
from tabulate import tabulate
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from joblib import Memory
import sys
sys.path.append("..")
from services.vector_store import VectorStore 
# =============================================
# ⚙️ تهيئة الكاش
# =============================================
# joblib Memory object whose cache location is ./cache (relative to the working directory).
memory = Memory(location='./cache', verbose=0)

# =============================================
# ⚙️ Load ir_datasets data (MSMARCO)
# =============================================
dataset = ir_datasets.load("msmarco-passage/train")
# Query texts are read directly from a TSV under the user's home directory
# instead of through the dataset object.
# NOTE(review): presumably this file only exists after ir_datasets has
# downloaded the collection — confirm.
queries_path = os.path.expanduser("~/.ir_datasets/msmarco-passage/train/queries.tsv")

# query_id -> query text. Lines with fewer than two tab-separated fields are skipped.
queries = {}
# errors='ignore' silently drops bytes that are not valid UTF-8.
with open(queries_path, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        parts = line.strip().split("\t")
        if len(parts) >= 2:
            queries[parts[0]] = parts[1]

# query_id -> set of doc_ids; only judgments with relevance > 0 are kept.
qrels = {}
for qrel in dataset.qrels_iter():
    if qrel.relevance > 0:
        qrels.setdefault(qrel.query_id, set()).add(qrel.doc_id)

# =============================================
# 🧼 Text-cleaning function
# =============================================
# Module-level resources used by advanced_preprocess.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

def advanced_preprocess(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: HTML-entity unescaping, replacement of non-printable
    characters by spaces, lower-casing, removal of URLs, e-mail-like tokens
    and ``<...>`` tags, replacement of everything outside ``a-z`` and
    whitespace by a space, collapsing of runs of three or more identical
    characters to a single one, whitespace normalisation, and finally
    stop-word / short-token filtering plus stemming.

    Args:
        text: Input string.

    Returns:
        The surviving tokens (longer than two characters and not in
        ``stop_words``), each passed through ``stemmer.stem``, joined by
        single spaces. May be the empty string.
    """
    text = html.unescape(text)
    # "\n", "\t" and "\r" are not printable, so merely dropping non-printable
    # characters would glue the words on either side together
    # ("new\nyork" -> "newyork"). Substitute a space to keep token boundaries.
    # NOTE(review): if the index was built with a cleaner that drops these
    # characters instead, documents containing them were tokenised differently
    # — confirm whether a re-index is wanted.
    text = ''.join(c if c.isprintable() else ' ' for c in text)
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Runs of 3+ identical characters collapse to one occurrence.
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    words = [stemmer.stem(w) for w in text.split() if w not in stop_words and len(w) > 2]
    return ' '.join(words)

# =============================================
# تحميل ملفات التمثيلات
# =============================================
# NOTE: joblib.load unpickles arbitrary objects — only load index artifacts
# that come from a trusted source.
tfidf_doc_ids = joblib.load(r"../data/msmarco_train/index/TFIDF/doc_ids_msmarco_train.joblib")
tfidf_matrix = joblib.load(r"../data/msmarco_train/index/TFIDF/tfidf_matrix_msmarco_train.joblib")
tfidf_vectorizer = joblib.load(r"../data/msmarco_train/index/TFIDF/tfidf_vectorizer_msmarco_train.joblib")
inverted_index_data = joblib.load(r"../data/msmarco_train/index/TFIDF/tfidf_inverted_index.joblib")

bert_embeddings = np.load(r"../data/msmarco_train/index/bert/bert_embeddings.npy")
bert_doc_ids = joblib.load(r"../data/msmarco_train/index/bert/doc_ids.joblib")
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Project-local vector store used by search_bert / search_hybrid in this cell.
vector_store_path = r"../data/msmarco_train/index/vector_store"
vector_store = VectorStore.load(vector_store_path)

# doc_id (as str) -> raw text, read from a file with one JSON object per line.
docs_dict = {}
skipped_doc_lines = 0
with open(r"../data/msmarco_train/raw/raw_msmarco_train.json", "r", encoding="utf-8") as f:
    for line in f:
        try:
            j = json.loads(line)
            docs_dict[str(j["id"])] = j["text"]
        except (json.JSONDecodeError, KeyError, TypeError):
            # Best-effort load: skip lines that are not valid JSON or lack the
            # "id"/"text" fields, but no longer swallow unrelated errors
            # (KeyboardInterrupt, MemoryError, ...) as the bare except did.
            skipped_doc_lines += 1
            continue
if skipped_doc_lines:
    print(f"⚠️ Skipped {skipped_doc_lines} malformed line(s) while loading the raw corpus")

# =============================================
# ✳️ فلترة qrels و queries
# =============================================
# Keep only judgments whose document exists in the local corpus and whose
# query text was actually loaded (a judged query missing from the TSV used to
# raise KeyError below); then drop queries left with no relevant document.
available_doc_ids = set(docs_dict.keys())
filtered_qrels = {
    qid: {docid for docid in docids if docid in available_doc_ids}
    for qid, docids in qrels.items()
    if qid in queries
}
filtered_qrels = {qid: docids for qid, docids in filtered_qrels.items() if docids}
filtered_queries = {qid: queries[qid] for qid in filtered_qrels}

# From here on, qrels/queries refer to the filtered versions.
qrels = filtered_qrels
queries = filtered_queries

# Evaluate on the first MAX_EVAL_QUERIES queries only (dict insertion order).
MAX_EVAL_QUERIES = 10000
sample_queries = dict(list(queries.items())[:MAX_EVAL_QUERIES])

# =============================================
# 🔍 دوال البحث الأصلية
# =============================================
# Single-entry memo for the doc_id -> row mapping of the most recently used
# doc_ids sequence, so the mapping is not rebuilt for every query.
_DOC_ROW_MEMO = {}


def _doc_id_to_row(doc_ids):
    """Return ``{doc_id: position}`` for ``doc_ids``, reusing the last mapping when the same object is passed again."""
    memo = _DOC_ROW_MEMO.get("last")
    # Identity + length check: an in-place edit that keeps the length is not
    # detected, so doc_ids is expected to be treated as read-only.
    if memo is not None and memo[0] is doc_ids and memo[1] == len(doc_ids):
        return memo[2]
    mapping = {doc_id: idx for idx, doc_id in enumerate(doc_ids)}
    _DOC_ROW_MEMO["last"] = (doc_ids, len(doc_ids), mapping)
    return mapping


def search_tfidf_with_inverted_index(query, inverted_index_data, tfidf_vectorizer, tfidf_matrix, doc_ids, docs_dict, top_k=10, candidate_size=100):
    """Two-stage TF-IDF search: inverted-index candidate selection, then cosine re-ranking.

    Args:
        query: Raw query string; it is cleaned with ``advanced_preprocess``.
        inverted_index_data: Mapping whose ``"inverted_index"`` entry maps a
            term to an iterable of ``(doc_id, score)`` postings.
        tfidf_vectorizer: Object whose ``transform`` turns the cleaned query
            into a vector comparable with rows of ``tfidf_matrix``.
        tfidf_matrix: Document matrix; row ``i`` belongs to ``doc_ids[i]``.
        doc_ids: Sequence of document ids aligned with ``tfidf_matrix`` rows.
        docs_dict: Mapping doc_id -> document text.
        top_k: Maximum number of results returned.
        candidate_size: Number of best-scoring candidates kept for re-ranking.

    Returns:
        A list of ``(doc_id, doc_text, cosine_score)`` tuples, best first.
        Empty when the cleaned query has no terms or no candidate is found.
    """
    cleaned_query = advanced_preprocess(query)
    query_terms = cleaned_query.split()
    if not query_terms:
        return []

    # Stage 1: accumulate posting scores per document for every query term.
    inverted_index = inverted_index_data["inverted_index"]
    doc_scores = {}
    for term in query_terms:
        if term in inverted_index:
            for doc_id, score in inverted_index[term]:
                doc_scores[doc_id] = doc_scores.get(doc_id, 0) + score

    candidate_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:candidate_size]
    # Previously rebuilt from scratch on every call (O(len(doc_ids)) per query).
    doc_id_to_index = _doc_id_to_row(doc_ids)
    candidate_indices = [doc_id_to_index[doc_id] for doc_id, _ in candidate_docs if doc_id in doc_id_to_index]

    if not candidate_indices:
        return []

    # Stage 2: cosine similarity between the query and the candidate rows only.
    candidate_tfidf_matrix = tfidf_matrix[candidate_indices]
    query_vector = tfidf_vectorizer.transform([cleaned_query])
    cosine_scores = cosine_similarity(query_vector, candidate_tfidf_matrix).flatten()
    top_indices = cosine_scores.argsort()[::-1][:top_k]

    results = []
    for idx in top_indices:
        # idx indexes the candidate subset; map it back to the full matrix row.
        doc_idx = candidate_indices[idx]
        doc_id = doc_ids[doc_idx]
        doc_text = docs_dict.get(doc_id, "")
        score = cosine_scores[idx]
        results.append((doc_id, doc_text, score))
    return results

def search_bert(query, top_k=10):
    """Return the vector store's top ``top_k`` hits for ``query``.

    The query is encoded with ``bert_model`` (normalised embeddings, cast to
    float32) and handed to ``vector_store.search``, whose result is returned
    unchanged.

    NOTE(review): callers unpack each hit as ``(doc_id, text, score)`` —
    confirm that this is what ``VectorStore.search`` yields.
    """
    encoded_query = bert_model.encode([query], normalize_embeddings=True)
    return vector_store.search(encoded_query.astype(np.float32), top_k=top_k)

def search_hybrid(query, tfidf_weight=0.5, bert_weight=0.5, top_k=10):
    """Rank documents by a weighted sum of TF-IDF cosine similarity and vector-store scores.

    TF-IDF similarity is computed against every row of ``tfidf_matrix``. BERT
    scores come from ``vector_store.search`` and exist only for its top
    ``top_k * 20`` hits; every other document gets a BERT score of 0.

    Args:
        query: Raw query string (cleaned for TF-IDF, used as-is for BERT).
        tfidf_weight: Multiplier applied to the TF-IDF similarity.
        bert_weight: Multiplier applied to the vector-store score.
        top_k: Maximum number of results returned.

    Returns:
        A list of ``(doc_id, doc_text, combined_score)`` tuples, best first.

    Raises:
        ValueError: If ``tfidf_doc_ids`` and ``bert_doc_ids`` do not list the
            same ids in the same order.
    """
    # Scores are merged by row position, so both indexes must list the same
    # doc ids in the same order. Check this before doing any expensive work;
    # converting to lists keeps the comparison a plain boolean even when one
    # of the containers is a NumPy array.
    if len(tfidf_doc_ids) != len(bert_doc_ids) or list(tfidf_doc_ids) != list(bert_doc_ids):
        raise ValueError("قوائم doc_ids غير متطابقة بين النموذجين!")

    tfidf_scores = cosine_similarity(tfidf_vectorizer.transform([advanced_preprocess(query)]), tfidf_matrix).flatten()
    query_embedding = bert_model.encode([query], normalize_embeddings=True).astype(np.float32)
    top_bert_results = vector_store.search(query_embedding, top_k=top_k * 20)

    # Scatter the vector-store scores into a dense vector aligned with the
    # TF-IDF rows; documents outside the returned hits keep a score of 0.
    doc_id_to_index = {doc_id: idx for idx, doc_id in enumerate(bert_doc_ids)}
    bert_scores = np.zeros_like(tfidf_scores)
    for doc_id, _, score in top_bert_results:
        idx = doc_id_to_index.get(doc_id)
        if idx is not None:
            bert_scores[idx] = score

    combined_scores = tfidf_weight * tfidf_scores + bert_weight * bert_scores
    top_indices = np.argsort(combined_scores)[::-1][:top_k]
    results = [(tfidf_doc_ids[i], docs_dict.get(tfidf_doc_ids[i], ""), combined_scores[i]) for i in top_indices]
    return results

# =============================================
# 🧠 تغليف بالكاش
# =============================================
@memory.cache
def cached_search_tfidf(query, top_k=10, candidate_size=100):
    """Cached TF-IDF search bound to the module-level index artifacts.

    Forwards to ``search_tfidf_with_inverted_index`` with the globally loaded
    inverted index, vectorizer, matrix, doc ids and corpus.

    NOTE(review): the cache presumably keys on the arguments and this
    function's source only, not on the index files — clear ./cache after
    rebuilding the index; confirm.
    """
    return search_tfidf_with_inverted_index(
        query=query,
        inverted_index_data=inverted_index_data,
        tfidf_vectorizer=tfidf_vectorizer,
        tfidf_matrix=tfidf_matrix,
        doc_ids=tfidf_doc_ids,
        docs_dict=docs_dict,
        top_k=top_k,
        candidate_size=candidate_size,
    )

@memory.cache
def cached_search_bert(query, top_k=10):
    """Cached wrapper around ``search_bert``; same arguments and return value."""
    hits = search_bert(query, top_k=top_k)
    return hits

@memory.cache
def cached_search_hybrid(query, tfidf_weight=0.5, bert_weight=0.5, top_k=10):
    """Cached wrapper around ``search_hybrid``; same arguments and return value."""
    return search_hybrid(
        query,
        tfidf_weight=tfidf_weight,
        bert_weight=bert_weight,
        top_k=top_k,
    )

# =============================================
# 📁 دوال كتابة run و qrel
# =============================================
def write_qrel_file(qrels, filepath):
    """Write relevance judgments to ``filepath``, one ``"<qid> 0 <docid> 1"`` line per pair.

    Args:
        qrels: Mapping of query id -> iterable of relevant doc ids.
        filepath: Destination path; the file is opened in write mode.
    """
    with open(filepath, "w") as out:
        out.writelines(
            f"{qid} 0 {docid} 1\n"
            for qid, relevant_ids in qrels.items()
            for docid in relevant_ids
        )

def write_run_file_threaded(search_fn, queries, run_name, filepath, top_k=10, max_workers=8):
    """Run ``search_fn`` for every query on a thread pool and write a TREC-style run file.

    Args:
        search_fn: Callable invoked as ``search_fn(query, top_k=top_k)``; it
            must return an iterable of ``(doc_id, text, score)`` triples.
        queries: Mapping of query id -> query text.
        run_name: Tag written in the last column and shown in the progress bar.
        filepath: Destination path; the file is opened in write mode.
        top_k: Forwarded to ``search_fn``.
        max_workers: Size of the thread pool.

    Lines have the form ``"<qid> Q0 <doc_id> <rank> <score> <run_name>"`` and
    are written in the order in which searches complete. A query whose search
    or write raises is reported on stdout and otherwise skipped.
    """
    with open(filepath, "w") as run_file, ThreadPoolExecutor(max_workers=max_workers) as pool:
        # future -> query id, so a finished future can be attributed to its query.
        pending = {}
        for qid, query in queries.items():
            pending[pool.submit(search_fn, query, top_k=top_k)] = qid

        progress = tqdm(as_completed(pending), total=len(pending), desc=f"Running {run_name}")
        for finished in progress:
            qid = pending[finished]
            try:
                for rank, (doc_id, _, score) in enumerate(finished.result(), start=1):
                    run_file.write(f"{qid} Q0 {doc_id} {rank} {score} {run_name}\n")
            except Exception as e:
                print(f"⚠️ Error in query {qid}: {e}")

# =============================================
# 📈 التقييم
# =============================================
# Temporary files (relative to the working directory) holding the judgments
# and one run per retrieval model; they are deleted at the end of this cell.
qrel_path = "filtered_msmarco.qrel"
run_tfidf_path = "run_tfidf.txt"
run_bert_path = "run_bert.txt"
run_hybrid_path = "run_hybrid.txt"

# The qrel file covers every query in `qrels`; the runs only cover `sample_queries`.
write_qrel_file(qrels, qrel_path)
write_run_file_threaded(cached_search_tfidf, sample_queries, "TFIDF", run_tfidf_path, top_k=10)
write_run_file_threaded(cached_search_bert, sample_queries, "BERT", run_bert_path, top_k=10)
# The lambda pins the hybrid weights to 0.4 (TF-IDF) / 0.6 (BERT) while keeping
# the (query, top_k=...) call shape that write_run_file_threaded uses.
write_run_file_threaded(lambda q, top_k=10: cached_search_hybrid(q, tfidf_weight=0.4, bert_weight=0.6, top_k=top_k), sample_queries, "Hybrid", run_hybrid_path, top_k=10)

# NOTE: this rebinds `qrel`, which was the loop variable of the qrels-loading loop.
qrel = TrecQrel(qrel_path)
runs = {
    "TFIDF": TrecRun(run_tfidf_path),
    "BERT": TrecRun(run_bert_path),
    "Hybrid": TrecRun(run_hybrid_path),
}

# One dict per model; tabulate prints the dict keys as column headers.
results_table = []

for model_name, run in runs.items():
    evaluation = TrecEval(run, qrel)
    # NOTE(review): each run holds at most 10 results per query (top_k=10), so
    # get_recall(1000) presumably measures recall over those 10 — confirm.
    model_results = {
        "Model": model_name,
        "MAP": evaluation.get_map(),
        "MRR": evaluation.get_reciprocal_rank(),
        "P@10": evaluation.get_precision(10),
        "Recall": evaluation.get_recall(1000)
    }
    results_table.append(model_results)

print("\n📊 Evaluation Results:")
print(tabulate(results_table, headers="keys", tablefmt="fancy_grid", floatfmt=".4f"))

# Best-effort cleanup of the temporary files; a failed deletion is only reported.
for path in [qrel_path, run_tfidf_path, run_bert_path, run_hybrid_path]:
    try:
        os.remove(path)
    except OSError as e:
        print(f"⚠️ فشل حذف {path}: {e}")
