In [1]:
# =============================================
# 📦 المكتبات المطلوبة
# =============================================
import ir_datasets
import numpy as np
import os
import joblib
import json
import re
import html
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from trectools import TrecQrel, TrecRun, TrecEval 
from tabulate import tabulate
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from joblib import Memory
import sys
sys.path.append("..")
from services.documents_service import preprocess
# =============================================
# ⚙️ Cache setup
# =============================================
# joblib on-disk cache rooted at ./cache; it backs the @memory.cache search
# wrappers defined further down.
memory = Memory(location='./cache', verbose=0)

# =============================================
# ⚙️ Load query texts from a JSON Lines file (one JSON object per line)
# =============================================

queries_path = r"C:\Users\Mohammad Mihdi\.ir_datasets\beir\quora\queries.json"  # adjust this path to your own queries file

# Map query id ("_id") -> query text ("text"). A line that cannot be parsed
# or lacks either field is reported and skipped.
queries = {}
with open(queries_path, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            obj = json.loads(line)
            qid, text = obj["_id"], obj["text"]
            queries[qid] = text
        except Exception as e:
            print(f"⚠️ خطأ في قراءة السطر: {e}")

# =============================================
# ⚙️ Load relevance judgments (qrels) for beir/quora/test
# =============================================

dataset = ir_datasets.load("beir/quora/test")

# Map query id -> set of doc ids whose judgment has relevance > 0.
qrels = {}
for judgment in dataset.qrels_iter():
    if not judgment.relevance > 0:
        continue
    if judgment.query_id not in qrels:
        qrels[judgment.query_id] = set()
    qrels[judgment.query_id].add(judgment.doc_id)


# =============================================
# 🧼 Text-cleaning helpers
# =============================================
# English stopword set and Snowball stemmer used by advanced_preprocess below.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

def advanced_preprocess(text):
    """Normalise raw text into a space-joined string of stemmed tokens.

    Steps, in order: unescape HTML entities, drop non-printable characters,
    lowercase, remove URLs, e-mail-like tokens and ``<...>`` tags, replace
    everything except ``a-z`` and whitespace with a space, collapse runs of
    three or more identical characters to one, and squeeze whitespace.
    Tokens that are stopwords or shorter than three characters are dropped;
    the rest are stemmed with the module-level ``stemmer``.

    Note: non-printable characters (which include ``\\n`` and ``\\t``) are
    removed without inserting a space, so words separated only by such a
    character are joined.
    """
    cleaned = html.unescape(text)
    cleaned = ''.join(filter(str.isprintable, cleaned)).lower()

    # (pattern, replacement) pairs applied sequentially; order matters.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),
        (r'\S+@\S+', ''),
        (r'<.*?>', ''),
        (r'[^a-z\s]', ' '),
        (r'(.)\1{2,}', r'\1'),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    tokens = []
    for word in cleaned.strip().split():
        if len(word) > 2 and word not in stop_words:
            tokens.append(stemmer.stem(word))
    return ' '.join(tokens)

# =============================================
# Load the prebuilt index artifacts
# =============================================
# NOTE(review): joblib.load unpickles its input — only load artifacts that
# were produced by a trusted pipeline.
# TF-IDF side: the search functions below index tfidf_doc_ids by row number of
# tfidf_matrix, so the two are presumably aligned — verify against the indexer.
tfidf_doc_ids = joblib.load(r"../data/beir_quora_test/index/TFIDF/doc_ids_beir_quora_test.joblib")
tfidf_matrix = joblib.load(r"../data/beir_quora_test/index/TFIDF/tfidf_matrix_beir_quora_test.joblib")
tfidf_vectorizer = joblib.load(r"../data/beir_quora_test/index/TFIDF/tfidf_vectorizer_beir_quora_test.joblib")
# Read as inverted_index_data["inverted_index"][term] -> iterable of
# (doc_id, score) pairs by search_tfidf_with_inverted_index.
inverted_index_data = joblib.load(r"../data/beir_quora_test/index/TFIDF/tfidf_inverted_index.joblib")

# BERT side: bert_doc_ids is indexed by row number of bert_embeddings in
# search_bert, so the two are presumably aligned — verify against the indexer.
bert_embeddings = np.load(r"../data/beir_quora_test/index/bert/bert_embeddings.npy")
bert_doc_ids = joblib.load(r"../data/beir_quora_test/index/bert/doc_ids.joblib")
# Sentence encoder used for query embeddings at search time.
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Map document id (as a string) -> raw document text, read from a JSON Lines
# dump (one JSON object per line with "id" and "text" fields).
docs_dict = {}
with open(r"../data/beir_quora_test/raw/raw_beir_quora_test.json", "r", encoding="utf-8") as f:
    for line in f:
        try:
            j = json.loads(line)
            docs_dict[str(j["id"])] = j["text"]
        except (ValueError, KeyError, TypeError):
            # Best-effort load: skip lines that are not valid JSON
            # (json.JSONDecodeError is a ValueError), lack "id"/"text", or are
            # not JSON objects. Unlike a bare ``except:``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            continue

# =============================================
# ✳️ Restrict qrels and queries to the documents that are available locally
# =============================================
available_doc_ids = set(docs_dict)

# Keep only judged documents present in docs_dict, and drop any query that is
# left without a relevant document.
filtered_qrels = {}
for qid, docids in qrels.items():
    present = docids & available_doc_ids
    if present:
        filtered_qrels[qid] = present

# Keep only the queries that still have judgments and whose text was loaded.
filtered_queries = {qid: queries[qid] for qid in filtered_qrels if qid in queries}

qrels = filtered_qrels
queries = filtered_queries

# Evaluate on at most the first 10000 filtered queries (adjust as needed).
sample_queries = dict(list(queries.items())[:10000])


# =============================================
# 🔍 دوال البحث الأصلية
# =============================================
# doc_id -> row-index mappings, keyed by id() of the doc_ids sequence. Each
# entry also stores the sequence itself (so its id() cannot be recycled) and
# its length at build time (so a resized list triggers a rebuild).
_DOC_INDEX_CACHE = {}


def _doc_id_to_index(doc_ids):
    """Return a doc_id -> row-index dict for ``doc_ids``, built once per sequence."""
    entry = _DOC_INDEX_CACHE.get(id(doc_ids))
    if entry is None or entry[0] is not doc_ids or entry[1] != len(doc_ids):
        mapping = {doc_id: idx for idx, doc_id in enumerate(doc_ids)}
        entry = (doc_ids, len(doc_ids), mapping)
        _DOC_INDEX_CACHE[id(doc_ids)] = entry
    return entry[2]


def search_tfidf_with_inverted_index(query, inverted_index_data, tfidf_vectorizer, tfidf_matrix, doc_ids, docs_dict, top_k=10, candidate_size=100):
    """Retrieve documents for ``query`` in two stages.

    Stage 1 sums, per document, the posting scores of every cleaned query term
    found in ``inverted_index_data["inverted_index"]`` and keeps the
    ``candidate_size`` highest-scoring documents. Stage 2 re-ranks those
    candidates by cosine similarity between the query's TF-IDF vector and
    their rows of ``tfidf_matrix``.

    Args:
        query: Raw query text; cleaned with ``advanced_preprocess``.
        inverted_index_data: Mapping whose ``"inverted_index"`` entry maps a
            term to an iterable of ``(doc_id, score)`` postings.
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``.
        tfidf_matrix: Document-term matrix, row-indexable by a list of ints.
        doc_ids: Sequence of document ids; position ``i`` is used as the row
            index into ``tfidf_matrix``.
        docs_dict: Mapping doc_id -> text (missing ids yield ``""``).
        top_k: Maximum number of results returned.
        candidate_size: Number of stage-1 candidates that are re-ranked.

    Returns:
        A list of ``(doc_id, doc_text, score)`` tuples, best first. Empty when
        the cleaned query has no terms or no candidate maps to a row.
    """
    cleaned_query = advanced_preprocess(query)
    query_terms = cleaned_query.split()
    if not query_terms:
        return []

    # Stage 1: accumulate posting scores per document.
    inverted_index = inverted_index_data["inverted_index"]
    doc_scores = {}
    for term in query_terms:
        for doc_id, score in inverted_index.get(term, ()):
            doc_scores[doc_id] = doc_scores.get(doc_id, 0) + score

    candidate_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:candidate_size]

    # The mapping covers the whole collection, so it is cached instead of
    # being rebuilt (O(len(doc_ids))) on every query.
    doc_id_to_index = _doc_id_to_index(doc_ids)
    candidate_indices = [doc_id_to_index[doc_id] for doc_id, _ in candidate_docs if doc_id in doc_id_to_index]

    if not candidate_indices:
        return []

    # Stage 2: cosine re-ranking restricted to the candidate rows.
    candidate_tfidf_matrix = tfidf_matrix[candidate_indices]
    query_vector = tfidf_vectorizer.transform([cleaned_query])
    cosine_scores = cosine_similarity(query_vector, candidate_tfidf_matrix).flatten()
    top_indices = cosine_scores.argsort()[::-1][:top_k]

    results = []
    for idx in top_indices:
        doc_id = doc_ids[candidate_indices[idx]]
        results.append((doc_id, docs_dict.get(doc_id, ""), cosine_scores[idx]))
    return results

def search_bert(query, top_k=10):
    """Rank all documents by cosine similarity to the query embedding.

    The query is encoded with the module-level ``bert_model`` and compared
    against every row of ``bert_embeddings``.

    Returns:
        Up to ``top_k`` ``(doc_id, doc_text, score)`` tuples, best first;
        ``doc_text`` is ``""`` when the id is missing from ``docs_dict``.
    """
    query_vec = bert_model.encode([query])
    similarities = cosine_similarity(query_vec, bert_embeddings).flatten()
    ranked = np.argsort(similarities)[::-1][:top_k]

    hits = []
    for i in ranked:
        doc_id = bert_doc_ids[i]
        hits.append((doc_id, docs_dict.get(doc_id, ""), similarities[i]))
    return hits

def search_hybrid(query, tfidf_weight=0.5, bert_weight=0.5, top_k=10):
    """Rank all documents by a weighted sum of TF-IDF and BERT cosine scores.

    Both score vectors are computed over the full collection and added
    position by position; result ids are taken from ``tfidf_doc_ids``.
    NOTE(review): this relies on ``tfidf_matrix`` and ``bert_embeddings``
    listing documents in the same order — that is not checked here.

    Returns:
        Up to ``top_k`` ``(doc_id, doc_text, combined_score)`` tuples, best
        first.
    """
    query_tfidf = tfidf_vectorizer.transform([advanced_preprocess(query)])
    lexical = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    query_vec = bert_model.encode([query])
    semantic = cosine_similarity(query_vec, bert_embeddings).flatten()

    blended = tfidf_weight * lexical + bert_weight * semantic
    ranked = np.argsort(blended)[::-1][:top_k]

    hits = []
    for i in ranked:
        doc_id = tfidf_doc_ids[i]
        hits.append((doc_id, docs_dict.get(doc_id, ""), blended[i]))
    return hits

# =============================================
# 🧠 تغليف بالكاش
# =============================================
@memory.cache
def cached_search_tfidf(query, top_k=10, candidate_size=100):
    """Disk-cached TF-IDF search bound to the module-level index objects.

    NOTE(review): only ``query``, ``top_k`` and ``candidate_size`` are passed
    to the cached function; the index objects come from module scope, so
    cached results may outlive a rebuilt index — confirm ./cache is cleared
    when the index files change.
    """
    return search_tfidf_with_inverted_index(
        query,
        inverted_index_data,
        tfidf_vectorizer,
        tfidf_matrix,
        tfidf_doc_ids,
        docs_dict,
        top_k=top_k,
        candidate_size=candidate_size,
    )

@memory.cache
def cached_search_bert(query, top_k=10):
    """Disk-cached wrapper around ``search_bert``.

    NOTE(review): only ``query`` and ``top_k`` are passed to the cached
    function; cached results may outlive a change to ``search_bert`` or the
    embeddings — confirm ./cache is cleared when those change.
    """
    return search_bert(query, top_k=top_k)

@memory.cache
def cached_search_hybrid(query, tfidf_weight=0.5, bert_weight=0.5, top_k=10):
    """Disk-cached wrapper around ``search_hybrid``.

    NOTE(review): only the call arguments are passed to the cached function;
    cached results may outlive a change to ``search_hybrid`` or the indexes —
    confirm ./cache is cleared when those change.
    """
    return search_hybrid(
        query,
        tfidf_weight=tfidf_weight,
        bert_weight=bert_weight,
        top_k=top_k,
    )

# =============================================
# 📁 دوال كتابة run و qrel
# =============================================
def write_qrel_file(qrels, filepath):
    """Write relevance judgments to ``filepath`` in TREC qrels format.

    Args:
        qrels: Mapping query id -> iterable of relevant doc ids.
        filepath: Destination path; the file is overwritten.

    Every (query, document) pair becomes one line ``<qid> 0 <docid> 1``.
    """
    with open(filepath, "w") as out:
        out.writelines(
            f"{qid} 0 {docid} 1\n"
            for qid, docids in qrels.items()
            for docid in docids
        )


def write_run_file_threaded(search_fn, queries, run_name, filepath, top_k=10, max_workers=8):
    """Run ``search_fn`` for every query on a thread pool and write a TREC run file.

    Args:
        search_fn: Callable invoked as ``search_fn(query_text, top_k=top_k)``
            and returning an iterable of ``(doc_id, text, score)`` tuples.
        queries: Mapping query id -> query text.
        run_name: Tag written in the last column and shown on the progress bar.
        filepath: Destination path; the file is overwritten.
        top_k: Forwarded to ``search_fn``.
        max_workers: Thread-pool size.

    Lines have the form ``<qid> Q0 <doc_id> <rank> <score> <run_name>`` and
    are written in the order queries finish, not in input order. A query
    whose search or write raises is reported on stdout and skipped.
    """
    with open(filepath, "w") as run_file, ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Submit everything up front, remembering which query each future serves.
        pending = {}
        for qid, query in queries.items():
            pending[pool.submit(search_fn, query, top_k=top_k)] = qid

        progress = tqdm(as_completed(pending), total=len(pending), desc=f"Running {run_name}")
        for finished in progress:
            qid = pending[finished]
            try:
                hits = finished.result()
                for rank, (doc_id, _, score) in enumerate(hits, start=1):
                    run_file.write(f"{qid} Q0 {doc_id} {rank} {score} {run_name}\n")
            except Exception as e:
                print(f"⚠️ Error in query {qid}: {e}")

# =============================================
# 📈 Evaluation: write the qrel file and one run file per system
# =============================================
qrel_path = "filtered_beir.qrel"
run_tfidf_path = "run_tfidf.txt"
run_bert_path = "run_bert.txt"
run_hybrid_path = "run_hybrid.txt"

write_qrel_file(qrels, qrel_path)

# (search function, run tag, output path) per system. The hybrid run weights
# TF-IDF at 0.4 and BERT at 0.6.
run_specs = (
    (cached_search_tfidf, "TFIDF", run_tfidf_path),
    (cached_search_bert, "BERT", run_bert_path),
    (lambda q, top_k=10: cached_search_hybrid(q, tfidf_weight=0.4, bert_weight=0.6, top_k=top_k), "Hybrid", run_hybrid_path),
)
for search_fn, run_name, run_path in run_specs:
    write_run_file_threaded(search_fn, sample_queries, run_name, run_path, top_k=10)

# Score every run against the filtered qrels with trectools.
qrel = TrecQrel(qrel_path)
runs = {
    name: TrecRun(run_file)
    for name, run_file in (
        ("TFIDF", run_tfidf_path),
        ("BERT", run_bert_path),
        ("Hybrid", run_hybrid_path),
    )
}

# One row per model; key order defines the column order of the printed table.
results_table = []
for model_name, run in runs.items():
    evaluation = TrecEval(run, qrel)
    model_results = {
        "Model": model_name,
        "MAP": evaluation.get_map(),
        "MRR": evaluation.get_reciprocal_rank(),
        "P@10": evaluation.get_precision(10),
        # Depth 1000 is requested, but the runs above are written with
        # top_k=10, so at most 10 documents per query contribute.
        "Recall": evaluation.get_recall(1000),
    }
    results_table.append(model_results)

print("\n📊 Evaluation Results:")
print(tabulate(results_table, headers="keys", tablefmt="fancy_grid", floatfmt=".4f"))

# Remove the temporary qrel and run files once evaluation is done (optional).
for path in (qrel_path, run_tfidf_path, run_bert_path, run_hybrid_path):
    try:
        os.remove(path)
    except OSError as e:
        print(f"⚠️ فشل حذف {path}: {e}")

  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs,


📊 Evaluation Results:
╒═════════╤════════╤════════╤════════╤══════════╕
│ Model   │    MAP │    MRR │   P@10 │   Recall │
╞═════════╪════════╪════════╪════════╪══════════╡
│ TFIDF   │ 0.1818 │ 0.2007 │ 0.0393 │   0.3020 │
├─────────┼────────┼────────┼────────┼──────────┤
│ BERT    │ 0.8264 │ 0.8578 │ 0.1327 │   0.9443 │
├─────────┼────────┼────────┼────────┼──────────┤
│ Hybrid  │ 0.6958 │ 0.7315 │ 0.1207 │   0.8810 │
╘═════════╧════════╧════════╧════════╧══════════╛


In [None]:
# =============================================
# 📦 المكتبات المطلوبة
# =============================================
import ir_datasets
import numpy as np
import os
import joblib
import json
import re
import html
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from trectools import TrecQrel, TrecRun, TrecEval 
from tabulate import tabulate
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from joblib import Memory
import sys
sys.path.append("..")
from services.vector_store import VectorStore  
from services.documents_service import preprocess
# =============================================
# ⚙️ Cache setup
# =============================================
# joblib on-disk cache rooted at ./cache; it backs the @memory.cache search
# wrappers defined further down.
memory = Memory(location='./cache', verbose=0)

# =============================================
# ⚙️ Load query texts from a JSON Lines file (one JSON object per line)
# =============================================

queries_path = r"C:\Users\Mohammad Mihdi\.ir_datasets\beir\quora\queries.json"  # adjust this path to your own queries file

# Map query id ("_id") -> query text ("text"). A line that cannot be parsed
# or lacks either field is reported and skipped.
queries = {}
with open(queries_path, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            obj = json.loads(line)
            qid, text = obj["_id"], obj["text"]
            queries[qid] = text
        except Exception as e:
            print(f"⚠️ خطأ في قراءة السطر: {e}")

# =============================================
# ⚙️ Load relevance judgments (qrels) for beir/quora/test
# =============================================

dataset = ir_datasets.load("beir/quora/test")

# Map query id -> set of doc ids whose judgment has relevance > 0.
qrels = {}
for judgment in dataset.qrels_iter():
    if not judgment.relevance > 0:
        continue
    if judgment.query_id not in qrels:
        qrels[judgment.query_id] = set()
    qrels[judgment.query_id].add(judgment.doc_id)


# =============================================
# 🧼 Text-cleaning helpers
# =============================================
# English stopword set and Snowball stemmer used by advanced_preprocess below.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

def advanced_preprocess(text):
    """Normalise raw text into a space-joined string of stemmed tokens.

    Steps, in order: unescape HTML entities, drop non-printable characters,
    lowercase, remove URLs, e-mail-like tokens and ``<...>`` tags, replace
    everything except ``a-z`` and whitespace with a space, collapse runs of
    three or more identical characters to one, and squeeze whitespace.
    Tokens that are stopwords or shorter than three characters are dropped;
    the rest are stemmed with the module-level ``stemmer``.

    Note: non-printable characters (which include ``\\n`` and ``\\t``) are
    removed without inserting a space, so words separated only by such a
    character are joined.
    """
    cleaned = html.unescape(text)
    cleaned = ''.join(filter(str.isprintable, cleaned)).lower()

    # (pattern, replacement) pairs applied sequentially; order matters.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),
        (r'\S+@\S+', ''),
        (r'<.*?>', ''),
        (r'[^a-z\s]', ' '),
        (r'(.)\1{2,}', r'\1'),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    tokens = []
    for word in cleaned.strip().split():
        if len(word) > 2 and word not in stop_words:
            tokens.append(stemmer.stem(word))
    return ' '.join(tokens)

# =============================================
# Load the prebuilt index artifacts
# =============================================
# NOTE(review): joblib.load unpickles its input — only load artifacts that
# were produced by a trusted pipeline.
# TF-IDF side: the search functions below index tfidf_doc_ids by row number of
# tfidf_matrix, so the two are presumably aligned — verify against the indexer.
tfidf_doc_ids = joblib.load(r"../data/beir_quora_test/index/TFIDF/doc_ids_beir_quora_test.joblib")
tfidf_matrix = joblib.load(r"../data/beir_quora_test/index/TFIDF/tfidf_matrix_beir_quora_test.joblib")
tfidf_vectorizer = joblib.load(r"../data/beir_quora_test/index/TFIDF/tfidf_vectorizer_beir_quora_test.joblib")
# Read as inverted_index_data["inverted_index"][term] -> iterable of
# (doc_id, score) pairs by search_tfidf_with_inverted_index.
inverted_index_data = joblib.load(r"../data/beir_quora_test/index/TFIDF/tfidf_inverted_index.joblib")

# BERT side: bert_embeddings is loaded here but the searches in this cell go
# through vector_store; bert_doc_ids is used by search_hybrid to map vector
# store hits back to positions.
bert_embeddings = np.load(r"../data/beir_quora_test/index/bert/bert_embeddings.npy")
bert_doc_ids = joblib.load(r"../data/beir_quora_test/index/bert/doc_ids.joblib")
# Project-local vector index queried via vector_store.search(embedding, top_k=...).
vector_store_path = r"../data/beir_quora_test/index/vector_store"
vector_store = VectorStore.load(vector_store_path)

# Sentence encoder used for query embeddings at search time.
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Map document id (as a string) -> raw document text, read from a JSON Lines
# dump (one JSON object per line with "id" and "text" fields).
docs_dict = {}
with open(r"../data/beir_quora_test/raw/raw_beir_quora_test.json", "r", encoding="utf-8") as f:
    for line in f:
        try:
            j = json.loads(line)
            docs_dict[str(j["id"])] = j["text"]
        except (ValueError, KeyError, TypeError):
            # Best-effort load: skip lines that are not valid JSON
            # (json.JSONDecodeError is a ValueError), lack "id"/"text", or are
            # not JSON objects. Unlike a bare ``except:``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            continue

# =============================================
# ✳️ Restrict qrels and queries to the documents that are available locally
# =============================================
available_doc_ids = set(docs_dict)

# Keep only judged documents present in docs_dict, and drop any query that is
# left without a relevant document.
filtered_qrels = {}
for qid, docids in qrels.items():
    present = docids & available_doc_ids
    if present:
        filtered_qrels[qid] = present

# Keep only the queries that still have judgments and whose text was loaded.
filtered_queries = {qid: queries[qid] for qid in filtered_qrels if qid in queries}

qrels = filtered_qrels
queries = filtered_queries

# Evaluate on at most the first 10000 filtered queries (adjust as needed).
sample_queries = dict(list(queries.items())[:10000])


# =============================================
# 🔍 دوال البحث الأصلية
# =============================================
# doc_id -> row-index mappings, keyed by id() of the doc_ids sequence. Each
# entry also stores the sequence itself (so its id() cannot be recycled) and
# its length at build time (so a resized list triggers a rebuild).
_DOC_INDEX_CACHE = {}


def _doc_id_to_index(doc_ids):
    """Return a doc_id -> row-index dict for ``doc_ids``, built once per sequence."""
    entry = _DOC_INDEX_CACHE.get(id(doc_ids))
    if entry is None or entry[0] is not doc_ids or entry[1] != len(doc_ids):
        mapping = {doc_id: idx for idx, doc_id in enumerate(doc_ids)}
        entry = (doc_ids, len(doc_ids), mapping)
        _DOC_INDEX_CACHE[id(doc_ids)] = entry
    return entry[2]


def search_tfidf_with_inverted_index(query, inverted_index_data, tfidf_vectorizer, tfidf_matrix, doc_ids, docs_dict, top_k=10, candidate_size=100):
    """Retrieve documents for ``query`` in two stages.

    Stage 1 sums, per document, the posting scores of every cleaned query term
    found in ``inverted_index_data["inverted_index"]`` and keeps the
    ``candidate_size`` highest-scoring documents. Stage 2 re-ranks those
    candidates by cosine similarity between the query's TF-IDF vector and
    their rows of ``tfidf_matrix``.

    Args:
        query: Raw query text; cleaned with ``advanced_preprocess``.
        inverted_index_data: Mapping whose ``"inverted_index"`` entry maps a
            term to an iterable of ``(doc_id, score)`` postings.
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``.
        tfidf_matrix: Document-term matrix, row-indexable by a list of ints.
        doc_ids: Sequence of document ids; position ``i`` is used as the row
            index into ``tfidf_matrix``.
        docs_dict: Mapping doc_id -> text (missing ids yield ``""``).
        top_k: Maximum number of results returned.
        candidate_size: Number of stage-1 candidates that are re-ranked.

    Returns:
        A list of ``(doc_id, doc_text, score)`` tuples, best first. Empty when
        the cleaned query has no terms or no candidate maps to a row.
    """
    cleaned_query = advanced_preprocess(query)
    query_terms = cleaned_query.split()
    if not query_terms:
        return []

    # Stage 1: accumulate posting scores per document.
    inverted_index = inverted_index_data["inverted_index"]
    doc_scores = {}
    for term in query_terms:
        for doc_id, score in inverted_index.get(term, ()):
            doc_scores[doc_id] = doc_scores.get(doc_id, 0) + score

    candidate_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:candidate_size]

    # The mapping covers the whole collection, so it is cached instead of
    # being rebuilt (O(len(doc_ids))) on every query.
    doc_id_to_index = _doc_id_to_index(doc_ids)
    candidate_indices = [doc_id_to_index[doc_id] for doc_id, _ in candidate_docs if doc_id in doc_id_to_index]

    if not candidate_indices:
        return []

    # Stage 2: cosine re-ranking restricted to the candidate rows.
    candidate_tfidf_matrix = tfidf_matrix[candidate_indices]
    query_vector = tfidf_vectorizer.transform([cleaned_query])
    cosine_scores = cosine_similarity(query_vector, candidate_tfidf_matrix).flatten()
    top_indices = cosine_scores.argsort()[::-1][:top_k]

    results = []
    for idx in top_indices:
        doc_id = doc_ids[candidate_indices[idx]]
        results.append((doc_id, docs_dict.get(doc_id, ""), cosine_scores[idx]))
    return results

def search_bert(query, top_k=10):
    """Return the ``top_k`` nearest documents from the vector store.

    The query is encoded with ``bert_model`` using normalised embeddings, cast
    to float32, and handed to ``vector_store.search``; that call's result is
    returned unchanged.
    NOTE(review): callers unpack each hit as ``(doc_id, text, score)`` —
    confirm ``VectorStore.search`` returns that shape.
    """
    query_vec = bert_model.encode([query], normalize_embeddings=True)
    query_vec = query_vec.astype(np.float32)
    return vector_store.search(query_vec, top_k=top_k)

# doc_id -> position mappings for the hybrid search, keyed by id() of the id
# sequence. Each entry keeps the sequence (so its id() cannot be recycled) and
# its length at build time (so a resized list triggers a rebuild).
_HYBRID_DOC_INDEX_CACHE = {}


def _hybrid_doc_index(doc_ids):
    """Return a doc_id -> position dict for ``doc_ids``, built once per sequence."""
    entry = _HYBRID_DOC_INDEX_CACHE.get(id(doc_ids))
    if entry is None or entry[0] is not doc_ids or entry[1] != len(doc_ids):
        mapping = {doc_id: idx for idx, doc_id in enumerate(doc_ids)}
        entry = (doc_ids, len(doc_ids), mapping)
        _HYBRID_DOC_INDEX_CACHE[id(doc_ids)] = entry
    return entry[2]


def search_hybrid(query, tfidf_weight=0.5, bert_weight=0.5, top_k=10):
    """Rank documents by a weighted sum of TF-IDF and vector-store scores.

    TF-IDF cosine scores are computed for the whole collection. BERT scores
    come from the top ``top_k * 20`` vector-store hits only; every other
    document keeps a BERT score of 0. Result ids are taken from
    ``tfidf_doc_ids``.

    Args:
        query: Raw query text.
        tfidf_weight: Weight of the TF-IDF cosine score.
        bert_weight: Weight of the vector-store score.
        top_k: Maximum number of results returned.

    Returns:
        Up to ``top_k`` ``(doc_id, doc_text, combined_score)`` tuples, best
        first.

    Raises:
        ValueError: If ``tfidf_doc_ids`` and ``bert_doc_ids`` differ, since
            scores are combined position by position.
    """
    # Fail fast: check alignment before paying for encoding and vector search.
    if tfidf_doc_ids != bert_doc_ids:
        raise ValueError("قوائم doc_ids غير متطابقة بين النموذجين!")

    tfidf_scores = cosine_similarity(tfidf_vectorizer.transform([advanced_preprocess(query)]), tfidf_matrix).flatten()
    query_embedding = bert_model.encode([query], normalize_embeddings=True).astype(np.float32)

    # Scatter the vector-store hits into a dense score vector; documents
    # outside the hit list stay at 0.
    bert_scores = np.zeros_like(tfidf_scores)
    top_bert_results = vector_store.search(query_embedding, top_k=top_k * 20)

    # The mapping covers the whole collection, so it is cached instead of
    # being rebuilt on every query.
    doc_id_to_index = _hybrid_doc_index(bert_doc_ids)
    for doc_id, _, score in top_bert_results:
        idx = doc_id_to_index.get(doc_id)
        if idx is not None:
            bert_scores[idx] = score

    combined_scores = tfidf_weight * tfidf_scores + bert_weight * bert_scores
    top_indices = np.argsort(combined_scores)[::-1][:top_k]
    results = [(tfidf_doc_ids[i], docs_dict.get(tfidf_doc_ids[i], ""), combined_scores[i]) for i in top_indices]
    return results

# =============================================
# 🧠 تغليف بالكاش
# =============================================
@memory.cache
def cached_search_tfidf(query, top_k=10, candidate_size=100):
    """Disk-cached TF-IDF search bound to the module-level index objects.

    NOTE(review): only ``query``, ``top_k`` and ``candidate_size`` are passed
    to the cached function; the index objects come from module scope, so
    cached results may outlive a rebuilt index — confirm ./cache is cleared
    when the index files change.
    """
    return search_tfidf_with_inverted_index(
        query,
        inverted_index_data,
        tfidf_vectorizer,
        tfidf_matrix,
        tfidf_doc_ids,
        docs_dict,
        top_k=top_k,
        candidate_size=candidate_size,
    )

@memory.cache
def cached_search_bert(query, top_k=10):
    """Disk-cached wrapper around ``search_bert``.

    NOTE(review): only ``query`` and ``top_k`` are passed to the cached
    function; cached results may outlive a change to ``search_bert`` or the
    vector store — confirm ./cache is cleared when those change.
    """
    return search_bert(query, top_k=top_k)

@memory.cache
def cached_search_hybrid(query, tfidf_weight=0.5, bert_weight=0.5, top_k=10):
    """Disk-cached wrapper around ``search_hybrid``.

    NOTE(review): only the call arguments are passed to the cached function;
    cached results may outlive a change to ``search_hybrid`` or the indexes —
    confirm ./cache is cleared when those change.
    """
    return search_hybrid(
        query,
        tfidf_weight=tfidf_weight,
        bert_weight=bert_weight,
        top_k=top_k,
    )

# =============================================
# 📁 دوال كتابة run و qrel
# =============================================
def write_qrel_file(qrels, filepath):
    """Write relevance judgments to ``filepath`` in TREC qrels format.

    Args:
        qrels: Mapping query id -> iterable of relevant doc ids.
        filepath: Destination path; the file is overwritten.

    Every (query, document) pair becomes one line ``<qid> 0 <docid> 1``.
    """
    with open(filepath, "w") as out:
        out.writelines(
            f"{qid} 0 {docid} 1\n"
            for qid, docids in qrels.items()
            for docid in docids
        )


def write_run_file_threaded(search_fn, queries, run_name, filepath, top_k=10, max_workers=8):
    """Run ``search_fn`` for every query on a thread pool and write a TREC run file.

    Args:
        search_fn: Callable invoked as ``search_fn(query_text, top_k=top_k)``
            and returning an iterable of ``(doc_id, text, score)`` tuples.
        queries: Mapping query id -> query text.
        run_name: Tag written in the last column and shown on the progress bar.
        filepath: Destination path; the file is overwritten.
        top_k: Forwarded to ``search_fn``.
        max_workers: Thread-pool size.

    Lines have the form ``<qid> Q0 <doc_id> <rank> <score> <run_name>`` and
    are written in the order queries finish, not in input order. A query
    whose search or write raises is reported on stdout and skipped.
    """
    with open(filepath, "w") as run_file, ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Submit everything up front, remembering which query each future serves.
        pending = {}
        for qid, query in queries.items():
            pending[pool.submit(search_fn, query, top_k=top_k)] = qid

        progress = tqdm(as_completed(pending), total=len(pending), desc=f"Running {run_name}")
        for finished in progress:
            qid = pending[finished]
            try:
                hits = finished.result()
                for rank, (doc_id, _, score) in enumerate(hits, start=1):
                    run_file.write(f"{qid} Q0 {doc_id} {rank} {score} {run_name}\n")
            except Exception as e:
                print(f"⚠️ Error in query {qid}: {e}")

# =============================================
# 📈 Evaluation: write the qrel file and one run file per system
# =============================================
qrel_path = "filtered_beir.qrel"
run_tfidf_path = "run_tfidf.txt"
run_bert_path = "run_bert.txt"
run_hybrid_path = "run_hybrid.txt"

write_qrel_file(qrels, qrel_path)

# (search function, run tag, output path) per system. The hybrid run weights
# TF-IDF at 0.4 and BERT at 0.6.
run_specs = (
    (cached_search_tfidf, "TFIDF", run_tfidf_path),
    (cached_search_bert, "BERT", run_bert_path),
    (lambda q, top_k=10: cached_search_hybrid(q, tfidf_weight=0.4, bert_weight=0.6, top_k=top_k), "Hybrid", run_hybrid_path),
)
for search_fn, run_name, run_path in run_specs:
    write_run_file_threaded(search_fn, sample_queries, run_name, run_path, top_k=10)

# Score every run against the filtered qrels with trectools.
qrel = TrecQrel(qrel_path)
runs = {
    name: TrecRun(run_file)
    for name, run_file in (
        ("TFIDF", run_tfidf_path),
        ("BERT", run_bert_path),
        ("Hybrid", run_hybrid_path),
    )
}

# One row per model; key order defines the column order of the printed table.
results_table = []
for model_name, run in runs.items():
    evaluation = TrecEval(run, qrel)
    model_results = {
        "Model": model_name,
        "MAP": evaluation.get_map(),
        "MRR": evaluation.get_reciprocal_rank(),
        "P@10": evaluation.get_precision(10),
        # Depth 1000 is requested, but the runs above are written with
        # top_k=10, so at most 10 documents per query contribute.
        "Recall": evaluation.get_recall(1000),
    }
    results_table.append(model_results)

print("\n📊 Evaluation Results with Features:")
print(tabulate(results_table, headers="keys", tablefmt="fancy_grid", floatfmt=".4f"))

# Remove the temporary qrel and run files once evaluation is done (optional).
for path in (qrel_path, run_tfidf_path, run_bert_path, run_hybrid_path):
    try:
        os.remove(path)
    except OSError as e:
        print(f"⚠️ فشل حذف {path}: {e}")

  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs, shelving=False)[0]
  return self._cached_call(args, kwargs,


📊 Evaluation Results with Features:
╒═════════╤════════╤════════╤════════╤══════════╕
│ Model   │    MAP │    MRR │   P@10 │   Recall │
╞═════════╪════════╪════════╪════════╪══════════╡
│ TFIDF   │ 0.1721 │ 0.2019 │ 0.0433 │   0.2909 │
├─────────┼────────┼────────┼────────┼──────────┤
│ BERT    │ 0.8035 │ 0.8493 │ 0.1507 │   0.9283 │
├─────────┼────────┼────────┼────────┼──────────┤
│ Hybrid  │ 0.6738 │ 0.7269 │ 0.1356 │   0.8655 │
╘═════════╧════════╧════════╧════════╧══════════╛
