In [1]:
import pathlib
import sys
import joblib
import os
import pandas as pd
import requests
import json
import matplotlib.pyplot as plt
import time
from datetime import datetime
from tqdm import tqdm


In [2]:
from preprocess.preprocess import PreprocessService

preprocess_service = PreprocessService()


In [3]:
from tfidf.tf_idf_offline import TfidfOffline

tfidf_offline = TfidfOffline()


files\antique
files\corpus
✅ تم تحميل Vectorizer من: files\antique\vectorizer.joblib
✅ تم تحميل TF-IDF matrix من: files\antique\tfidf_matrix.joblib
✅ تم تحميل Vectorizer من: files\corpus\vectorizer.joblib
✅ تم تحميل TF-IDF matrix من: files\corpus\tfidf_matrix.joblib
(401404, 100000)
Preloading successful


In [5]:
from embedding.embedding_offline import EmbeddingOffline

embedding_offline = EmbeddingOffline()


🔍 Loading antique embeddings from: files\antique\bert_embeddings.npy
✅ Antique embeddings loaded, shape: (401404, 384)
🔍 Loading corpus embeddings from: files\corpus\bert_embeddings.npy
✅ Corpus embeddings loaded, shape: (380815, 384)
✅ Preloading successful
🔍 Loading Chroma collections...
✅ Antique Chroma collection loaded successfully
✅ Corpus Chroma collection loaded successfully
✅ Chroma embeddings loaded successfully


In [6]:
from Ranking.ranking_offline import RankingOffline

ranking_offline = RankingOffline()


✅ Chroma embeddings loaded successfully


In [7]:
import scipy.sparse as sp
import pandas as pd

def docs_ids_search(query, dataset, top_k, method):
    try:
        # Step 1: Process the text
        processed_tokens = preprocess_service.preprocess_text(query)
        
        if not processed_tokens:
            return {"error": "Processed tokens not returned from text processing service.", "success": False}

        # Step 2: Vectorize the query for both methods
        if method == "hybrid":
            # Generate TF-IDF vector
            tfidf_vector_data = tfidf_offline.vectorize_query(processed_tokens, dataset)
            tfidf_vector = sp.csr_matrix(
                (tfidf_vector_data["data"], tfidf_vector_data["indices"], tfidf_vector_data["indptr"]),
                shape=tfidf_vector_data["shape"]
            )
            
            # Generate Embedding vector
            embedding_vector = embedding_offline.vectorize_query(processed_tokens, dataset)
            
            # Check if vectors are valid
            if tfidf_vector is None or tfidf_vector.shape[0] == 0:
                return {"error": "TF-IDF vector not returned from vectorization service.", "success": False}
            
            if embedding_vector is None or (hasattr(embedding_vector, 'size') and embedding_vector.size == 0):
                return {"error": "Embedding vector not returned from vectorization service.", "success": False}
            
            # Step 3: Calculate similarities for both methods
            tfidf_similarities = tfidf_offline.calculate_similarity(tfidf_vector, dataset)
            embedding_similarities = embedding_offline.calculate_similarity_embedding(embedding_vector, dataset)
            
            if tfidf_similarities is None:
                return {"error": "TF-IDF similarities not returned from similarity calculation service.", "success": False}
            
            if embedding_similarities is None:
                return {"error": "Embedding similarities not returned from similarity calculation service.", "success": False}
            
            # Step 4: Load document data from CSV files
            if dataset == "antique":
                doc_data = pd.read_csv("antique_clean_data.csv")
            else:
                doc_data = pd.read_csv("corpus_clean_data.csv")
            
            # Step 5: Convert similarities to dict format with doc_ids
            tfidf_dict = {}
            embedding_dict = {}
            
            for idx, (_, row) in enumerate(doc_data.iterrows()):
                doc_id = str(row["doc_id"])
                if idx < len(tfidf_similarities):
                    tfidf_dict[doc_id] = tfidf_similarities[idx]
                if idx < len(embedding_similarities):
                    embedding_dict[doc_id] = embedding_similarities[idx]
            
            # Step 6: Combine similarities using weighted average
            all_doc_ids = set(tfidf_dict.keys()).union(embedding_dict.keys())
            
            combined_scores = []
            for doc_id in all_doc_ids:
                tfidf_score = tfidf_dict.get(doc_id, 0)
                embed_score = embedding_dict.get(doc_id, 0)
                # يمكنك تعديل الأوزان هنا (حالياً متساوية)
                avg_score = (tfidf_score + embed_score) / 2
                combined_scores.append({"doc_id": doc_id, "score": avg_score})
            
            # Step 7: Sort results and take top_k
            combined_scores.sort(key=lambda x: x["score"], reverse=True)
            result_docs = combined_scores[:top_k]
            
        else:
            # Original logic for single methods
            if method == "tfidf":
                query_vector_data = tfidf_offline.vectorize_query(processed_tokens, dataset)
                query_vector = sp.csr_matrix(
                    (query_vector_data["data"], query_vector_data["indices"], query_vector_data["indptr"]),
                    shape=query_vector_data["shape"]
                )
            elif method in ("embedding", "chroma"):
                query_vector = embedding_offline.vectorize_query(processed_tokens, dataset)
            else:
                return {"error": "Invalid method.", "success": False}

            # Check if query_vector is valid
            if method == "tfidf":
                if query_vector is None or query_vector.shape[0] == 0:
                    return {"error": "Query vector not returned from vectorization service.", "success": False}
            else:
                if query_vector is None or (hasattr(query_vector, 'size') and query_vector.size == 0):
                    return {"error": "Query vector not returned from vectorization service.", "success": False}

            if method == "chroma":
                result_docs = embedding_offline.calculate_similarity_chroma(query_vector, dataset, top_k)
                if not result_docs:
                    return {"error": "Result documents not returned from chroma similarity service.", "success": False}
            else:
                if method == "tfidf":
                    similarities = tfidf_offline.calculate_similarity(query_vector, dataset)
                elif method == "embedding":
                    similarities = embedding_offline.calculate_similarity_embedding(query_vector, dataset)
                else:
                    return {"error": "Invalid method.", "success": False}

                if similarities is None:
                    return {"error": "Similarities not returned from similarity calculation service.", "success": False}

                result_response = ranking_offline.rank_documents(similarities, processed_tokens, dataset, top_k, method)
                print(result_response)
                result_docs = result_response.get('result_docs')

        if not result_docs:
            return {"error": "Result documents not returned from ranking service.", "success": False}

    except Exception as e:
        return {"error": f"Unexpected Error: {str(e)}", "success": False}
    
    return {"result_docs": result_docs, "success": True}

In [7]:
with open('antique_queries.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

queries = []
for line in lines:
    if line.strip():  
        parts = line.strip().split('\t')
        if len(parts) == 2:
            query_id = parts[0]
            query_text = parts[1]
            queries.append({'query_id': query_id, 'query_text': query_text})

# تحويل إلى DataFrame
import pandas as pd
queries_df = pd.DataFrame(queries)

# عرض النتائج
print(f"تم قراءة {len(queries_df)} استعلام")
print("\nأول 5 استعلامات:")
print(queries_df.head())

print("\nآخر 5 استعلامات:")
print(queries_df.tail())

تم قراءة 200 استعلام

أول 5 استعلامات:
  query_id                                         query_text
0  3990512          how can we get concentration onsomething?
1   714612  Why doesn't the water fall off  earth if it's ...
2  2528767  How do I determine the charge of the iron ion ...
3   821387     I have mice.How do I get rid of them humanely?
4  1880028  What does "see Leaflet" mean on Ept Pregnancy ...

آخر 5 استعلامات:
    query_id                                         query_text
195  2192891      how are braces put on and do they hurt a lot?
196  4406669                    What do you order at Taco Bell?
197  1582877  why do we go to school if in the real world we...
198  1340574  Why do some people only go to church on Easter...
199  1971899                              what is masturbat***?


In [8]:
def read_questions_from_jsonl(file_path):
    """
    قراءة الأسئلة وID الخاصة بها من ملف JSONL
    """
    questions = []
    
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:  # تجاهل الأسطر الفارغة
                try:
                    data = json.loads(line)
                    question_id = data.get('_id', '')
                    question_text = data.get('text', '')
                    
                    questions.append({
                        'query_id': question_id,
                        'query_text': question_text
                    })
                except json.JSONDecodeError as e:
                    print(f"خطأ في تحليل JSON: {e}")
                    continue
    
    return questions

def display_questions(questions):
    """
    عرض الأسئلة وID الخاصة بها
    """
    print("قائمة الأسئلة مع ID الخاصة بها:")
    print("=" * 50)
    
    for i, question in enumerate(questions, 1):
        print(f"{i:2d}. ID: {question['query_id']}")
        print(f"    السؤال: {question['query_text']}")
        print("-" * 50)

# قراءة الأسئلة من الملف
file_path = "corpus_queries.jsonl"
questions = read_questions_from_jsonl(file_path)

# عرض الأسئلة
display_questions(questions)

# إحصائيات
print(f"\nإجمالي عدد الأسئلة: {len(questions)}")

queries_df = pd.DataFrame(questions)

قائمة الأسئلة مع ID الخاصة بها:
 1. ID: 1
    السؤال: Should teachers get tenure?
--------------------------------------------------
 2. ID: 2
    السؤال: Is vaping with e-cigarettes safe?
--------------------------------------------------
 3. ID: 3
    السؤال: Should insider trading be allowed?
--------------------------------------------------
 4. ID: 4
    السؤال: Should corporal punishment be used in schools?
--------------------------------------------------
 5. ID: 5
    السؤال: Should social security be privatized?
--------------------------------------------------
 6. ID: 6
    السؤال: Is a college education worth it?
--------------------------------------------------
 7. ID: 7
    السؤال: Should felons who have completed their sentence be allowed to vote?
--------------------------------------------------
 8. ID: 8
    السؤال: Should abortion be legal?
--------------------------------------------------
 9. ID: 9
    السؤال: Should students have to wear school uniforms?
-------

In [None]:
def process_all_queries(queries_df, dataset="antique", top_k=10, method="hybrid"):
    """
    معالجة جميع الاستعلامات وحفظ النتائج مع شريط التقدم
    """
    results = {}
    total_queries = len(queries_df)
    
    print(f"�� بدء معالجة {total_queries} استعلام باستخدام {method.upper()}")
    print(f"📊 Dataset: {dataset}, Top-K: {top_k}")
    print("=" * 60)
    
    # إنشاء شريط التقدم مع tqdm
    progress_bar = tqdm(
        queries_df.iterrows(), 
        total=total_queries,
        desc=f"معالجة الاستعلامات ({method})",
        unit="استعلام",
        bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]'
    )
    
    for idx, row in progress_bar:
        query_id = row['query_id']
        query_text = row['query_text']
        
        # تحديث وصف شريط التقدم
        progress_bar.set_description(f"معالجة: {query_id}")
        progress_bar.set_postfix({
            'النص': query_text[:30] + "..." if len(query_text) > 30 else query_text
        })
        
        try:
            # استدعاء دالة البحث
            search_result = docs_ids_search(query_text, dataset, top_k, method)
            
            if search_result['success']:
                result_docs = search_result['result_docs']
                
                # حفظ النتائج
                results[query_id] = {
                    'query_text': query_text,
                    'method': method,
                    'dataset': dataset,
                    'top_k': top_k,
                    'success': True,
                    'result_docs': result_docs,
                    'timestamp': datetime.now().isoformat()
                }
            else:
                results[query_id] = {
                    'query_text': query_text,
                    'method': method,
                    'dataset': dataset,
                    'top_k': top_k,
                    'success': False,
                    'error': search_result['error'],
                    'timestamp': datetime.now().isoformat()
                }
                
        except Exception as e:
            results[query_id] = {
                'query_text': query_text,
                'method': method,
                'dataset': dataset,
                'top_k': top_k,
                'success': False,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }
    
    # إغلاق شريط التقدم
    progress_bar.close()
    
    # حساب الإحصائيات النهائية
    successful_queries = sum(1 for r in results.values() if r['success'])
    failed_queries = total_queries - successful_queries
    
    print("\n" + "=" * 60)
    print("🎉 انتهت معالجة جميع الاستعلامات!")
    print(f"📊 الإحصائيات النهائية:")
    print(f"   إجمالي الاستعلامات: {total_queries}")
    print(f"   الاستعلامات الناجحة: {successful_queries}")
    print(f"   الاستعلامات الفاشلة: {failed_queries}")
    print(f"   نسبة النجاح: {(successful_queries/total_queries)*100:.1f}%")
    
    return results, {
        'total_queries': total_queries,
        'successful_queries': successful_queries,
        'failed_queries': failed_queries,
        'success_rate': successful_queries/total_queries,
        'method': method,
        'dataset': dataset,
        'top_k': top_k,
        'timestamp': datetime.now().isoformat()
    }

def save_results(results, stats, method="tfidf", dataset="antique"):
    """
    حفظ النتائج في ملف JSON
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"search_results_{method}_{dataset}_{timestamp}.json"
    
    output_data = {
        'metadata': {
            'method': method,
            'dataset': dataset,
            'timestamp': datetime.now().isoformat(),
            'stats': stats
        },
        'results': results
    }
    
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, ensure_ascii=False, indent=2)
        print(f"💾 تم حفظ النتائج في الملف: {filename}")
        return filename
    except Exception as e:
        print(f"❌ خطأ في حفظ الملف: {e}")
        return None

# تشغيل المعالجة
print("🔍 بدء معالجة جميع الاستعلامات...")

# يمكنك تغيير المعاملات حسب الحاجة
method = "hybrid"  # أو "embedding" أو "chroma"
dataset = "antique"
top_k = 10

# معالجة جميع الاستعلامات
results, stats = process_all_queries(queries_df, dataset, top_k, method)

# حفظ النتائج
saved_file = save_results(results, stats, method, dataset)

print(f"\n✅ تم الانتهاء من جميع العمليات!")
if saved_file:
    print(f"📁 الملف المحفوظ: {saved_file}")

In [None]:
def process_all_queries(queries_df, dataset="antique", top_k=10, method="hybrid"):
    """
    معالجة جميع الاستعلامات وحفظ النتائج مع شريط التقدم
    """
    results = {}
    total_queries = len(queries_df)
    
    print(f"�� بدء معالجة {total_queries} استعلام باستخدام {method.upper()}")
    print(f"📊 Dataset: {dataset}, Top-K: {top_k}")
    print("=" * 60)
    
    # إنشاء شريط التقدم مع tqdm
    progress_bar = tqdm(
        queries_df.iterrows(), 
        total=total_queries,
        desc=f"معالجة الاستعلامات ({method})",
        unit="استعلام",
        bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]'
    )
    
    for idx, row in progress_bar:
        query_id = row['query_id']
        query_text = row['query_text']
        
        # تحديث وصف شريط التقدم
        progress_bar.set_description(f"معالجة: {query_id}")
        progress_bar.set_postfix({
            'النص': query_text[:30] + "..." if len(query_text) > 30 else query_text
        })
        
        try:
            # استدعاء دالة البحث
            search_result = docs_ids_search(query_text, dataset, top_k, method)
            
            if search_result['success']:
                result_docs = search_result['result_docs']
                
                # حفظ النتائج
                results[query_id] = {
                    'query_text': query_text,
                    'method': method,
                    'dataset': dataset,
                    'top_k': top_k,
                    'success': True,
                    'result_docs': result_docs,
                    'timestamp': datetime.now().isoformat()
                }
            else:
                results[query_id] = {
                    'query_text': query_text,
                    'method': method,
                    'dataset': dataset,
                    'top_k': top_k,
                    'success': False,
                    'error': search_result['error'],
                    'timestamp': datetime.now().isoformat()
                }
                
        except Exception as e:
            results[query_id] = {
                'query_text': query_text,
                'method': method,
                'dataset': dataset,
                'top_k': top_k,
                'success': False,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }
    
    # إغلاق شريط التقدم
    progress_bar.close()
    
    # حساب الإحصائيات النهائية
    successful_queries = sum(1 for r in results.values() if r['success'])
    failed_queries = total_queries - successful_queries
    
    print("\n" + "=" * 60)
    print("🎉 انتهت معالجة جميع الاستعلامات!")
    print(f"📊 الإحصائيات النهائية:")
    print(f"   إجمالي الاستعلامات: {total_queries}")
    print(f"   الاستعلامات الناجحة: {successful_queries}")
    print(f"   الاستعلامات الفاشلة: {failed_queries}")
    print(f"   نسبة النجاح: {(successful_queries/total_queries)*100:.1f}%")
    
    return results, {
        'total_queries': total_queries,
        'successful_queries': successful_queries,
        'failed_queries': failed_queries,
        'success_rate': successful_queries/total_queries,
        'method': method,
        'dataset': dataset,
        'top_k': top_k,
        'timestamp': datetime.now().isoformat()
    }

def save_results(results, stats, method="tfidf", dataset="antique"):
    """
    حفظ النتائج في ملف JSON
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"search_results_{method}_{dataset}_{timestamp}.json"
    
    output_data = {
        'metadata': {
            'method': method,
            'dataset': dataset,
            'timestamp': datetime.now().isoformat(),
            'stats': stats
        },
        'results': results
    }
    
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, ensure_ascii=False, indent=2)
        print(f"💾 تم حفظ النتائج في الملف: {filename}")
        return filename
    except Exception as e:
        print(f"❌ خطأ في حفظ الملف: {e}")
        return None

# تشغيل المعالجة
print("🔍 بدء معالجة جميع الاستعلامات...")

# يمكنك تغيير المعاملات حسب الحاجة
method = "hybrid"  # أو "embedding" أو "chroma"
dataset = "corpus"
top_k = 10

# معالجة جميع الاستعلامات
results, stats = process_all_queries(queries_df, dataset, top_k, method)

# حفظ النتائج
saved_file = save_results(results, stats, method, dataset)

print(f"\n✅ تم الانتهاء من جميع العمليات!")
if saved_file:
    print(f"📁 الملف المحفوظ: {saved_file}")

In [2]:

# قراءة ملف antique_qrels.tsv
def read_qrels_file(file_path):
    """
    قراءة ملف qrels (relevance judgments) وتنظيمه
    """
    qrels = {}
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split()
                if len(parts) >= 4:
                    query_id = parts[0]
                    doc_id = parts[2]
                    relevance_score = int(parts[3])
                    
                    if query_id not in qrels:
                        qrels[query_id] = {}
                    
                    qrels[query_id][doc_id] = relevance_score
    
    return qrels

# قراءة ملف qrels
qrels_path = 'antique_qrels.tsv'
qrels_data = read_qrels_file(qrels_path)

print(f"عدد الاستعلامات في ملف qrels: {len(qrels_data)}")

# عرض مثال على البيانات
if qrels_data:
    first_query = list(qrels_data.keys())[0]
    print(f"\nمثال على الاستعلام الأول ({first_query}):")
    print(f"عدد الوثائق ذات الصلة: {len(qrels_data[first_query])}")
    
    # عرض أول 5 وثائق مع درجات الصلة
    for i, (doc_id, score) in enumerate(list(qrels_data[first_query].items())[:5]):
        print(f"  {doc_id}: {score}")
    
    # إحصائيات عامة
    all_scores = []
    for query_scores in qrels_data.values():
        all_scores.extend(query_scores.values())
    
    print(f"\nإحصائيات درجات الصلة:")
    print(f"  المتوسط: {sum(all_scores) / len(all_scores):.2f}")
    print(f"  الحد الأقصى: {max(all_scores)}")
    print(f"  الحد الأدنى: {min(all_scores)}")
    print(f"  التوزيع: {dict(pd.Series(all_scores).value_counts().sort_index())}")

# تحويل البيانات إلى DataFrame للتحليل
qrels_list = []
for query_id, docs in qrels_data.items():
    for doc_id, score in docs.items():
        qrels_list.append({
            'query_id':   query_id,
            'doc_id':"antique_" + doc_id,
            'relevance_score': score
        })

qrels_df = pd.DataFrame(qrels_list)
print(f"\nDataFrame للتحليل:")
print(qrels_df.head(10))
print(f"\nأبعاد DataFrame: {qrels_df.shape}")

عدد الاستعلامات في ملف qrels: 200

مثال على الاستعلام الأول (1964316):
عدد الوثائق ذات الصلة: 33
  1964316_5: 4
  1674088_11: 1
  1218838_13: 2
  1519022_15: 2
  3059341_5: 2

إحصائيات درجات الصلة:
  المتوسط: 2.34
  الحد الأقصى: 4
  الحد الأدنى: 1
  التوزيع: {1: 1642, 2: 2417, 3: 1196, 4: 1334}

DataFrame للتحليل:
  query_id              doc_id  relevance_score
0  1964316   antique_1964316_5                4
1  1964316  antique_1674088_11                1
2  1964316  antique_1218838_13                2
3  1964316  antique_1519022_15                2
4  1964316   antique_3059341_5                2
5  1964316   antique_4126855_1                2
6  1964316   antique_2434719_9                2
7  1964316   antique_3786452_1                2
8  1964316   antique_1964316_3                4
9  1964316   antique_1964316_2                4

أبعاد DataFrame: (6589, 3)


In [5]:
def read_qrels_file(file_path):
    """
    قراءة ملف qrels (relevance judgments) وتنظيمه
    """
    qrels = {}
    
    with open(file_path, 'r', encoding='utf-8') as f:
        # تخطي السطر الأول (العناوين)
        next(f)
        
        for line in f:
            line = line.strip()
            if line:
                # استخدام tab كفاصل بدلاً من space
                parts = line.split('\t')
                if len(parts) >= 3:
                    query_id = parts[0]
                    doc_id = parts[1]
                    relevance_score = int(parts[2])
                    
                    if query_id not in qrels:
                        qrels[query_id] = {}
                    
                    qrels[query_id][doc_id] = relevance_score
    
    return qrels

# قراءة ملف qrels
qrels_path = 'corpus_qrels.tsv'
qrels_data = read_qrels_file(qrels_path)

print(f"عدد الاستعلامات في ملف qrels: {len(qrels_data)}")

# عرض مثال على البيانات
if qrels_data:
    first_query = list(qrels_data.keys())[0]
    print(f"\nمثال على الاستعلام الأول ({first_query}):")
    print(f"عدد الوثائق ذات الصلة: {len(qrels_data[first_query])}")
    
    # عرض أول 5 وثائق مع درجات الصلة
    for i, (doc_id, score) in enumerate(list(qrels_data[first_query].items())[:5]):
        print(f"  {doc_id}: {score}")
    
    # إحصائيات عامة
    all_scores = []
    for query_scores in qrels_data.values():
        all_scores.extend(query_scores.values())
    
    print(f"\nإحصائيات درجات الصلة:")
    print(f"  المتوسط: {sum(all_scores) / len(all_scores):.2f}")
    print(f"  الحد الأقصى: {max(all_scores)}")
    print(f"  الحد الأدنى: {min(all_scores)}")
    print(f"  التوزيع: {dict(pd.Series(all_scores).value_counts().sort_index())}")

# تحويل البيانات إلى DataFrame للتحليل
qrels_list = []
for query_id, docs in qrels_data.items():
    for doc_id, score in docs.items():
        qrels_list.append({
            'query_id': query_id,
            'doc_id': "corpus_" + doc_id,  # إزالة "antique_" لأن هذا ملف corpus
            'relevance_score': score
        })

qrels_df = pd.DataFrame(qrels_list)
print(f"\nDataFrame للتحليل:")
print(qrels_df.head(10))
print(f"\nأبعاد DataFrame: {qrels_df.shape}")

عدد الاستعلامات في ملف qrels: 49

مثال على الاستعلام الأول (1):
عدد الوثائق ذات الصلة: 53
  197beaca-2019-04-18T11:28:59Z-00001-000: 4
  1a76ed9f-2019-04-18T16:07:27Z-00001-000: 5
  1a76ed9f-2019-04-18T16:07:27Z-00002-000: 3
  1a76ed9f-2019-04-18T16:07:27Z-00005-000: 4
  1b03f390-2019-04-18T18:42:36Z-00003-000: 3

إحصائيات درجات الصلة:
  المتوسط: 2.49
  الحد الأقصى: 5
  الحد الأدنى: -2
  التوزيع: {-2: 549, 1: 186, 2: 195, 3: 628, 4: 1006, 5: 398}

DataFrame للتحليل:
  query_id                                          doc_id  relevance_score
0        1  corpus_197beaca-2019-04-18T11:28:59Z-00001-000                4
1        1  corpus_1a76ed9f-2019-04-18T16:07:27Z-00001-000                5
2        1  corpus_1a76ed9f-2019-04-18T16:07:27Z-00002-000                3
3        1  corpus_1a76ed9f-2019-04-18T16:07:27Z-00005-000                4
4        1  corpus_1b03f390-2019-04-18T18:42:36Z-00003-000                3
5        1  corpus_1b03f390-2019-04-18T18:42:36Z-00004-000               

In [3]:
class EvaluationMetrics:
    def __init__(self, true_data, predictions):
      
      try:
        print("true_data", true_data)
        print("predictions", predictions)
        # تحويل البيانات إلى نصوص مع معالجة الأخطاء
        self.true_data = {}
        for k, v in true_data.items():
            try:
                key = str(k) if k is not None else ""
                values = [str(item) if item is not None else "" for item in v]
                self.true_data[key] = values
            except Exception as e:
                print(f"خطأ في معالجة true_data للمفتاح {k}: {e}")
                self.true_data[str(k)] = []
        
        self.predictions = {}
        for k, v in predictions.items():
            try:
                key = str(k) if k is not None else ""
                values = [str(item) if item is not None else "" for item in v]
                self.predictions[key] = values
            except Exception as e:
                print(f"خطأ في معالجة predictions للمفتاح {k}: {e}")
                self.predictions[str(k)] = []
                
      except Exception as e:
          print(f"خطأ في تهيئة EvaluationMetrics: {e}")
          self.true_data = {}
          self.predictions = {}
    
    def calculate_recall(self, true_pids, pred_indices):
        true_set = set(true_pids)
        pred_set = set(pred_indices)
        if len(true_set) == 0:
            return 0
        return len(true_set & pred_set) / len(true_set)

    def calculate_precision_at_k(self, true_pids, pred_indices, k):
        true_set = set(true_pids)
        pred_set = set(pred_indices[:k])
        if len(pred_set) == 0:
            return 0
        return len(true_set & pred_set) / k

    def average_precision(self, true_pids, pred_indices):
        relevant = 0
        sum_precisions = 0
        for i, pred in enumerate(pred_indices, 1):
            if pred in true_pids:
                relevant += 1
                sum_precisions += relevant / (i)
        if relevant == 0:
            return 0
        return sum_precisions / relevant

    def mean_reciprocal_rank(self, true_pids, pred_indices):
        for rank, pid in enumerate(pred_indices, start=1):
            if pid in true_pids:
                return 1 / rank
        return 0

    def calculate_metrics(self):
        recalls = []
        precisions_k = []
        aps = []
        mrrs = []

        for query_id, true_ids in self.true_data.items():
            pred_ids = self.predictions.get(query_id, [])
            recalls.append(self.calculate_recall(true_ids, pred_ids))
            precisions_k.append(self.calculate_precision_at_k(true_ids, pred_ids, 10))
            aps.append(self.average_precision(true_ids, pred_ids))
            mrrs.append(self.mean_reciprocal_rank(true_ids, pred_ids))

        mean_recall = sum(recalls) / len(recalls)
        mean_precision_at_k = sum(precisions_k) / len(precisions_k)
        mean_ap = sum(aps) / len(aps)
        mean_mrr = sum(mrrs) / len(mrrs)

        print(f"Mean Recall: {mean_recall:.4f}")
        print(f"Precision@10: {mean_precision_at_k:.4f}")
        print(f"Mean Average Precision: {mean_ap:.4f}")
        print(f"Mean Reciprocal Rank: {mean_mrr:.4f}")

        return mean_recall, mean_precision_at_k, mean_ap, mean_mrr

In [4]:
with open("search_results_hybrid_antique_20250711_230539.json", 'r', encoding='utf-8') as f:
    data = json.load(f)
print("Evaluating lifestyle metrics...")
# تحويل qrels_df إلى dictionary
true_data = {}
for _, row in qrels_df.iterrows():
    query_id = str(row['query_id'])
    doc_id = str(row['doc_id'])
    if query_id not in true_data:
        true_data[query_id] = []
    true_data[query_id].append(doc_id)

# استخراج predictions من data
predictions = {}
for query_id, result in data['results'].items():
    if result['success']:
        # استخراج doc_ids فقط من result_docs
        pred_docs = [doc['doc_id'] for doc in result['result_docs']]
        predictions[query_id] = pred_docs

# الآن استخدام البيانات المحولة
evaluation_hybrid = EvaluationMetrics(true_data, predictions)

mean_recall, mean_precision_at_k, mean_ap, mean_mrr = evaluation_hybrid.calculate_metrics()



Evaluating lifestyle metrics...
true_data {'1964316': ['antique_1964316_5', 'antique_1674088_11', 'antique_1218838_13', 'antique_1519022_15', 'antique_3059341_5', 'antique_4126855_1', 'antique_2434719_9', 'antique_3786452_1', 'antique_1964316_3', 'antique_1964316_2', 'antique_767911_0', 'antique_1964316_0', 'antique_1964316_1', 'antique_1964316_4', 'antique_1248144_1', 'antique_2768257_0', 'antique_1519022_3', 'antique_2245059_0', 'antique_1013722_5', 'antique_650233_14', 'antique_2305171_0', 'antique_3435824_3', 'antique_636973_2', 'antique_1724160_7', 'antique_3592532_6', 'antique_636973_1', 'antique_1148987_10', 'antique_2787567_1', 'antique_647686_0', 'antique_369616_4', 'antique_1759521_19', 'antique_2929011_0', 'antique_1810312_4'], '2418598': ['antique_2418598_0', 'antique_2418598_9', 'antique_537603_10', 'antique_2418598_8', 'antique_2418598_1', 'antique_884445_5', 'antique_2418598_3', 'antique_2418598_2', 'antique_2418598_5', 'antique_2418598_4', 'antique_2418598_7', 'antique_

In [6]:
with open("search_results_hybrid_corpus_20250712_000123.json", 'r', encoding='utf-8') as f:
    data = json.load(f)
print("Evaluating lifestyle metrics...")

# تحويل qrels_df إلى dictionary
true_data = {}
for _, row in qrels_df.iterrows():
    query_id = str(row['query_id'])
    doc_id = str(row['doc_id'])
    if query_id not in true_data:
        true_data[query_id] = []
    true_data[query_id].append(doc_id)

# استخراج predictions من data
predictions = {}
for query_id, result in data['results'].items():
    if result['success']:
        # استخراج doc_ids فقط من result_docs
        pred_docs = [doc['doc_id'] for doc in result['result_docs']]
        predictions[query_id] = pred_docs

# الآن استخدام البيانات المحولة
evaluation_hybrid = EvaluationMetrics(true_data, predictions)

mean_recall, mean_precision_at_k, mean_ap, mean_mrr = evaluation_hybrid.calculate_metrics()



Evaluating lifestyle metrics...
true_data {'1': ['corpus_197beaca-2019-04-18T11:28:59Z-00001-000', 'corpus_1a76ed9f-2019-04-18T16:07:27Z-00001-000', 'corpus_1a76ed9f-2019-04-18T16:07:27Z-00002-000', 'corpus_1a76ed9f-2019-04-18T16:07:27Z-00005-000', 'corpus_1b03f390-2019-04-18T18:42:36Z-00003-000', 'corpus_1b03f390-2019-04-18T18:42:36Z-00004-000', 'corpus_1b03f390-2019-04-18T18:42:36Z-00007-000', 'corpus_1b03f390-2019-04-18T18:42:36Z-00009-000', 'corpus_2345fbf4-2019-04-18T14:09:46Z-00004-000', 'corpus_24e47090-2019-04-18T19:22:46Z-00003-000', 'corpus_24e47090-2019-04-18T19:22:46Z-00004-000', 'corpus_302aacc2-2019-04-18T15:07:51Z-00000-000', 'corpus_302aacc2-2019-04-18T15:07:51Z-00003-000', 'corpus_302aacc2-2019-04-18T15:07:51Z-00007-000', 'corpus_39b69e06-2019-04-18T19:45:40Z-00002-000', 'corpus_42f1857a-2019-04-18T12:55:34Z-00001-000', 'corpus_430e61ef-2019-04-18T11:40:00Z-00001-000', 'corpus_430e620e-2019-04-18T11:38:45Z-00001-000', 'corpus_46d2aa82-2019-04-18T18:12:11Z-00001-000', '