In [3]:
import json
import math
import os
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter, defaultdict
import re
from typing import List, Dict, Tuple, Set, Any
from rank_bm25 import BM25Okapi

In [2]:
%pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/Cellar/jupyterlab/4.2.1/libexec/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:
# Enter the project folder exactly once. Re-running this cell after the first
# chdir used to raise FileNotFoundError because the folder was then looked up
# inside itself, so only descend when we are not already in it.
PROJECT_DIR_NAME = "CS6200InformationRetrievalProject"

# NOTE: this first line reports the directory *before* any change is made.
print("✅ Working directory set to:", os.getcwd())
if os.path.basename(os.getcwd()) != PROJECT_DIR_NAME:
    os.chdir(os.path.join(os.getcwd(), PROJECT_DIR_NAME))
print("✅ Current working directory:", os.getcwd())

✅ Working directory set to: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project
✅ Current working directory: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalProject


In [4]:
def preprocess_text(text: str) -> List[str]:
    """Lower-case ``text`` and split it on runs of whitespace.

    Punctuation is left attached to the tokens; no stemming or stop-word
    removal is performed.
    """
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

def _add_document(raw_docs, tokenized_docs, doc_id, title, content):
    """Append one document record and its token list to the two parallel output lists."""
    raw_docs.append({'doc_id': doc_id, 'title': title, 'content': content})
    tokenized_docs.append(preprocess_text(content))


def _add_inner_documents(raw_docs, tokenized_docs, container):
    """Append every entry of ``container['documents']``; do nothing when that key is absent."""
    if 'documents' not in container:
        return
    for inner_doc in container['documents']:
        _add_document(
            raw_docs,
            tokenized_docs,
            # Fall back to a 1-based running number when the entry carries no id.
            inner_doc.get('doc_id', len(raw_docs) + 1),
            inner_doc.get('title', ''),
            inner_doc.get('content', ''),
        )


def load_documents(file_path: str) -> Tuple[List[Dict[str, Any]], List[List[str]]]:
    """Read a document collection from a JSON file.

    Layouts handled for each entry of the top-level ``'documents'`` list:

    * ``document_content`` is a string holding a JSON object with its own
      ``'documents'`` list: every inner document is loaded.
    * ``document_content`` is any other string (not JSON, or JSON that is not
      an object, e.g. ``"123"`` or ``"null"``): the string itself is the
      content, with ``index`` as id and ``source`` as title.
    * ``document_content`` is already a dict with a ``'documents'`` list.

    If none of these yields a document, every dict entry of ``'documents'`` is
    read directly through its ``doc_id`` / ``title`` / ``content`` keys.

    Args:
        file_path: Path of the UTF-8 encoded JSON file.

    Returns:
        ``(raw_docs, tokenized_docs)``: a list of ``{'doc_id', 'title',
        'content'}`` dicts and, index-aligned with it, the token list
        produced by ``preprocess_text`` for each content.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    raw_docs: List[Dict[str, Any]] = []
    tokenized_docs: List[List[str]] = []

    entries = data['documents'] if isinstance(data, dict) and 'documents' in data else []

    for doc in entries:
        # Non-dict entries cannot carry a 'document_content' field.
        if not isinstance(doc, dict) or 'document_content' not in doc:
            continue

        payload = doc['document_content']
        if isinstance(payload, str):
            try:
                parsed = json.loads(payload)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                _add_inner_documents(raw_docs, tokenized_docs, parsed)
            else:
                # Plain text. This also covers strings that happen to be valid
                # non-object JSON (numbers, null, lists, quoted strings), which
                # previously crashed or were silently dropped.
                _add_document(
                    raw_docs,
                    tokenized_docs,
                    doc.get('index', len(raw_docs) + 1),
                    doc.get('source', ''),
                    payload,
                )
        elif isinstance(payload, dict):
            _add_inner_documents(raw_docs, tokenized_docs, payload)

    # If no documents were found with the above structure, read a flat list.
    if len(raw_docs) == 0:
        for doc in entries:
            if isinstance(doc, dict):
                _add_document(
                    raw_docs,
                    tokenized_docs,
                    doc.get('doc_id', len(raw_docs) + 1),
                    doc.get('title', ''),
                    doc.get('content', ''),
                )

    return raw_docs, tokenized_docs

def load_queries(file_path: str) -> List[Dict[str, Any]]:
    """Return the value stored under ``'queries'`` in a JSON file.

    Args:
        file_path: Path of the UTF-8 encoded JSON file.

    Returns:
        The ``'queries'`` entry of the parsed file, or an empty list when the
        file has no such key.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    return payload['queries'] if 'queries' in payload else []

def rank_queries(bm25: BM25Okapi, queries: List[Dict[str, Any]], raw_docs: List[Dict[str, Any]]) -> Dict[int, List[Tuple[int, float]]]:
    """Score every document against every query and rank them.

    Args:
        bm25: Index whose ``get_scores`` result is read position by position
            against ``raw_docs``.
        queries: Dicts providing ``'query_id'`` and ``'query'``.
        raw_docs: Document records providing ``'doc_id'``.

    Returns:
        ``{query_id: [(doc_id, score), ...]}`` with each list ordered by
        descending score; equal scores keep their original document order.
    """
    results = {}

    for entry in queries:
        qid = entry['query_id']
        terms = preprocess_text(entry['query'])

        # Pair each score with the id of the document at the same position.
        scored = []
        for position, value in enumerate(bm25.get_scores(terms)):
            scored.append((raw_docs[position]['doc_id'], value))

        # Stable in-place sort, highest score first.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        results[qid] = scored

    return results

def plot_rankings(rankings: Dict[int, List[Tuple[int, float]]], output_dir: str, file_name: str,
                  top_k: int = 10, heatmap_docs: int = 20):
    """Save three diagnostic plots of the BM25 scores of one document collection.

    Files written into ``output_dir`` (created if missing):

    * ``{file_name}_score_distribution.png``: score against rank for the
      first ``top_k`` documents of every query, one line per query.
    * ``{file_name}_heatmap.png``: query-by-rank matrix of the scores of the
      first ``heatmap_docs`` ranks; ranks a query does not reach stay 0.
    * ``{file_name}_avg_score_by_rank.png``: score averaged over the queries
      at each of the first ``heatmap_docs`` ranks (0 where no query has one).

    Args:
        rankings: ``{query_id: [(doc_id, score), ...]}``, best document first.
        output_dir: Directory receiving the PNG files.
        file_name: Collection name, used in file names and titles.
        top_k: Number of ranks shown in the first plot (default 10).
        heatmap_docs: Number of ranks shown in the other two plots (default 20).
    """
    os.makedirs(output_dir, exist_ok=True)
    all_query_ids = sorted(rankings.keys())

    # Plot 1: score curve of the top documents of each query.
    plt.figure(figsize=(12, 8))
    for query_id in all_query_ids:
        scores = [score for _, score in rankings[query_id][:top_k]]
        plt.plot(range(1, len(scores) + 1), scores, marker='o', label=f'Query {query_id}')

    plt.xlabel('Document Rank')
    plt.ylabel('BM25 Score')
    plt.title(f'Top {top_k} Document Scores per Query for {file_name}')
    plt.legend(loc='best')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/{file_name}_score_distribution.png")
    plt.close()

    # Plot 2: heatmap with one row per query and one column per rank.
    query_count = len(all_query_ids)
    heatmap_data = np.zeros((query_count, heatmap_docs))
    for row, query_id in enumerate(all_query_ids):
        for col, (_, score) in enumerate(rankings[query_id][:heatmap_docs]):
            heatmap_data[row, col] = score

    plt.figure(figsize=(14, 10))
    plt.imshow(heatmap_data, cmap='viridis', aspect='auto')
    plt.colorbar(label='BM25 Score')
    plt.xlabel('Document Rank')
    plt.ylabel('Query ID')
    plt.title(f'BM25 Score Heatmap for {file_name}')
    plt.yticks(range(query_count), all_query_ids)
    plt.xticks(range(heatmap_docs), range(1, heatmap_docs + 1))
    plt.tight_layout()
    plt.savefig(f"{output_dir}/{file_name}_heatmap.png")
    plt.close()

    # Plot 3: mean score per rank, using only the queries that reach that rank.
    avg_scores = []
    for rank in range(heatmap_docs):
        scores_at_rank = [
            rankings[query_id][rank][1]
            for query_id in all_query_ids
            if rank < len(rankings[query_id])
        ]
        avg_scores.append(np.mean(scores_at_rank) if scores_at_rank else 0)

    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(avg_scores) + 1), avg_scores, marker='o', linestyle='-', color='blue')
    plt.xlabel('Document Rank')
    plt.ylabel('Average BM25 Score')
    plt.title(f'Average BM25 Score by Rank Position for {file_name}')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/{file_name}_avg_score_by_rank.png")
    plt.close()

def write_rankings_to_file(rankings: Dict[int, List[Tuple[int, float]]], output_dir: str, file_name: str, raw_docs: List[Dict[str, Any]], queries: List[Dict[str, Any]]):
    """Write the full ranking of every query to a plain-text report.

    The report is saved as ``{output_dir}Results/{file_name}_rankings.txt``.
    Queries are listed in ascending id order, each followed by one line per
    ranked document and a blank line.

    Args:
        rankings: ``{query_id: [(doc_id, score), ...]}``, best document first.
        output_dir: Output location; created if missing. The report itself
            goes to the sibling path formed by appending ``Results`` to it.
        file_name: Collection name used as the file-name prefix.
        raw_docs: Document records, used to look up titles by ``'doc_id'``.
        queries: Query records, used to look up the text by ``'query_id'``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): "Results" is appended to output_dir without a separator, so
    # the report lands in e.g. "rankedoutputBM25Results/". This looks
    # unintentional, but the path is kept as-is for existing consumers.
    report_path = f"{output_dir}Results/{file_name}_rankings.txt"
    # The directory actually written to differs from output_dir and was never
    # created, so the open() below failed unless it happened to exist already.
    os.makedirs(os.path.dirname(report_path), exist_ok=True)

    # Lookup tables: doc_id -> document record, query_id -> query text.
    doc_id_to_info = {doc['doc_id']: doc for doc in raw_docs}
    query_id_to_text = {query['query_id']: query['query'] for query in queries}

    # Explicit encoding: titles and queries may contain non-ASCII characters.
    with open(report_path, 'w', encoding='utf-8') as f:
        for query_id, docs in sorted(rankings.items()):
            query_text = query_id_to_text.get(query_id, "Unknown query")
            f.write(f"Query {query_id}: {query_text}\n")
            for rank, (doc_id, score) in enumerate(docs, 1):
                title = doc_id_to_info.get(doc_id, {}).get('title', 'Unknown Title')
                f.write(f"  Rank {rank}: Document {doc_id} - Title: {title} - Score: {score:.4f}\n")
            f.write("\n")

def _save_ranking_comparison_chart(series, query_ids, ylabel, title, path):
    """Draw one bar group per query (one bar per collection) and save the figure to ``path``."""
    plt.figure(figsize=(14, 10))

    bar_width = 0.8 / len(series)
    index = np.arange(len(query_ids))
    for i, (label, values) in enumerate(series):
        plt.bar(index + i * bar_width, values, bar_width, label=label)

    plt.xlabel('Query ID')
    plt.ylabel(ylabel)
    plt.title(title)
    # Centre each tick under its group of bars.
    plt.xticks(index + bar_width * (len(series) - 1) / 2, query_ids)
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(path)
    plt.close()


def compare_rankings(all_rankings: Dict[str, Dict[int, List[Tuple[int, float]]]], output_dir: str,
                     top_k: int = 10, threshold: float = 5.0):
    """Save two grouped bar charts comparing BM25 scores across collections.

    Files written into ``output_dir`` (created if missing):

    * ``comparison_avg_scores.png``: per query, the mean score of each
      collection's first ``top_k`` documents.
    * ``comparison_high_scoring_docs.png``: per query, how many documents of
      each collection score strictly above ``threshold``.

    A query missing from a collection contributes 0 in both charts. Nothing is
    drawn when ``all_rankings`` is empty.

    Args:
        all_rankings: ``{collection_name: {query_id: [(doc_id, score), ...]}}``.
        output_dir: Directory receiving the PNG files.
        top_k: Number of top documents averaged in the first chart (default 10).
        threshold: Score cut-off used by the second chart (default 5.0).
    """
    os.makedirs(output_dir, exist_ok=True)

    # The bar width is 0.8 / number of collections, so an empty mapping would
    # otherwise end in a ZeroDivisionError.
    if not all_rankings:
        return

    # Union of the query ids seen in any collection, in ascending order.
    all_query_ids = set()
    for rankings in all_rankings.values():
        all_query_ids.update(rankings.keys())
    all_query_ids = sorted(all_query_ids)

    collections = sorted(all_rankings.items())

    # Chart 1: average score of the top documents.
    avg_series = []
    for file_name, rankings in collections:
        avg_scores = []
        for query_id in all_query_ids:
            top_docs = rankings[query_id][:top_k] if query_id in rankings else []
            avg_scores.append(np.mean([score for _, score in top_docs]) if top_docs else 0)
        avg_series.append((file_name, avg_scores))

    _save_ranking_comparison_chart(
        avg_series,
        all_query_ids,
        f'Average BM25 Score (Top {top_k} Documents)',
        'Comparison of Average BM25 Scores Across Document Collections',
        f"{output_dir}/comparison_avg_scores.png",
    )

    # Chart 2: number of documents scoring above the threshold.
    count_series = []
    for file_name, rankings in collections:
        high_scoring_docs = []
        for query_id in all_query_ids:
            if query_id in rankings:
                high_scoring_docs.append(sum(1 for _, score in rankings[query_id] if score > threshold))
            else:
                high_scoring_docs.append(0)
        count_series.append((file_name, high_scoring_docs))

    _save_ranking_comparison_chart(
        count_series,
        all_query_ids,
        f'Number of Documents with Score > {threshold}',
        f'Comparison of High-Scoring Documents Across Collections (Threshold: {threshold})',
        f"{output_dir}/comparison_high_scoring_docs.png",
    )


In [5]:
def create_relevance_judgments(queries: List[Dict[str, Any]], raw_docs: List[Dict[str, Any]]) -> Dict[int, Dict[int, float]]:
    """Build pseudo relevance judgments from TF-IDF cosine similarity.

    Each query (its text counted twice, plus its optional ``'narrative'``) and
    each document (its title counted twice, plus its content) is turned into a
    TF-IDF vector; IDF is estimated from the document contents only. The
    relevance of a document is the cosine similarity of the two vectors, plus
    0.2 when the lower-cased query text occurs inside the lower-cased title,
    clamped to ``[0, 1]``. Only documents scoring above 0.15 are kept.

    Args:
        queries: Dicts with ``'query_id'``, ``'query'`` and optionally
            ``'narrative'``.
        raw_docs: Dicts with ``'doc_id'`` and optionally ``'title'`` /
            ``'content'``.

    Returns:
        ``{query_id: {doc_id: relevance}}``; every query gets an entry, which
        is empty when no document passes the threshold.
    """
    from collections import Counter
    import math

    def compute_tf(text_tokens):
        """Compute term frequency, normalised by the number of tokens."""
        total = len(text_tokens)
        return {term: count / total for term, count in Counter(text_tokens).items()}

    def compute_idf(corpus_tokens):
        """Compute inverse document frequency as ``log(N / (1 + df))``."""
        num_docs = len(corpus_tokens)

        # Count the documents containing each term.
        doc_count = Counter()
        for doc_tokens in corpus_tokens:
            doc_count.update(set(doc_tokens))

        # NOTE(review): a term present in every document gets a negative
        # weight and one present in all but one gets 0. Left unchanged so the
        # resulting judgments stay comparable with earlier runs.
        return {term: math.log(num_docs / (1 + count)) for term, count in doc_count.items()}

    def compute_tfidf(tf_dict, idf_dict):
        """Compute TF-IDF scores; terms without an IDF entry weigh 0."""
        return {term: tf_value * idf_dict.get(term, 0) for term, tf_value in tf_dict.items()}

    def cosine_similarity(vec1, vec2):
        """Compute cosine similarity between two sparse vectors stored as dicts."""
        common_terms = set(vec1.keys()) & set(vec2.keys())
        dot_product = sum(vec1[term] * vec2[term] for term in common_terms)

        magnitude1 = math.sqrt(sum(value ** 2 for value in vec1.values()))
        magnitude2 = math.sqrt(sum(value ** 2 for value in vec2.values()))

        # Avoid division by zero.
        if magnitude1 == 0 or magnitude2 == 0:
            return 0.0

        return dot_product / (magnitude1 * magnitude2)

    relevance_judgments = {}

    # IDF is estimated from the document contents only (titles excluded).
    all_doc_tokens = [preprocess_text(doc.get('content', '').lower()) for doc in raw_docs]
    idf_dict = compute_idf(all_doc_tokens)

    # Document vectors do not depend on the query, so build them once instead
    # of once per query.
    doc_vectors = []
    for doc in raw_docs:
        title = doc.get('title', '').lower()
        content = doc.get('content', '').lower()

        # Give title terms more weight by repeating the title.
        doc_tokens = preprocess_text(title + " " + title + " " + content)
        doc_tfidf = compute_tfidf(compute_tf(doc_tokens), idf_dict)
        doc_vectors.append((doc['doc_id'], title, doc_tfidf))

    for query in queries:
        query_id = query['query_id']
        query_text = query['query'].lower()
        narrative = query.get('narrative', '').lower()

        relevance_judgments[query_id] = {}

        # Repeat the query text so its terms outweigh those of the narrative.
        combined_query = query_text + " " + query_text + " " + narrative
        query_tokens = preprocess_text(combined_query)
        query_tfidf = compute_tfidf(compute_tf(query_tokens), idf_dict)

        for doc_id, title, doc_tfidf in doc_vectors:
            similarity = cosine_similarity(query_tfidf, doc_tfidf)

            # Additional boost when the query text appears in the title.
            if query_text in title:
                similarity += 0.2

            # Clamp to the [0, 1] range.
            relevance = min(1.0, max(0.0, similarity))

            # Keep only documents that are at least somewhat relevant.
            if relevance > 0.15:
                relevance_judgments[query_id][doc_id] = relevance

    return relevance_judgments

def mean_average_precision_at_k(rankings: Dict[int, List[Tuple[int, float]]], relevance_judgments: Dict[int, Dict[int, float]], k: int = 10) -> Dict[int, float]:
    """Compute the average precision within the top ``k`` results of each query.

    For every relevant document found in the first ``k`` ranks, the precision
    at that rank is recorded; the score is the mean of these values. The mean
    is taken over the relevant documents that were retrieved, not over all
    judged-relevant documents.

    Args:
        rankings: ``{query_id: [(doc_id, score), ...]}``, best document first.
        relevance_judgments: ``{query_id: {doc_id: relevance}}``; a document is
            relevant when its id is a key.
        k: Cut-off rank.

    Returns:
        ``{query_id: average precision}``; 0.0 for queries without judgments
        or without a relevant document in the top ``k``.
    """
    scores = {}

    for query_id, ranked_docs in rankings.items():
        if query_id in relevance_judgments:
            judged = relevance_judgments[query_id]

            hits = 0
            precision_at_hits = []
            for rank, (doc_id, _) in enumerate(ranked_docs[:k], start=1):
                if doc_id in judged:
                    hits += 1
                    precision_at_hits.append(hits / rank)

            if precision_at_hits:
                scores[query_id] = sum(precision_at_hits) / len(precision_at_hits)
            else:
                scores[query_id] = 0.0
        else:
            scores[query_id] = 0.0

    return scores

def recall_at_k(rankings: Dict[int, List[Tuple[int, float]]], relevance_judgments: Dict[int, Dict[int, float]], k: int = 10) -> Dict[int, float]:
    """Compute the share of each query's relevant documents found in its top ``k``.

    Args:
        rankings: ``{query_id: [(doc_id, score), ...]}``, best document first.
        relevance_judgments: ``{query_id: {doc_id: relevance}}``; a document is
            relevant when its id is a key.
        k: Cut-off rank.

    Returns:
        ``{query_id: recall}``; 0.0 for queries without judgments or with an
        empty judgment set.
    """
    results = {}

    for query_id, ranked_docs in rankings.items():
        judged = relevance_judgments[query_id] if query_id in relevance_judgments else {}
        total_relevant = len(judged)

        if total_relevant == 0:
            results[query_id] = 0.0
        else:
            found = len([doc_id for doc_id, _ in ranked_docs[:k] if doc_id in judged])
            results[query_id] = found / total_relevant

    return results

def dcg_at_k(rankings: Dict[int, List[Tuple[int, float]]], relevance_judgments: Dict[int, Dict[int, float]], k: int = 10) -> Dict[int, float]:
    """Compute the discounted cumulative gain of the top ``k`` results per query.

    The gain of the document at 1-based position ``p`` is
    ``(2 ** rel - 1) / log2(p + 1)``, where ``rel`` is the graded relevance
    stored in the judgments (0.0 for unjudged documents).

    Args:
        rankings: ``{query_id: [(doc_id, score), ...]}``, best document first.
        relevance_judgments: ``{query_id: {doc_id: relevance}}``.
        k: Cut-off rank.

    Returns:
        ``{query_id: dcg}``; 0.0 for queries without judgments.
    """
    totals = {}

    for query_id, ranked_docs in rankings.items():
        if query_id not in relevance_judgments:
            totals[query_id] = 0.0
            continue

        grades = relevance_judgments[query_id]

        gain = 0.0
        for position, (doc_id, _) in enumerate(ranked_docs[:k], start=1):
            grade = grades.get(doc_id, 0.0)
            gain += (2 ** grade - 1) / math.log2(position + 1)

        totals[query_id] = gain

    return totals

def idcg_at_k(relevance_judgments: Dict[int, Dict[int, float]], k: int = 10) -> Dict[int, float]:
    """Compute the ideal DCG of each judged query.

    The relevance grades of a query are ordered from highest to lowest and the
    first ``k`` are accumulated as ``(2 ** rel - 1) / log2(p + 1)`` with ``p``
    the 1-based position.

    Args:
        relevance_judgments: ``{query_id: {doc_id: relevance}}``.
        k: Cut-off rank.

    Returns:
        ``{query_id: idcg}`` for every query in ``relevance_judgments``.
    """
    ideal = {}

    for query_id, grades in relevance_judgments.items():
        # Best possible ordering: highest grades first.
        best_first = list(grades.values())
        best_first.sort(reverse=True)

        total = 0.0
        for position, grade in enumerate(best_first[:k], start=1):
            total += (2 ** grade - 1) / math.log2(position + 1)

        ideal[query_id] = total

    return ideal

def ndcg_at_k(rankings: Dict[int, List[Tuple[int, float]]], 
              relevance_judgments: Dict[int, Dict[int, float]], 
              k: int = 10) -> Dict[int, float]:
    """Compute the normalised DCG of the top ``k`` results per query.

    Each query's ``dcg_at_k`` value is divided by its ``idcg_at_k`` value.

    Args:
        rankings: ``{query_id: [(doc_id, score), ...]}``, best document first.
        relevance_judgments: ``{query_id: {doc_id: relevance}}``.
        k: Cut-off rank.

    Returns:
        ``{query_id: ndcg}`` for every ranked query; 0.0 when the query has no
        ideal DCG or an ideal DCG of 0.
    """
    achieved = dcg_at_k(rankings, relevance_judgments, k)
    ideal = idcg_at_k(relevance_judgments, k)

    normalised = {}
    for query_id in rankings:
        best = ideal.get(query_id, 0.0)
        # Without a positive ideal score there is nothing to normalise by.
        normalised[query_id] = 0.0 if best == 0.0 else achieved[query_id] / best

    return normalised

def calculate_metrics(rankings: Dict[int, List[Tuple[int, float]]], relevance_judgments: Dict[int, Dict[int, float]], k: int = 10) -> Dict[str, Dict[int, float]]:
    """Evaluate the rankings with the three per-query metrics at cut-off ``k``.

    Returns:
        A dict with the keys ``'map'``, ``'recall'`` and ``'ndcg'``, each
        mapping ``query_id`` to the value of that metric.
    """
    return {
        'map': mean_average_precision_at_k(rankings, relevance_judgments, k),
        'recall': recall_at_k(rankings, relevance_judgments, k),
        'ndcg': ndcg_at_k(rankings, relevance_judgments, k),
    }

def _save_metric_bar_chart(values, query_ids, color, ylabel, title, path):
    """Draw one bar per query for a single metric and save the figure to ``path``."""
    plt.figure(figsize=(12, 8))
    plt.bar(range(len(query_ids)), values, color=color, alpha=0.7)

    plt.xlabel('Query ID')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.xticks(range(len(query_ids)), query_ids)
    plt.grid(True, linestyle='--', alpha=0.7, axis='y')
    plt.tight_layout()
    plt.savefig(path)
    plt.close()


def plot_metrics(metrics: Dict[str, Dict[int, float]], output_dir: str, file_name: str):
    """Save per-query bar charts of the evaluation metrics of one collection.

    Files written into ``output_dir`` (created if missing):
    ``{file_name}_map_at_10.png``, ``{file_name}_recall_at_10.png``,
    ``{file_name}_ndcg_at_10.png`` (one metric each) and
    ``{file_name}_metrics_comparison.png`` (the three metrics side by side).
    A query missing from a metric is drawn as 0.0.

    Args:
        metrics: Must contain ``'map'``, ``'recall'`` and ``'ndcg'``, each
            mapping ``query_id`` to a value.
        output_dir: Directory receiving the PNG files.
        file_name: Collection name, used in file names and titles.
    """
    os.makedirs(output_dir, exist_ok=True)

    all_query_ids = sorted(set().union(*[set(m.keys()) for m in metrics.values()]))

    # (metric key, bar colour, axis/legend label, chart title, file suffix)
    charts = [
        ('map', 'blue', 'MAP@10',
         f'Mean Average Precision@10 per Query for {file_name}', 'map_at_10'),
        ('recall', 'green', 'Recall@10',
         f'Recall@10 per Query for {file_name}', 'recall_at_10'),
        ('ndcg', 'purple', 'NDCG@10',
         f'Normalized Discounted Cumulative Gain@10 per Query for {file_name}', 'ndcg_at_10'),
    ]

    # Plots 1-3: one chart per metric. Values are kept for the combined chart.
    values_by_metric = {}
    for key, color, label, title, suffix in charts:
        values = [metrics[key].get(query_id, 0.0) for query_id in all_query_ids]
        values_by_metric[key] = values
        _save_metric_bar_chart(
            values, all_query_ids, color, label, title,
            f"{output_dir}/{file_name}_{suffix}.png",
        )

    # Plot 4: the three metrics side by side for every query.
    plt.figure(figsize=(14, 10))

    x = np.arange(len(all_query_ids))
    width = 0.25

    plt.bar(x - width, values_by_metric['map'], width, label='MAP@10', color='blue', alpha=0.7)
    plt.bar(x, values_by_metric['recall'], width, label='Recall@10', color='green', alpha=0.7)
    plt.bar(x + width, values_by_metric['ndcg'], width, label='NDCG@10', color='purple', alpha=0.7)

    plt.xlabel('Query ID')
    plt.ylabel('Metric Value')
    plt.title(f'Evaluation Metrics Comparison for {file_name}')
    plt.xticks(x, all_query_ids)
    plt.legend(loc='best')
    plt.grid(True, linestyle='--', alpha=0.7, axis='y')
    plt.tight_layout()
    plt.savefig(f"{output_dir}/{file_name}_metrics_comparison.png")
    plt.close()

def write_metrics_to_file(metrics: Dict[str, Dict[int, float]], output_dir: str, file_name: str):
    """Write the per-query metrics and their averages to a plain-text report.

    The report is saved as ``{output_dir}Results/{file_name}_metrics.txt`` and
    holds one table row per query id (ascending; a query missing from a
    metric shows 0.0) followed by the average of each metric.

    Args:
        metrics: Must contain ``'map'``, ``'recall'`` and ``'ndcg'``, each
            mapping ``query_id`` to a value.
        output_dir: Output location; created if missing. The report itself
            goes to the sibling path formed by appending ``Results`` to it.
        file_name: Collection name, used in the heading and the file name.
    """
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): "Results" is appended to output_dir without a separator, so
    # the report lands in e.g. "rankedoutputBM25Results/". This looks
    # unintentional, but the path is kept as-is for existing consumers.
    report_path = f"{output_dir}Results/{file_name}_metrics.txt"
    # The directory actually written to differs from output_dir and was never
    # created, so the open() below failed unless it happened to exist already.
    os.makedirs(os.path.dirname(report_path), exist_ok=True)

    def _average(values_by_query):
        """Mean of the metric over its queries, or 0.0 when there are none."""
        return sum(values_by_query.values()) / len(values_by_query) if values_by_query else 0.0

    with open(report_path, 'w', encoding='utf-8') as f:
        all_query_ids = sorted(set().union(*[set(m.keys()) for m in metrics.values()]))

        f.write(f"Evaluation Metrics for {file_name}:\n\n")

        # Per-query metrics
        f.write("Per-Query Metrics:\n")
        f.write("Query ID | MAP@10  | Recall@10 | NDCG@10\n")
        f.write("-" * 45 + "\n")

        for query_id in all_query_ids:
            map_value = metrics['map'].get(query_id, 0.0)
            recall_value = metrics['recall'].get(query_id, 0.0)
            ndcg_value = metrics['ndcg'].get(query_id, 0.0)

            f.write(f"{query_id:8} | {map_value:.4f} | {recall_value:.4f}  | {ndcg_value:.4f}\n")

        # Overall averages
        f.write("\nOverall Averages:\n")
        f.write(f"Average MAP@10:     {_average(metrics['map']):.4f}\n")
        f.write(f"Average Recall@10:  {_average(metrics['recall']):.4f}\n")
        f.write(f"Average NDCG@10:    {_average(metrics['ndcg']):.4f}\n")

def compare_metrics(all_metrics: Dict[str, Dict[str, Dict[int, float]]], output_dir: str):
    """Save bar charts comparing the evaluation metrics across collections.

    Files written into ``output_dir`` (created if missing):

    * ``comparison_{metric}_at_10.png`` for every metric type found: per
      query, one bar per collection (0.0 where a value is missing).
    * ``comparison_avg_metrics.png``: per collection, one bar per metric type
      showing the metric averaged over that collection's queries.

    Nothing is drawn when there are no collections or no metric types.

    Args:
        all_metrics: ``{collection_name: {metric_type: {query_id: value}}}``.
        output_dir: Directory receiving the PNG files.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Collect every metric type and query id seen in any collection.
    metric_types = set()
    all_query_ids = set()
    for file_metrics in all_metrics.values():
        metric_types.update(file_metrics.keys())
        for metric_values in file_metrics.values():
            all_query_ids.update(metric_values.keys())

    metric_types = sorted(metric_types)
    all_query_ids = sorted(all_query_ids)

    # Both bar widths below divide by these counts; with nothing to compare
    # the function used to end in a ZeroDivisionError.
    if not all_metrics or not metric_types:
        return

    collections = sorted(all_metrics.items())

    # One chart per metric type: queries on the x axis, one bar per collection.
    for metric_type in metric_types:
        plt.figure(figsize=(14, 10))

        bar_width = 0.8 / len(collections)
        index = np.arange(len(all_query_ids))

        for i, (file_name, file_metrics) in enumerate(collections):
            per_query = file_metrics[metric_type] if metric_type in file_metrics else {}
            metric_values = [
                per_query[query_id] if query_id in per_query else 0.0
                for query_id in all_query_ids
            ]
            plt.bar(index + i * bar_width, metric_values, bar_width, label=file_name)

        plt.xlabel('Query ID')
        plt.ylabel(f'{metric_type.upper()}@10')
        plt.title(f'Comparison of {metric_type.upper()}@10 Across Document Collections')
        # Centre each tick under its group of bars.
        plt.xticks(index + bar_width * (len(collections) - 1) / 2, all_query_ids)
        plt.legend(loc='best')
        plt.tight_layout()
        plt.savefig(f"{output_dir}/comparison_{metric_type}_at_10.png")
        plt.close()

    # Overall chart: collections on the x axis, one bar per metric type.
    plt.figure(figsize=(12, 8))

    bar_width = 0.8 / len(metric_types)
    index = np.arange(len(collections))

    for i, metric_type in enumerate(metric_types):
        avg_metrics = []
        for _, file_metrics in collections:
            per_query = file_metrics[metric_type] if metric_type in file_metrics else {}
            avg_metrics.append(sum(per_query.values()) / len(per_query) if per_query else 0.0)

        plt.bar(index + i * bar_width, avg_metrics, bar_width, label=f'{metric_type.upper()}@10')

    plt.xlabel('Document Collection')
    plt.ylabel('Average Metric Value')
    plt.title('Comparison of Average Evaluation Metrics Across Document Collections')
    plt.xticks(index + bar_width * (len(metric_types) - 1) / 2, [file_name for file_name, _ in collections])
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(f"{output_dir}/comparison_avg_metrics.png")
    plt.close()

In [12]:
# Directory paths (relative to the current working directory)
input_dir = "data"
output_dir = "rankedoutputBM25"

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Load queries
queries_file = f"{input_dir}/queries.json"
queries = load_queries(queries_file)

# Process all document files. os.listdir returns names in arbitrary order, so
# sort them to make the processing and log order reproducible across runs.
document_files = sorted(
    f for f in os.listdir(input_dir) if f.startswith("documents") and f.endswith(".json")
)

all_rankings = {}
all_metrics = {}

for doc_file in document_files:
    file_path = f"{input_dir}/{doc_file}"
    file_name = os.path.splitext(doc_file)[0]

    print(f"Processing {file_name}...")

    # Load documents
    raw_docs, tokenized_docs = load_documents(file_path)

    if len(raw_docs) == 0:
        print(f"No documents found in {file_name}, skipping.")
        continue

    # Build the BM25 index over the tokenized documents
    bm25 = BM25Okapi(tokenized_docs)

    # Rank queries
    rankings = rank_queries(bm25, queries, raw_docs)

    # Create pseudo-relevance judgments (derived from the same documents, not
    # from human labels)
    relevance_judgments = create_relevance_judgments(queries, raw_docs)

    # Calculate evaluation metrics
    metrics = calculate_metrics(rankings, relevance_judgments)

    # Store rankings and metrics for comparison
    all_rankings[file_name] = rankings
    all_metrics[file_name] = metrics

    # Generate plots for this document collection
    plot_rankings(rankings, output_dir, file_name)
    plot_metrics(metrics, output_dir, file_name)

    # Write rankings and metrics to file
    write_rankings_to_file(rankings, output_dir, file_name, raw_docs, queries)
    write_metrics_to_file(metrics, output_dir, file_name)

# Compare rankings and metrics across document collections (needs at least two)
if len(all_rankings) > 1:
    compare_rankings(all_rankings, output_dir)
    compare_metrics(all_metrics, output_dir)

print("All processing complete!")

Processing documentsOriginal...
Processing documentsBart...
Processing documentsLongformer...
Processing documentsT5...
All processing complete!
