In [18]:
import sys

# Make the project's src/ modules importable. The membership check keeps the
# cell idempotent: re-running it does not pile duplicate entries onto sys.path.
SRC_DIR = '/content/cisi-ir/src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)


In [19]:
from parse_cisi import parse_cisi_docs, parse_cisi_queries, parse_cisi_qrels
from preprocess import preprocess
from index_tfidf import build_tfidf_matrix, tfidf_rank
from index_bm25 import BM25, bm25_rank
from evaluate import evaluate_runs


In [20]:
# Parse dataset
# All three CISI files live in the same directory; keeping that directory in
# one constant means the notebook can be pointed elsewhere by editing one line.
DATA_DIR = '/content/cisi-ir/data'

DOCS = parse_cisi_docs(f'{DATA_DIR}/CISI.ALL')
QUERIES = parse_cisi_queries(f'{DATA_DIR}/CISI.QRY')
QRELS = parse_cisi_qrels(f'{DATA_DIR}/CISI.REL')

print("Parsed:", len(DOCS), "documents,", len(QUERIES), "queries,", len(QRELS), "judged queries.")

# Preprocess
# The sorted id lists fix a stable order. DOC_TOKENS[i] holds the preprocessed
# 'text' field of document DOC_IDS[i], and Q_TOKENS[i] holds the preprocessed
# query Q_IDS[i]; later cells rely on this positional pairing.
DOC_IDS = sorted(DOCS.keys())
DOC_TOKENS = [preprocess(DOCS[doc_id]['text']) for doc_id in DOC_IDS]

Q_IDS = sorted(QUERIES.keys())
Q_TOKENS = [preprocess(QUERIES[qid]) for qid in Q_IDS]

print("Preprocessed:", len(DOC_TOKENS), "documents and", len(Q_TOKENS), "queries.")


Parsed: 1459 documents, 111 queries, 76 judged queries.
Preprocessed: 1459 documents and 111 queries.


In [21]:
from index_tfidf import build_tfidf_matrix, tfidf_rank

# Build TF-IDF matrix from preprocessed documents
vectorizer, TFIDF_MATRIX = build_tfidf_matrix(DOC_TOKENS)

# Sanity check: one matrix row per document id. tfidf_rank is later given both
# TFIDF_MATRIX and DOC_IDS, so (presumably) it maps row positions back to ids;
# a size mismatch would then mislabel results rather than fail loudly.
assert TFIDF_MATRIX.shape[0] == len(DOC_IDS), "TF-IDF rows do not line up with DOC_IDS"

print("TF-IDF matrix shape:", TFIDF_MATRIX.shape)


TF-IDF matrix shape: (1459, 8318)


In [22]:
# Spot-check the TF-IDF ranking on a single query: show the raw query, its
# preprocessed tokens, and the ten best-scoring documents with a text preview.
qid = Q_IDS[1]   # Query 2
sample_tokens = Q_TOKENS[1]

print("Query:", QUERIES[qid])
print("Preprocessed:", sample_tokens)

print("\nTop 10 TF-IDF results:")
tfidf_top10 = tfidf_rank(sample_tokens, vectorizer, TFIDF_MATRIX, DOC_IDS, topk=10)
for docid, score in tfidf_top10:
    print(f"Doc {docid} | Score: {score:.4f}")
    # Only the first 150 characters of the document text are shown.
    preview = DOCS[docid]['text'][:150]
    print("Snippet:", preview)
    print("---")


Query: What is information science?  Give definitions where possible.
Preprocessed: ['information', 'science', 'give', 'definition', 'possible']

Top 10 TF-IDF results:
Doc 469 | Score: 0.4537
Snippet: The Phenomena of Interest to Information Science Discusses the various explicit and implicit definitions of information and
information science, again
---
Doc 1181 | Score: 0.2981
Snippet: The Origins of the Information Crisis:  A Contribution to the Statement 
of the Problem The different explanations of the nature of the information pr
---
Doc 445 | Score: 0.2874
Snippet: A Definition of Relevance for Information Retrieval The concept of "relevance", sometimes also called "pertinence" or 
"aboutness", is central to the 
---
Doc 599 | Score: 0.2349
Snippet: On the Evaluation of Information Science The emergence and development of information science within its wider 
disciplinary framework is interpreted.
---
Doc 1142 | Score: 0.2316
Snippet: Science on science - Introduction to a gener

In [23]:
def run_all_queries_tfidf(topk=100):
    """Rank documents for every query with the TF-IDF model.

    Args:
        topk: Forwarded to ``tfidf_rank`` as its ``topk`` argument.

    Returns:
        dict keyed by query id (in ``Q_IDS`` order); each value is whatever
        ``tfidf_rank`` returns for the tokens at the same position in
        ``Q_TOKENS``.
    """
    return {
        query_id: tfidf_rank(Q_TOKENS[pos], vectorizer, TFIDF_MATRIX, DOC_IDS, topk=topk)
        for pos, query_id in enumerate(Q_IDS)
    }


RUNS_TFIDF = run_all_queries_tfidf(topk=100)
print("TF-IDF retrieval complete for all queries.")


TF-IDF retrieval complete for all queries.


In [24]:
from index_bm25 import BM25, bm25_rank

# Build BM25 model from preprocessed documents
# The two tuning values are named so they can be changed (and found) in one place.
BM25_K1 = 1.2    # passed as the model's k1 argument
BM25_B = 0.75    # passed as the model's b argument
BM = BM25(DOC_TOKENS, k1=BM25_K1, b=BM25_B)

# Sanity check: the model's document count must match DOC_IDS, because
# bm25_rank is later called with both BM and DOC_IDS.
assert BM.N == len(DOC_IDS), "BM25 document count does not line up with DOC_IDS"

print("BM25 model built with", BM.N, "documents.")


BM25 model built with 1459 documents.


In [25]:
# Spot-check the BM25 ranking on a single query: show the raw query, its
# preprocessed tokens, and the ten best-scoring documents with a text preview.
qid = Q_IDS[1]   # Query 2
sample_tokens = Q_TOKENS[1]

print("Query:", QUERIES[qid])
print("Preprocessed:", sample_tokens)

print("\nTop 10 BM25 results:")
bm25_top10 = bm25_rank(sample_tokens, BM, DOC_IDS, topk=10)
for docid, score in bm25_top10:
    print(f"Doc {docid} | Score: {score:.4f}")
    # Only the first 150 characters of the document text are shown.
    preview = DOCS[docid]['text'][:150]
    print("Snippet:", preview)
    print("---")


Query: What is information science?  Give definitions where possible.
Preprocessed: ['information', 'science', 'give', 'definition', 'possible']

Top 10 BM25 results:
Doc 1181 | Score: 15.0838
Snippet: The Origins of the Information Crisis:  A Contribution to the Statement 
of the Problem The different explanations of the nature of the information pr
---
Doc 469 | Score: 10.8018
Snippet: The Phenomena of Interest to Information Science Discusses the various explicit and implicit definitions of information and
information science, again
---
Doc 540 | Score: 9.4661
Snippet: Information:  Methodology This book sheds light on basic problems, principles and results of
philosophical-methodological research in information conc
---
Doc 1077 | Score: 8.8360
Snippet: Comments about Terminology in Documentation.
II:  communication and Information Developing from the definitions of the concept language a terminologic
---
Doc 1179 | Score: 8.4393
Snippet: Topical Aspects of Informatics to-date A de

In [26]:
def run_all_queries_bm25(topk=100):
    """Rank documents for every query with the BM25 model.

    Args:
        topk: Forwarded to ``bm25_rank`` as its ``topk`` argument.

    Returns:
        dict keyed by query id (in ``Q_IDS`` order); each value is whatever
        ``bm25_rank`` returns for the tokens at the same position in
        ``Q_TOKENS``.
    """
    return {
        query_id: bm25_rank(Q_TOKENS[pos], BM, DOC_IDS, topk=topk)
        for pos, query_id in enumerate(Q_IDS)
    }


RUNS_BM25 = run_all_queries_bm25(topk=100)
print("BM25 retrieval complete for all queries.")


BM25 retrieval complete for all queries.


In [27]:
from evaluate import evaluate_runs


In [28]:
# Score both runs against the relevance judgements (QRELS) with the same
# cutoff k=10, TF-IDF first and BM25 second, then print the metric dicts.
EVAL_TFIDF = evaluate_runs(RUNS_TFIDF, QRELS, k=10)
EVAL_BM25 = evaluate_runs(RUNS_BM25, QRELS, k=10)

for heading, metrics in (
    ("TF-IDF Evaluation:", EVAL_TFIDF),
    ("BM25 Evaluation:", EVAL_BM25),
):
    print(heading, metrics)


TF-IDF Evaluation: {'MAP': 0.16543082819363333, 'Precision@10': 0.3226666666666666, 'nDCG@10': 0.35218192674531174, 'MRR': 0.5762733992813037}
BM25 Evaluation: {'MAP': 0.15880527984606788, 'Precision@10': 0.3413333333333334, 'nDCG@10': 0.37738715910574583, 'MRR': 0.6186361416361416}


In [29]:
import csv

def save_runs_to_csv(runs, filename):
    """Write ranked retrieval results to a CSV file.

    Args:
        runs: Mapping of query id -> iterable of ``(doc_id, score)`` pairs,
            already in rank order.
        filename: Destination path. Missing parent directories are created
            so the call also works on a fresh checkout without ``results/``.

    The file starts with the header ``query_id,rank,doc_id,score`` followed by
    one row per (query, document) pair; ranks start at 1 within each query.
    """
    import os  # local import so this cell does not depend on another cell's imports

    parent_dir = os.path.dirname(filename)
    if parent_dir:  # a bare file name has no directory component to create
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the output identical across platforms.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['query_id', 'rank', 'doc_id', 'score'])
        for qid, ranked_docs in runs.items():
            writer.writerows(
                [qid, rank, docid, score]
                for rank, (docid, score) in enumerate(ranked_docs, 1)
            )

# Save both models: one CSV per retrieval model, TF-IDF first, then BM25.
for model_runs, csv_path in (
    (RUNS_TFIDF, '/content/cisi-ir/results/tfidf_runs.csv'),
    (RUNS_BM25, '/content/cisi-ir/results/bm25_runs.csv'),
):
    save_runs_to_csv(model_runs, csv_path)

print("Results saved to CSV files in results/ folder.")


Results saved to CSV files in results/ folder.


In [30]:
import json

# Gather the metric dicts under their model names (TF-IDF first, then BM25)
# and persist them as indented JSON next to the run files.
eval_results = {}
eval_results['TF-IDF'] = EVAL_TFIDF
eval_results['BM25'] = EVAL_BM25

serialized_metrics = json.dumps(eval_results, indent=2)
with open('/content/cisi-ir/results/evaluation.json', 'w') as out_file:
    out_file.write(serialized_metrics)

print("Evaluation metrics saved to results/evaluation.json")


Evaluation metrics saved to results/evaluation.json


In [31]:
# Side-by-side look at one query: print the five best documents from the
# stored TF-IDF run and then from the stored BM25 run.
qid = Q_IDS[1]   # Query 2
print("Query:", QUERIES[qid])
print("Preprocessed:", Q_TOKENS[1])

for heading, model_runs in (
    ("\nTop 5 TF-IDF results:", RUNS_TFIDF),
    ("\nTop 5 BM25 results:", RUNS_BM25),
):
    print(heading)
    top_five = model_runs[qid][:5]
    for docid, score in top_five:
        print(f"Doc {docid} | Score: {score:.4f}")
        # Only the first 150 characters of the document text are shown.
        print("Snippet:", DOCS[docid]['text'][:150])
        print("---")


Query: What is information science?  Give definitions where possible.
Preprocessed: ['information', 'science', 'give', 'definition', 'possible']

Top 5 TF-IDF results:
Doc 469 | Score: 0.4537
Snippet: The Phenomena of Interest to Information Science Discusses the various explicit and implicit definitions of information and
information science, again
---
Doc 1181 | Score: 0.2981
Snippet: The Origins of the Information Crisis:  A Contribution to the Statement 
of the Problem The different explanations of the nature of the information pr
---
Doc 445 | Score: 0.2874
Snippet: A Definition of Relevance for Information Retrieval The concept of "relevance", sometimes also called "pertinence" or 
"aboutness", is central to the 
---
Doc 599 | Score: 0.2349
Snippet: On the Evaluation of Information Science The emergence and development of information science within its wider 
disciplinary framework is interpreted.
---
Doc 1142 | Score: 0.2316
Snippet: Science on science - Introduction to a genera

In [39]:
pip install -r requirements.txt


In [42]:
# Initialize Git repo
!git init

# Add all files
!git add .

# Commit with a message
!git commit -m "Initial commit: CISI IR system"

# Push to GitHub
!git branch -M main
!git remote add origin https://github.com/MohitKhetan10/cisi-ir.git
!git push -u origin main


[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/cisi-ir/.git/
[master (root-commit) 04553ff] Initial commit: CISI IR system
 17 files changed, 135792 insertions(+)
 create mode 100644 README.md
 create mode 100644 data/CISI.ALL
 create mode 100644 data/CISI.QRY
 create mode 100644 data/CISI.REL
 create mode 100644 results/bm25_runs.csv
 create mode 100644 results/evaluation.json
 create mode 100644 results/tfidf_runs.csv
 create mode 100644 src/__pycache__/evaluate.cpython-312.pyc
 create mode 100644 src/__pycach

In [43]:
!git remote set-url origin https://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@github.com/MohitKhetan10/cisi-ir.git
!git push -u origin main


Enumerating objects: 23, done.
Counting objects:   4% (1/23)Counting objects:   8% (2/23)Counting objects:  13% (3/23)Counting objects:  17% (4/23)Counting objects:  21% (5/23)Counting objects:  26% (6/23)Counting objects:  30% (7/23)Counting objects:  34% (8/23)Counting objects:  39% (9/23)Counting objects:  43% (10/23)Counting objects:  47% (11/23)Counting objects:  52% (12/23)Counting objects:  56% (13/23)Counting objects:  60% (14/23)Counting objects:  65% (15/23)Counting objects:  69% (16/23)Counting objects:  73% (17/23)Counting objects:  78% (18/23)Counting objects:  82% (19/23)Counting objects:  86% (20/23)Counting objects:  91% (21/23)Counting objects:  95% (22/23)Counting objects: 100% (23/23)Counting objects: 100% (23/23), done.
Delta compression using up to 2 threads
Compressing objects: 100% (23/23), done.
Writing objects: 100% (23/23), 1.03 MiB | 3.38 MiB/s, done.
Total 23 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/MohitKheta

In [44]:
import shutil
from pathlib import Path

# Move the demo notebook into the repository's notebooks/ folder.
# The plain `mv` failed when the source file was absent; check first, and
# create the target folder so the move does not fail on a missing directory.
notebook_src = Path('/content/cisi_ir_demo.ipynb')
notebook_dst_dir = Path('/content/cisi-ir/notebooks')

if notebook_src.exists():
    notebook_dst_dir.mkdir(parents=True, exist_ok=True)
    shutil.move(str(notebook_src), str(notebook_dst_dir / notebook_src.name))
    print(f"Moved {notebook_src} -> {notebook_dst_dir}/")
else:
    print(f"Notebook not found at {notebook_src}; nothing moved.")


mv: cannot stat '/content/cisi_ir_demo.ipynb': No such file or directory
