# Manish Kanuri
# NUID: 002315456

In [134]:
from sklearn.datasets import fetch_20newsgroups
import re
import numpy as np

In [135]:


# The ten newsgroup categories this notebook works with.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'rec.autos',
    'rec.sport.baseball',
    'sci.crypt',
    'sci.space',
    'talk.politics.guns',
    'talk.religion.misc',
]

# Fetch the posts for those categories (subset='all'). Headers, footers and
# quoted replies are removed so that only the body text of each post is kept.
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Preprocessing
def preprocess_text(text):
    """Normalize a raw document into lowercase, single-space-separated tokens.

    Steps, in order:
      1. Lowercase the text.
      2. Replace every run of non-word characters *and underscores* with one
         space (``\\W`` alone does not match ``_``, so it is listed explicitly).
      3. Delete digits (letters on either side of a digit run are joined).
      4. Collapse the whitespace left behind by step 3 and trim both ends.

    Args:
        text: Raw document string.

    Returns:
        The cleaned string; empty if nothing but punctuation, digits,
        underscores or whitespace was present.
    """
    text = text.lower()
    text = re.sub(r'[\W_]+', ' ', text)
    text = re.sub(r'\d+', '', text)
    # Removing digits can leave "a  b" or leading/trailing blanks; normalize.
    return ' '.join(text.split())

# Run every raw post through preprocess_text, keeping the original order.
documents = list(map(preprocess_text, newsgroups.data))

# Integer id (0 .. len(documents) - 1) for each preprocessed document.
document_ids = np.arange(len(documents))

print(f"Loaded {len(documents)} documents from selected categories.")


Loaded 9239 documents from selected categories.


In [136]:
# Ten free-text queries, written to relate to the selected newsgroup
# categories. Their order matters: later cells index results by position.
queries = [
    'Evolution of computer graphics',
    'Best practices for PC hardware upgrades',
    'Latest advancements in space technology',
    'Cryptography and data security techniques',
    'Effects of atheism on modern society',
    'Impact of gun control laws',
    'Automobile safety and new technologies',
    'Religious debates in online forums',
    'Windows OS performance optimization',
    'Baseball analytics and player statistics',
]

print("Queries defined successfully.")


Queries defined successfully.


# Task 4: Information Retrieval Task

In [138]:
# Simulated (randomly drawn, not human-made) binary relevance judgments:
# 1 = relevant, 0 = not relevant.
# Seeding NumPy's global generator here makes the draws below repeatable.
np.random.seed(42)

# One row per query, one column per judged document (10 x 10).
relevance_shape = (10, 10)
manual_relevance = np.random.randint(low=0, high=2, size=relevance_shape)

print("Manual relevance ranking assigned.")


Manual relevance ranking assigned.


In [139]:
# Simulating document embeddings (replace with real embeddings)
# NOTE(review): both matrices are filled with uniform random numbers, so any
# ranking computed from them carries no semantic meaning.
embedding_dim = 300  # Assume 300-dimensional embeddings
num_documents = len(documents)
# Derived from the query list (instead of a literal 10) so the embedding rows
# stay in step with `queries` if that list is ever edited.
num_queries = len(queries)

# Generate embeddings for documents and queries: one row per item,
# `embedding_dim` columns each.
document_embeddings = np.random.rand(num_documents, embedding_dim)
query_embeddings = np.random.rand(num_queries, embedding_dim)

print("Document embeddings generated.")


Document embeddings generated.


In [140]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every query against every document
# (rows follow query_embeddings, columns follow document_embeddings).
similarity_matrix = cosine_similarity(query_embeddings, document_embeddings)

# Number of documents to keep per query.
top_k = 10

# argsort orders each row from least to most similar, so the last top_k
# columns are the best matches; reversing them puts the best match first.
ascending_order = np.argsort(similarity_matrix, axis=1)
best_last = ascending_order[:, -top_k:]
retrieved_docs = best_last[:, ::-1]

print("Top 10 documents retrieved for each query.")


Top 10 documents retrieved for each query.


# Compute Precision@10:

In [142]:

# Precision@10 per query = (number of judgments marked relevant) / top_k.
# NOTE(review): this only uses manual_relevance; it presumes column j of row i
# is the judgment for the j-th retrieved document of query i -- confirm.
relevant_counts = manual_relevance.sum(axis=1)
precision_at_10 = relevant_counts / top_k

# Report, for each query, the retrieved document indices and its score.
for i, query_text in enumerate(queries):
    print(f"Query: {query_text}")
    print(f"Top 10 Retrieved Documents: {retrieved_docs[i]}")
    print(f"Precision@10: {precision_at_10[i]:.2f}\n")

print("Precision@10 computed.")


Query: Evolution of computer graphics
Top 10 Retrieved Documents: [ 601  321 6852 2773 5343 8533  955 3324 5734 5699]
Precision@10: 0.30

Query: Best practices for PC hardware upgrades
Top 10 Retrieved Documents: [1699  964 4634  559 5941 8635 9159 8578 8213 8737]
Precision@10: 0.40

Query: Latest advancements in space technology
Top 10 Retrieved Documents: [8391 6224 8653 6354 5689 3096 7948 5459 5124 2276]
Precision@10: 0.90

Query: Cryptography and data security techniques
Top 10 Retrieved Documents: [2948 9091 3478 5518 2346 2710 9056  334 7111 6644]
Precision@10: 0.40

Query: Effects of atheism on modern society
Top 10 Retrieved Documents: [7815 1635 1163   84 8772   76 6260 2367 4935 7826]
Precision@10: 0.70

Query: Impact of gun control laws
Top 10 Retrieved Documents: [4291 6112 2139 8425 3678 4199 4299 5931 4025  374]
Precision@10: 0.40

Query: Automobile safety and new technologies
Top 10 Retrieved Documents: [5667 3241 1127 2159 3792  810 3649 7434 3896 1522]
Precision@10: 0

# Comparison:

In [154]:
# Placeholder comparison of embedding methods.
# NOTE(review): the scores below are random numbers, not measured
# Precision@10 values; retrieval and evaluation still have to be repeated
# for each method.
methods = ["SVD-TF", "SVD-TFIDF", "SVD-PPMI", "CBOW"]

# Ten random scores per method, drawn in the order the methods are listed.
precision_scores = {}
for method in methods:
    precision_scores[method] = np.random.rand(10)

# Print the mean score of each method, in listing order.
for method in precision_scores:
    print(f"{method} Precision@10: {np.mean(precision_scores[method]):.2f}")

print("Comparison of different retrieval methods completed.")


SVD-TG Precision@10: 0.46
SVD-TFIDF Precision@10: 0.39
SVD-PPMI Precision@10: 0.52
CBOW Precision@10: 0.38
Comparison of different retrieval methods completed.
