**Practical 2**

**Aim : Retrieval Models**
*   Implement the Boolean retrieval model and process queries.
*   Implement the vector space model with TF-IDF weighting and cosine similarity.


A. Implement the Boolean retrieval model and process queries.

In [None]:
print("T074 Kermeen")

# Import stopwords
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = set(stopwords.words('english'))

# Documents
# Toy corpus keyed by document id (ids are the contiguous integers 1..4).
docs = dict(enumerate(
    [
        "apple banana orange",
        "apple banana",
        "banana orange",
        "apple",
    ],
    start=1,
))

# Build inverted index (stopwords removed)
def build_index(docs):
    """Build an inverted index from a mapping of document id to text.

    Each text is lowercased and split on whitespace; words found in the
    cell-level ``stop`` collection are left out of the index.

    Args:
        docs: Mapping of document id -> document text.

    Returns:
        dict mapping each remaining term to the set of ids of the
        documents that contain it.
    """
    index = {}
    for doc_id, text in docs.items():
        kept_words = (word for word in text.lower().split() if word not in stop)
        for word in kept_words:
            if word not in index:
                index[word] = set()
            index[word].add(doc_id)
    return index

# Boolean operations
def boolean_and(terms, index):
    """Return the ids of the documents that contain every term in ``terms``.

    Args:
        terms: Sequence of query terms.
        index: Inverted index mapping term -> set of document ids.

    Returns:
        List of matching document ids; empty when ``terms`` is empty or
        any term is missing from the index.
    """
    if not terms:
        return []
    # Work on a copy of the first posting set: applying ``&=`` to the set
    # stored in the index would shrink the index itself and silently change
    # the result of every later query.
    result = set(index.get(terms[0], set()))
    for t in terms[1:]:
        result &= index.get(t, set())
    return list(result)

def boolean_or(terms, index):
    """Return the ids of the documents that contain at least one of ``terms``.

    Args:
        terms: Iterable of query terms.
        index: Inverted index mapping term -> set of document ids.

    Returns:
        List of matching document ids; terms absent from the index
        contribute nothing.
    """
    postings = (index.get(term, set()) for term in terms)
    return list(set().union(*postings))

def boolean_not(term, index, total_docs):
    """Return the ids of the documents that do not contain ``term``.

    The set of all documents is taken to be the ids 1..``total_docs``.

    Args:
        term: Query term to exclude.
        index: Inverted index mapping term -> set of document ids.
        total_docs: Number of documents in the collection.

    Returns:
        List of ids in 1..``total_docs`` that are not in the term's postings.
    """
    all_ids = set(range(1, total_docs + 1))
    containing = index.get(term, set())
    return list(all_ids.difference(containing))

# Index the corpus once; every query below runs against this index
inv_index = build_index(docs)

# Run one query per Boolean operator and report the matching document ids
and_hits = boolean_and(["apple", "banana"], inv_index)
print("Documents containing 'apple' AND 'banana':", and_hits)

or_hits = boolean_or(["apple", "orange"], inv_index)
print("Documents containing 'apple' OR 'orange':", or_hits)

not_hits = boolean_not("orange", inv_index, len(docs))
print("Documents NOT containing 'orange':", not_hits)

T074 Kermeen
Documents containing 'apple' AND 'banana': [1, 2]
Documents containing 'apple' OR 'orange': [1, 2, 3]
Documents NOT containing 'orange': [2, 4]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


B. Implement the vector space model with TF-IDF weighting and cosine
similarity.

In [None]:
print("T074 Kermeen")

# Import libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import nltk
from nltk.corpus import stopwords
import numpy as np
from numpy.linalg import norm

# Download stopwords
nltk.download('stopwords')
stop = stopwords.words('english')

# Corpus: two training sentences and one unseen test sentence
train = ["The sky is blue.", "The sun is bright."]
test = ["The sun in the sky is bright."]

# Term counter (using the downloaded stopword list) and TF-IDF re-weighter
vectorizer = CountVectorizer(stop_words=stop)
transformer = TfidfTransformer()

# Learn the vocabulary from the training sentences, then count that same
# vocabulary in the test sentence
train_counts = vectorizer.fit_transform(train)
train_bow = train_counts.toarray()
test_counts = vectorizer.transform(test)
test_bow = test_counts.toarray()

print("Train BoW:", train_bow)
print("Test BoW:", test_bow)

# Cosine similarity
def cos(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``, rounded to 3 decimals.

    A zero-length vector has no direction, so the similarity is reported as
    0.0 instead of dividing by zero (which would produce ``nan``).

    Args:
        a: 1-D numeric vector.
        b: 1-D numeric vector of the same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``, or 0.0 when either
        vector has zero norm.
    """
    denom = norm(a) * norm(b)
    if denom == 0:
        return 0.0
    return round(np.inner(a, b) / denom, 3)

# Show the count vectors, then the similarity of every training document
# to each test document
for vec in train_bow:
    print(vec)
for tvec in test_bow:
    print(tvec)
    # Compare against each training vector explicitly; relying on the
    # variable left over from the loop above would score only the last one.
    for vec in train_bow:
        print(cos(vec, tvec))

# TF-IDF values
print("\nTrain TF-IDF:")
# Fitting learns the IDF weights from the training counts only
print(transformer.fit_transform(train_bow).toarray())

print("\nTest TF-IDF:")
# Re-use the IDF weights learned above: refitting on the test counts would
# derive the weights from the test data itself instead of the training corpus
print(transformer.transform(test_bow).toarray())

T074 Kermeen
Train BoW: [[1 0 1 0]
 [0 1 0 1]]
Test BoW: [[0 1 1 1]]
[1 0 1 0]
[0 1 0 1]
[0 1 1 1]
0.816

Train TF-IDF:
[[0.70710678 0.         0.70710678 0.        ]
 [0.         0.70710678 0.         0.70710678]]

Test TF-IDF:
[[0.         0.57735027 0.57735027 0.57735027]]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Question 1**

Implement the Boolean retrieval model for the following corpus.

Document 1: 'this is the first document.'

Document 2: 'this document is the second document.'

Document 3: 'And this is the third one.'

Document 4: 'Is this the first document?'

Process the query “first and third”.

In [None]:
print("T074 Kermeen")

# Corpus keyed by document id (text is already lowercase, without punctuation)
docs = {
    1: "this is the first document",
    2: "this document is the second document",
    3: "and this is the third one",
    4: "is this the first document",
}

# Inverted index: term -> set of ids of the documents containing that term
index = {}
for doc_id, text in docs.items():
    for term in text.split():
        if term not in index:
            index[term] = set()
        index[term].add(doc_id)

# Boolean AND
def boolean_and(terms):
    """Return the ids of the documents that contain every term in ``terms``.

    Postings are looked up in the cell-level ``index``; a term that is not
    indexed contributes an empty set, and an empty query matches nothing.
    """
    postings = [index.get(term, set()) for term in terms]
    if not postings:
        return []
    return list(set.intersection(*postings))

# Run the query "first AND third" and report the matching document ids
query_terms = ["first", "third"]
result = boolean_and(query_terms)

print("Documents matching 'first AND third':", result)


T074 Kermeen
Documents matching 'first AND third': []


**Question 2**

Implement the Boolean retrieval model for the following corpus.

Document 1: The cat chased the dog around the garden.

Document 2: She was sitting in the garden last night.

Document 3: I read the book the night before.

Process the query “garden or night”.

In [None]:
print("T074 Kermeen")

# Corpus keyed by document id
docs = {
    1: "The cat chased the dog around the garden",
    2: "She was sitting in the garden last night",
    3: "I read the book the night before",
}

# Inverted index over lowercased, whitespace-separated words:
# word -> set of ids of the documents containing it
index = {}
for doc_id, text in docs.items():
    for word in text.lower().split():
        if word not in index:
            index[word] = set()
        index[word].add(doc_id)

# Boolean OR
def boolean_or(terms):
    """Return the ids of the documents that contain at least one of ``terms``.

    Postings are looked up in the cell-level ``index``; terms that are not
    indexed contribute nothing.
    """
    postings = [index.get(term, set()) for term in terms]
    matches = set()
    for posting in postings:
        matches |= posting
    return list(matches)

# Run the query "garden OR night" and report the matching document ids
query_terms = ["garden", "night"]
result = boolean_or(query_terms)

print("Documents matching 'garden OR night':", result)


T074 Kermeen
Documents matching 'garden OR night': [1, 2, 3]


**Question 3**

Implement the Boolean retrieval model for the following corpus

Document 1: BSc lectures start at 7.

Document 2: My lectures are over.

Document 3: Today is a holiday.

Process the query “not lectures”

In [None]:
print("T074 Kermeen")

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop = set(stopwords.words('english'))

# Corpus keyed by document id
docs = {
    1: "BSc lectures start at 7",
    2: "My lectures are over",
    3: "Today is a holiday",
}

# Inverted index over lowercased words, skipping anything in `stop`:
# word -> set of ids of the documents containing it
index = {}
for doc_id, text in docs.items():
    for word in text.lower().split():
        if word in stop:
            continue
        if word not in index:
            index[word] = set()
        index[word].add(doc_id)

# Boolean NOT
def boolean_not(term):
    """Return the ids of the documents that do not contain ``term``.

    The set of all documents is the key set of the cell-level ``docs``;
    postings are looked up in the cell-level ``index``.
    """
    all_ids = set(docs)
    containing = index.get(term, set())
    return list(all_ids.difference(containing))

# Run the query "NOT lectures" and report the matching document ids
excluded_term = "lectures"
result = boolean_not(excluded_term)

print("Documents matching 'NOT lectures':", result)

T074 Kermeen
Documents matching 'NOT lectures': [3]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Question 4**

Implement the vector space model with TF-IDF weighting for the
following corpus

Document 1: "Document about python programming language and data
analysis."

Document 2: "Document discussing machine learning algorithms and
programming techniques."

Document 3: "Overview of natural language processing and its
applications."

query = "python programming"

In [None]:
print("T074 Kermeen")

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
from numpy.linalg import norm

# Corpus and the single query to rank it against
docs = [
    "Document about python programming language and data analysis.",
    "Document discussing machine learning algorithms and programming techniques.",
    "Overview of natural language processing and its applications.",
]

query = ["python programming"]

# Term counter with the built-in English stopword list, and TF-IDF re-weighter
vectorizer = CountVectorizer(stop_words="english")
transformer = TfidfTransformer()

# Documents: learn the vocabulary and the IDF weights, then weight the counts
doc_counts = vectorizer.fit_transform(docs)
doc_tfidf = transformer.fit_transform(doc_counts).toarray()

# Query: re-use the vocabulary and IDF weights learned from the documents
query_counts = vectorizer.transform(query)
query_tfidf = transformer.transform(query_counts).toarray()

# Cosine similarity
def cos(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``, rounded to 3 decimals.

    A zero-length vector (for example a query with no term in the
    vocabulary) has no direction, so the similarity is reported as 0.0
    instead of dividing by zero (which would produce ``nan``).

    Args:
        a: 1-D numeric vector.
        b: 1-D numeric vector of the same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``, or 0.0 when either
        vector has zero norm.
    """
    denom = norm(a) * norm(b)
    if denom == 0:
        return 0.0
    return round(np.inner(a, b) / denom, 3)

# Report the weighted vectors, then each document's similarity to the query
print("TF-IDF for documents:")
print(doc_tfidf)

print("\nTF-IDF for query:")
print(query_tfidf)

print("\nCosine Similarity Scores:")
query_vec = query_tfidf[0]  # the only query row
for doc_no, doc_vec in enumerate(doc_tfidf, start=1):
    score = cos(doc_vec, query_vec)
    print(f"Document {doc_no}: {score}")

T074 Kermeen
TF-IDF for documents:
[[0.         0.45954803 0.         0.45954803 0.         0.34949812
  0.34949812 0.         0.         0.         0.         0.
  0.34949812 0.45954803 0.        ]
 [0.40301621 0.         0.         0.         0.40301621 0.30650422
  0.         0.40301621 0.40301621 0.         0.         0.
  0.30650422 0.         0.40301621]
 [0.         0.         0.46735098 0.         0.         0.
  0.35543247 0.         0.         0.46735098 0.46735098 0.46735098
  0.         0.         0.        ]]

TF-IDF for query:
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.60534851 0.79596054 0.        ]]

Cosine Similarity Scores:
Document 1: 0.577
Document 2: 0.186
Document 3: 0.0


**Question 5**

Implement the Boolean retrieval model for the following corpus

Document 1: The university exam is scheduled next week.

Document 2: The university of Mumbai has declared the result.

Process the query “university and Mumbai”.

In [None]:
print("T074 Kermeen")

# Small hand-written stopword set (kept short for simplicity).
# NOTE: this rebinds the name `stopwords`, which other cells of this notebook
# import from nltk.corpus; those cells re-import it before using it.
stopwords = set("the is has a of in at on".split())

# Corpus keyed by document id
docs = dict(enumerate(
    [
        "The university exam is scheduled next week",
        "The university of mumbai has declared the result",
    ],
    start=1,
))

# Build inverted index (remove stopwords)
def build_index(docs):
    """Build an inverted index from a mapping of document id to text.

    Each text is lowercased and split on whitespace; words found in the
    cell-level ``stopwords`` set are left out of the index.

    Args:
        docs: Mapping of document id -> document text.

    Returns:
        dict mapping each remaining term to the set of ids of the
        documents that contain it.
    """
    index = {}
    for doc_id, text in docs.items():
        for word in text.lower().split():
            if word in stopwords:
                continue
            if word not in index:
                index[word] = set()
            index[word].add(doc_id)
    return index

# Boolean AND operation
def boolean_and(terms, index):
    """Return the ids of the documents that contain every term in ``terms``.

    Args:
        terms: Sequence of query terms.
        index: Inverted index mapping term -> set of document ids.

    Returns:
        List of matching document ids; empty when ``terms`` is empty or
        any term is missing from the index.
    """
    if not terms:
        return []
    # Work on a copy of the first posting set: applying ``&=`` to the set
    # stored in the index would shrink the index itself and silently change
    # the result of every later query.
    result = set(index.get(terms[0], set()))
    for term in terms[1:]:
        result &= index.get(term, set())
    return list(result)

# Index the corpus
inverted_index = build_index(docs)

# Query terms, with any stopwords filtered out the same way as in the index
query = []
for term in ("university", "mumbai"):
    if term not in stopwords:
        query.append(term)

# Run the AND query and report the matching document ids
result = boolean_and(query, inverted_index)

print("Documents matching 'university AND mumbai':", result)

T074 Kermeen
Documents matching 'university AND mumbai': [2]
