In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads later in this notebook
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# First collection: ten short sentences about animals.
# Each sentence is treated as one retrievable document (one row of the term-document matrix).
document_1 = [
    "Cats love to chase mice",
    "Dogs bark loud at night",
    "Mice run from cats fast",
    "Birds fly in the sky",
    "Cats sleep during the day",
    "Dogs chase cats often outside",
    "Cats and dogs are pets",
    "Mice eat cheese at night",
    "Dogs bark at strangers",
    "Cats purr when they sleep"
]

# Second collection: ten short sentences about transport, handled the same way as document_1.
document_2 = [
    "Cars move fast on roads",
    "Buses stop at stations",
    "Roads connect cities and towns",
    "Trains run on electricity",
    "Bikes are faster in traffic",
    "Traffic jams cause delays",
    "Buses carry many people",
    "Trains reach faster than buses",
    "Roads are repaired after rain",
    "Cars park outside buildings"
]

In [4]:
# English stop words held in a set; preprocess() tests each token for membership in it
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance shared by preprocess() and the query-term lookup
stemmer = PorterStemmer()

In [6]:
def preprocess(text):
    """Turn a raw sentence into a list of stemmed, stop-word-free tokens.

    Steps, in order: lower-case the text, delete every character listed in
    `string.punctuation`, split on whitespace, discard tokens found in the
    module-level `stop_words`, and stem the rest with the module-level `stemmer`.

    Args:
        text: The sentence to process.

    Returns:
        A list of stemmed tokens in their original order (empty if nothing survives).
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokens = text.lower().translate(strip_punctuation).split()

    stems = []
    for token in tokens:
        # Stop words are filtered on the unstemmed, lower-cased form
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return stems

In [20]:
# Run every sentence of both collections through preprocess(), keeping sentence order
processed_doc1 = list(map(preprocess, document_1))
processed_doc2 = list(map(preprocess, document_2))

In [18]:
# List the token lists of document_1, numbering the sentences from 1
for number, tokens in enumerate(processed_doc1, start=1):
    print(f"Doc {number}:", tokens)

Doc 1: ['cat', 'love', 'chase', 'mice']
Doc 2: ['dog', 'bark', 'loud', 'night']
Doc 3: ['mice', 'run', 'cat', 'fast']
Doc 4: ['bird', 'fli', 'sky']
Doc 5: ['cat', 'sleep', 'day']
Doc 6: ['dog', 'chase', 'cat', 'often', 'outsid']
Doc 7: ['cat', 'dog', 'pet']
Doc 8: ['mice', 'eat', 'chees', 'night']
Doc 9: ['dog', 'bark', 'stranger']
Doc 10: ['cat', 'purr', 'sleep']


In [21]:
# List the token lists of document_2, numbering the sentences from 1
for number, tokens in enumerate(processed_doc2, start=1):
    print(f"Doc {number}:", tokens)

Doc 1: ['car', 'move', 'fast', 'road']
Doc 2: ['buse', 'stop', 'station']
Doc 3: ['road', 'connect', 'citi', 'town']
Doc 4: ['train', 'run', 'electr']
Doc 5: ['bike', 'faster', 'traffic']
Doc 6: ['traffic', 'jam', 'caus', 'delay']
Doc 7: ['buse', 'carri', 'mani', 'peopl']
Doc 8: ['train', 'reach', 'faster', 'buse']
Doc 9: ['road', 'repair', 'rain']
Doc 10: ['car', 'park', 'outsid', 'build']


Term-document matrix for document 1

In [25]:
# Vocabulary: every distinct stemmed term of document 1, in alphabetical order
vocab = sorted({term for tokens in processed_doc1 for term in tokens})

# Binary incidence table: one row per sentence, one column per vocabulary term;
# a cell holds 1 when the term occurs in that sentence and 0 otherwise
td_matrix_doc1 = pd.DataFrame(
    [[int(term in tokens) for term in vocab] for tokens in processed_doc1],
    index=range(len(document_1)),
    columns=vocab,
)

# Show the resulting matrix as the cell output
td_matrix_doc1

Unnamed: 0,bark,bird,cat,chase,chees,day,dog,eat,fast,fli,...,mice,night,often,outsid,pet,purr,run,sky,sleep,stranger
0,0,0,1,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,0,0,1,1,0,0,1,0,0,0,...,0,0,1,1,0,0,0,0,0,0
6,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,0,0,0,0,1,0,0,1,0,0,...,1,1,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


Term-document matrix for document 2

In [26]:
# Vocabulary: every distinct stemmed term of document 2, in alphabetical order
vocab = sorted({term for tokens in processed_doc2 for term in tokens})

# Binary incidence table: one row per sentence, one column per vocabulary term;
# a cell holds 1 when the term occurs in that sentence and 0 otherwise
td_matrix_doc2 = pd.DataFrame(
    [[int(term in tokens) for term in vocab] for tokens in processed_doc2],
    index=range(len(document_2)),
    columns=vocab,
)

# Show the resulting matrix as the cell output
td_matrix_doc2

Unnamed: 0,bike,build,buse,car,carri,caus,citi,connect,delay,electr,...,rain,reach,repair,road,run,station,stop,town,traffic,train
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
2,0,0,0,0,0,0,1,1,0,0,...,0,0,0,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
6,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
8,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
9,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Enter a Boolean query and search document 1

In [29]:
# Helper to get vector for a term
def get_term_vector(term):
    """Return the boolean incidence vector of `term` over the rows of `td_matrix_doc1`.

    The term is lower-cased and stemmed with the module-level `stemmer` before
    being looked up among the matrix columns.

    Args:
        term: Query word.

    Returns:
        A boolean pd.Series sharing the index of `td_matrix_doc1`: True where the
        stemmed term's column is non-zero. All False when the stem is not a column.
    """
    term = stemmer.stem(term.lower())
    if term in td_matrix_doc1.columns:
        # Cast to bool: on the raw 0/1 integers `~` is bitwise (~0 == -1, ~1 == -2),
        # so a NOT in the query would never yield the value 1 that marks a match.
        return td_matrix_doc1[term].astype(bool)
    # Unknown term: matches nothing; reuse the matrix index so operands always align
    return pd.Series(False, index=td_matrix_doc1.index, dtype=bool)

def parse_query(query):
    """Translate a Boolean query into a Python expression string.

    The query is split on whitespace, with parentheses treated as separate
    tokens. The upper-case words AND / OR / NOT become `&` / `|` / `~`; the
    symbols `&`, `|`, `~`, `(` and `)` pass through unchanged; every other
    token is lower-cased and wrapped in a `get_term_vector(...)` call.

    Args:
        query: Query text such as "(cats AND chase) OR (dogs)".

    Returns:
        The translated expression with tokens joined by single spaces
        (an empty string for an empty query).
    """
    operators = {"AND": "&", "OR": "|", "NOT": "~"}
    passthrough = {"&", "|", "~", "(", ")"}

    tokens = query.replace("(", " ( ").replace(")", " ) ").split()
    parsed = []
    for token in tokens:
        if token in operators:
            # Operators are recognised per token, so upper-case terms that merely
            # contain "AND"/"OR"/"NOT" (e.g. ORANGE, BRAND) are left intact.
            parsed.append(operators[token])
        elif token in passthrough:
            parsed.append(token)
        else:
            # !r emits a properly escaped string literal, so a quote inside the
            # term cannot terminate the literal and inject code into the expression.
            parsed.append(f"get_term_vector({token.lower()!r})")
    return " ".join(parsed)

# User Input and Evaluation
user_query = input("Enter Boolean query (e.g., (cats AND chase) OR (dogs)):\n")
parsed_query = parse_query(user_query)

# Evaluate Query
# SECURITY: parsed_query is built from raw user input and executed with eval().
# The namespace is limited to get_term_vector with builtins removed, which narrows
# what a crafted query can reach; eval is still not a sandbox, so do not expose
# this cell to untrusted users.
try:
    result = eval(parsed_query, {"__builtins__": {}}, {"get_term_vector": get_term_vector})
    # Rows whose combined value equals 1 (or True) are the matching sentences
    matching_docs = result[result == 1].index.tolist()

    print("\nMatching Documents:\n")
    for idx in matching_docs:
        print(f"Doc {idx+1}: {document_1[idx]}")
except Exception as e:
    # Malformed queries (unbalanced parentheses, dangling operators, ...) are reported, not raised
    print("Error in query:", e)

Enter Boolean query (e.g., (cats AND chase) OR (dogs)):
(cats AND chase) OR (dogs)

Matching Documents:

Doc 1: Cats love to chase mice
Doc 2: Dogs bark loud at night
Doc 6: Dogs chase cats often outside
Doc 7: Cats and dogs are pets
Doc 9: Dogs bark at strangers


## Enter a Boolean query and search document 2

In [30]:
# Helper to get vector for a term
def get_term_vector(term):
    """Return the boolean incidence vector of `term` over the rows of `td_matrix_doc2`.

    The term is lower-cased and stemmed with the module-level `stemmer` before
    being looked up among the matrix columns.

    Args:
        term: Query word.

    Returns:
        A boolean pd.Series sharing the index of `td_matrix_doc2`: True where the
        stemmed term's column is non-zero. All False when the stem is not a column.
    """
    term = stemmer.stem(term.lower())
    if term in td_matrix_doc2.columns:
        # Cast to bool: on the raw 0/1 integers `~` is bitwise (~0 == -1, ~1 == -2),
        # so a NOT in the query would never yield the value 1 that marks a match.
        return td_matrix_doc2[term].astype(bool)
    # Unknown term: matches nothing; reuse the matrix index so operands always align
    return pd.Series(False, index=td_matrix_doc2.index, dtype=bool)

def parse_query(query):
    """Translate a Boolean query into a Python expression string.

    The query is split on whitespace, with parentheses treated as separate
    tokens. The upper-case words AND / OR / NOT become `&` / `|` / `~`; the
    symbols `&`, `|`, `~`, `(` and `)` pass through unchanged; every other
    token is lower-cased and wrapped in a `get_term_vector(...)` call.

    Args:
        query: Query text such as "(cars AND roads) OR (buses)".

    Returns:
        The translated expression with tokens joined by single spaces
        (an empty string for an empty query).
    """
    operators = {"AND": "&", "OR": "|", "NOT": "~"}
    passthrough = {"&", "|", "~", "(", ")"}

    tokens = query.replace("(", " ( ").replace(")", " ) ").split()
    parsed = []
    for token in tokens:
        if token in operators:
            # Operators are recognised per token, so upper-case terms that merely
            # contain "AND"/"OR"/"NOT" (e.g. ORANGE, BRAND) are left intact.
            parsed.append(operators[token])
        elif token in passthrough:
            parsed.append(token)
        else:
            # !r emits a properly escaped string literal, so a quote inside the
            # term cannot terminate the literal and inject code into the expression.
            parsed.append(f"get_term_vector({token.lower()!r})")
    return " ".join(parsed)

# User Input and Evaluation
user_query = input("Enter Boolean query (e.g., (cars AND roads) OR (buses)):\n")
parsed_query = parse_query(user_query)

# Evaluate Query
# SECURITY: parsed_query is built from raw user input and executed with eval().
# The namespace is limited to get_term_vector with builtins removed, which narrows
# what a crafted query can reach; eval is still not a sandbox, so do not expose
# this cell to untrusted users.
try:
    result = eval(parsed_query, {"__builtins__": {}}, {"get_term_vector": get_term_vector})
    # Rows whose combined value equals 1 (or True) are the matching sentences
    matching_docs = result[result == 1].index.tolist()

    print("\nMatching Documents:\n")
    for idx in matching_docs:
        print(f"Doc {idx+1}: {document_2[idx]}")
except Exception as e:
    # Malformed queries (unbalanced parentheses, dangling operators, ...) are reported, not raised
    print("Error in query:", e)

Enter Boolean query (e.g., (cars AND roads) OR (buses)):
(cars AND roads) OR (buses)

Matching Documents:

Doc 1: Cars move fast on roads
Doc 2: Buses stop at stations
Doc 7: Buses carry many people
Doc 8: Trains reach faster than buses


## Postlab

# Advantages and Disadvantages of the Boolean Model

## ✅ Advantages of the Boolean Model

- **Simple and Intuitive**: The model is easy to understand and use, especially for users familiar with Boolean logic.
- **Precise Control**: Users can precisely define the criteria for relevant documents using `AND`, `OR`, and `NOT`.
- **Fast Retrieval**: Retrieval is fast because the model uses binary matching (presence or absence of terms).
- **Deterministic Output**: The model gives a clear yes/no answer — a document either matches the query or it doesn’t.
- **No Need for Training Data**: It does not require any learning or training phase, making it quick to implement.

## ❌ Disadvantages of the Boolean Model

- **Rigid Matching**: Documents are retrieved only if they **exactly match** the Boolean condition. This can miss relevant documents that are phrased differently.
- **No Ranking of Results**: All matching documents are treated equally; there's no concept of "more relevant" or "less relevant."
- **Complex Queries Are Hard to Build**: Users need to understand and correctly formulate Boolean expressions, which can become complicated.
- **Insensitive to Term Frequency**: It doesn’t consider how many times a term appears, only whether it appears at all.

