In [1]:
import re
from collections import defaultdict

# -------------------------------
# Step 1: Sample documents
# -------------------------------
# Toy corpus: integer document ID -> raw text of that document.
documents = {
    1: "Artificial intelligence and machine learning are closely related fields.",
    2: "Deep learning is a subset of machine learning.",
    3: "AI and data science often overlap with machine learning concepts."
}

# Echo the corpus, one "Doc <id>: <text>" line per entry, in insertion order.
print("Original Documents:")
print("\n".join(f"Doc {doc_id}: {content}" for doc_id, content in documents.items()))


Original Documents:
Doc 1: Artificial intelligence and machine learning are closely related fields.
Doc 2: Deep learning is a subset of machine learning.
Doc 3: AI and data science often overlap with machine learning concepts.


In [4]:
# -------------------------------
# Step 2: Preprocess text (simple tokenization + cleaning)
# -------------------------------

def preprocess(text):
    """Normalize ``text`` and split it into a list of tokens.

    The text is lowercased, every character that is neither ``a``-``z`` nor
    whitespace is deleted (not replaced), and the remainder is split on runs
    of whitespace. Because characters are deleted, hyphenated or apostrophised
    words collapse into a single token (``"state-of-the-art"`` becomes
    ``"stateoftheart"``).

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lowercase tokens; empty when nothing survives cleaning.
    """
    return re.sub(r'[^a-z\s]', '', text.lower()).split()


In [5]:
# -------------------------------
# Step 3: Build inverted index
# -------------------------------

# Maps each token to the set of IDs of the documents that contain it.
inverted_index = defaultdict(set)

for doc_id, content in documents.items():
    # Every token of the document records this document's ID in its posting set.
    for word in preprocess(content):
        inverted_index[word].add(doc_id)

# Dump the index in token insertion order, with document IDs ascending.
print("Inverted Index:")
for word, doc_ids in inverted_index.items():
    print(f"{word}: {sorted(doc_ids)}")


Inverted Index:
artificial: [1]
intelligence: [1]
and: [1, 3]
machine: [1, 2, 3]
learning: [1, 2, 3]
are: [1]
closely: [1]
related: [1]
fields: [1]
deep: [2]
is: [2]
a: [2]
subset: [2]
of: [2]
ai: [3]
data: [3]
science: [3]
often: [3]
overlap: [3]
with: [3]
concepts: [3]


In [6]:
def search(query):
    """Return the IDs of the documents that contain every term of ``query``.

    The query is tokenized with ``preprocess`` and each token is looked up in
    the module-level ``inverted_index``. The result is the intersection of the
    matching posting sets (boolean AND).

    A term that is absent from the index cannot occur in any document, so the
    whole conjunction is unsatisfiable and the result is empty. Such terms
    must not be skipped: doing so would make ``"machine banana"`` match every
    document that merely contains ``"machine"``.

    Args:
        query: Free-text query string.

    Returns:
        A new ``set`` of document IDs; empty when the query has no tokens or
        when any token is unknown to the index.
    """
    query_words = preprocess(query)
    if not query_words:
        return set()

    result_sets = []
    for word in query_words:
        # .get() instead of [] so that a lookup never inserts a new key into
        # the index as a side effect.
        postings = inverted_index.get(word)
        if not postings:
            # Unknown term -> no document can satisfy the AND query.
            return set()
        result_sets.append(postings)

    # set.intersection always builds a fresh set, so callers may mutate the
    # result without corrupting the index.
    return set.intersection(*result_sets)


In [8]:
# Demo queries; each is run as an AND search over the index.
queries = [
    "machine learning",
    "data science",
    "deep learning",
    "artificial intelligence",
]

for q in queries:
    result = search(q)
    # Show the matching IDs in ascending order, or the literal text None when nothing matched.
    found = sorted(result) if result else 'None'
    print(f"\nQuery: '{q}' → Documents found: {found}")



Query: 'machine learning' → Documents found: [1, 2, 3]

Query: 'data science' → Documents found: [3]

Query: 'deep learning' → Documents found: [2]

Query: 'artificial intelligence' → Documents found: [1]
