In [1]:
# Import necessary libraries
from collections import defaultdict

# Define the documents
documents = [
    "I love cats . cats are cute pets.",
    "Dogs are loyal. Dogs are good friends.",
    "Birds can sing. Birds fly in the sky.",
    "Fish live underwater. Fish come in many colors."
]

In this section, we define and create unigram models for the documents. Unigrams are single words or terms, and a unigram model represents the probability distribution of individual terms in the document. The unigram_model function counts the occurrences of each term in a document, calculates the probabilities, and returns the unigram model. We create unigram models for all documents in the collection.

# Create Unigram Models

In [2]:
def unigram_model(document):
    words = document.split()
    total_words = len(words)
    unigram_counts = defaultdict(int)
    for word in words:
        unigram_counts[word] += 1
    unigram_model = {word: count / total_words for word, count in unigram_counts.items()}
    return unigram_model

In [3]:
# Create unigram models for all documents
unigram_models = [unigram_model(doc) for doc in documents]

In [4]:
unigram_models

[{'I': 0.125,
  'love': 0.125,
  'cats': 0.25,
  '.': 0.125,
  'are': 0.125,
  'cute': 0.125,
  'pets.': 0.125},
 {'Dogs': 0.2857142857142857,
  'are': 0.2857142857142857,
  'loyal.': 0.14285714285714285,
  'good': 0.14285714285714285,
  'friends.': 0.14285714285714285},
 {'Birds': 0.25,
  'can': 0.125,
  'sing.': 0.125,
  'fly': 0.125,
  'in': 0.125,
  'the': 0.125,
  'sky.': 0.125},
 {'Fish': 0.25,
  'live': 0.125,
  'underwater.': 0.125,
  'come': 0.125,
  'in': 0.125,
  'many': 0.125,
  'colors.': 0.125}]

In [5]:
#we have a query 
query = "I like cats and dogs"

In [8]:
def calculate_query_probability(query, document_model):
    # Tokenize the query into words
    query_words = query.split()
    
    # Initialize the probability for the entire query
    query_probability = 1.0
    
    # Calculate the probability for each term in the query
    for word in query_words:
        if word in document_model:
            query_probability *= document_model[word]
        else:
            query_probability = 0.0
            break
    
    return query_probability

In [9]:
query_probability = calculate_query_probability(query, unigram_models)

In [10]:
query_probability

0.0

## Your task

In [13]:
# Use Laplace Smoothing for this problem

# https://www.exploredatabase.com/2020/10/explain-add-1-laplace-smoothing-with-example.html

from collections import defaultdict

# Define the documents
documents = [
    "I love cats . cats are cute pets.",
    "Dogs are loyal. Dogs are good friends.",
    "Birds can sing. Birds fly in the sky.",
    "Fish live underwater. Fish come in many colors."
]

def laplace_smoothed_unigram_model(document):
    words = document.split()
    total_words = len(words)
    unigram_counts = defaultdict(int)
    
    # Initialize all words with a count of 1 (Laplace smoothing)
    for word in words:
        unigram_counts[word] += 1
    
    # Calculate Laplace-smoothed probabilities
    unigram_model = {word: (count + 1) / (total_words + len(unigram_counts)) for word, count in unigram_counts.items()}
    return unigram_model

# Create Laplace-smoothed unigram models for all documents
laplace_smoothed_unigram_models = [laplace_smoothed_unigram_model(doc) for doc in documents]

# Print Laplace-smoothed unigram models
for i, model in enumerate(laplace_smoothed_unigram_models):
    print(f"Document {i + 1} Laplace-Smoothed Unigram Model:")
    for word, probability in model.items():
        print(f"{word}: {probability:.4f}")
    print()

query = "I like cats and dogs"

def calculate_query_probability(query, document_model):
    query_words = query.split()
    query_probability = 1.0
    
    for word in query_words:
        # Apply Laplace smoothing for unseen words
        word_probability = document_model.get(word, 1 / (len(document_model) + len(vocabulary)))
        query_probability *= word_probability

    return query_probability

# Calculate query probability for each document
query_probabilities = []

for document_model in laplace_smoothed_unigram_models:
    probability = calculate_query_probability(query, document_model)
    query_probabilities.append(probability)

# Find the document with the highest probability for the query
most_probable_document_index = query_probabilities.index(max(query_probabilities))
most_probable_document = documents[most_probable_document_index]

print("Query Probabilities for Each Document:")
for i, probability in enumerate(query_probabilities):
    print(f"Document {i + 1}: {probability:.6f}")

print("The most probable document for the query is:", most_probable_document)

Document 1 Laplace-Smoothed Unigram Model:
I: 0.1333
love: 0.1333
cats: 0.2000
.: 0.1333
are: 0.1333
cute: 0.1333
pets.: 0.1333

Document 2 Laplace-Smoothed Unigram Model:
Dogs: 0.2500
are: 0.2500
loyal.: 0.1667
good: 0.1667
friends.: 0.1667

Document 3 Laplace-Smoothed Unigram Model:
Birds: 0.2000
can: 0.1333
sing.: 0.1333
fly: 0.1333
in: 0.1333
the: 0.1333
sky.: 0.1333

Document 4 Laplace-Smoothed Unigram Model:
Fish: 0.2000
live: 0.1333
underwater.: 0.1333
come: 0.1333
in: 0.1333
many: 0.1333
colors.: 0.1333

Query Probabilities for Each Document:
Document 1: 0.000001
Document 2: 0.000000
Document 3: 0.000000
Document 4: 0.000000
The most probable document for the query is: I love cats . cats are cute pets.


In [14]:
# create a bigram model & apply smoothing method

from collections import defaultdict

# Define the documents
documents = [
    "I love cats. Cats are cute pets.",
    "Dogs are loyal. Dogs are good friends.",
    "Birds can sing. Birds fly in the sky.",
    "Fish live underwater. Fish come in many colors."
]

# Define a function to calculate Laplace-smoothed bigram models
def laplace_smoothed_bigram_model(document):
    words = document.split()
    total_words = len(words)
    bigram_counts = defaultdict(int)
    
    # Initialize all bigrams with a count of 1 (Laplace smoothing)
    for i in range(total_words - 1):
        bigram = (words[i], words[i + 1])
        bigram_counts[bigram] += 1
    
    # Calculate Laplace-smoothed probabilities
    bigram_model = {bigram: (count + 1) / (total_words + total_words) for bigram, count in bigram_counts.items()}
    return bigram_model

# Create Laplace-smoothed bigram models for all documents
laplace_smoothed_bigram_models = [laplace_smoothed_bigram_model(doc) for doc in documents]

# Print Laplace-smoothed bigram models
for i, model in enumerate(laplace_smoothed_bigram_models):
    print(f"Document {i + 1} Laplace-Smoothed Bigram Model:")
    for bigram, probability in model.items():
        print(f"{bigram[0]} {bigram[1]}: {probability:.4f}")
    print()

Document 1 Laplace-Smoothed Bigram Model:
I love: 0.1429
love cats.: 0.1429
cats. Cats: 0.1429
Cats are: 0.1429
are cute: 0.1429
cute pets.: 0.1429

Document 2 Laplace-Smoothed Bigram Model:
Dogs are: 0.2143
are loyal.: 0.1429
loyal. Dogs: 0.1429
are good: 0.1429
good friends.: 0.1429

Document 3 Laplace-Smoothed Bigram Model:
Birds can: 0.1250
can sing.: 0.1250
sing. Birds: 0.1250
Birds fly: 0.1250
fly in: 0.1250
in the: 0.1250
the sky.: 0.1250

Document 4 Laplace-Smoothed Bigram Model:
Fish live: 0.1250
live underwater.: 0.1250
underwater. Fish: 0.1250
Fish come: 0.1250
come in: 0.1250
in many: 0.1250
many colors.: 0.1250



In [16]:
from collections import defaultdict

# Define the documents
documents = [
    "I love cats. Cats are cute pets.",
    "Dogs are loyal. Dogs are good friends.",
    "Birds can sing. Birds fly in the sky.",
    "Fish live underwater. Fish come in many colors."
]

# Define a function to calculate Laplace-smoothed bigram models
def laplace_smoothed_bigram_model(document):
    words = document.split()
    total_words = len(words)
    bigram_counts = defaultdict(int)
    
    # Initialize all bigrams with a count of 1 (Laplace smoothing)
    for i in range(total_words - 1):
        bigram = (words[i], words[i + 1])
        bigram_counts[bigram] += 1
    
    # Calculate Laplace-smoothed probabilities
    bigram_model = {bigram: (count + 1) / (total_words + total_words) for bigram, count in bigram_counts.items()}
    return bigram_model

# Create Laplace-smoothed bigram models for all documents
laplace_smoothed_bigram_models = [laplace_smoothed_bigram_model(doc) for doc in documents]

# Print Laplace-smoothed bigram models
for i, model in enumerate(laplace_smoothed_bigram_models):
    print(f"Document {i + 1} Laplace-Smoothed Bigram Model:")
    for bigram, probability in model.items():
        print(f"{bigram[0]} {bigram[1]}: {probability:.4f}")
    print()

# Define a query
query = "I like cats and dogs"

def calculate_query_probability(query, document_model):
    query_words = query.split()
    query_bigrams = [(query_words[i], query_words[i + 1]) for i in range(len(query_words) - 1)]
    
    query_probability = 1.0
    
    for bigram in query_bigrams:
        # Apply Laplace smoothing for unseen bigrams using the document_model
        bigram_probability = document_model.get(bigram, 1 / (len(document_model) + len(laplace_smoothed_bigram_models[0])))
        query_probability *= bigram_probability

    return query_probability

# Calculate query probability for each document
query_probabilities = []

for document_model in laplace_smoothed_bigram_models:
    probability = calculate_query_probability(query, document_model)
    query_probabilities.append(probability)

# Find the document with the highest probability for the query
most_probable_document_index = query_probabilities.index(max(query_probabilities))
most_probable_document = documents[most_probable_document_index]

print("Query Bigram Probabilities for Each Document:")
for i, probability in enumerate(query_probabilities):
    print(f"Document {i + 1}: {probability:.6f}")

print("The most probable document for the query is:", most_probable_document)

Document 1 Laplace-Smoothed Bigram Model:
I love: 0.1429
love cats.: 0.1429
cats. Cats: 0.1429
Cats are: 0.1429
are cute: 0.1429
cute pets.: 0.1429

Document 2 Laplace-Smoothed Bigram Model:
Dogs are: 0.2143
are loyal.: 0.1429
loyal. Dogs: 0.1429
are good: 0.1429
good friends.: 0.1429

Document 3 Laplace-Smoothed Bigram Model:
Birds can: 0.1250
can sing.: 0.1250
sing. Birds: 0.1250
Birds fly: 0.1250
fly in: 0.1250
in the: 0.1250
the sky.: 0.1250

Document 4 Laplace-Smoothed Bigram Model:
Fish live: 0.1250
live underwater.: 0.1250
underwater. Fish: 0.1250
Fish come: 0.1250
come in: 0.1250
in many: 0.1250
many colors.: 0.1250

Query Bigram Probabilities for Each Document:
Document 1: 0.000048
Document 2: 0.000068
Document 3: 0.000035
Document 4: 0.000035
The most probable document for the query is: Dogs are loyal. Dogs are good friends.
