In [2]:
# Import necessary libraries
from collections import defaultdict

# Define the documents
documents = [
    "I love cats . cats are cute pets.",
    "Dogs are loyal. Dogs are good friends.",
    "Birds can sing. Birds fly in the sky.",
    "Fish live underwater. Fish come in many colors."
]

In this section, we define and create unigram models for the documents. Unigrams are single words or terms, and a unigram model represents the probability distribution of individual terms in the document. The unigram_model function counts the occurrences of each term in a document, calculates the probabilities, and returns the unigram model. We create unigram models for all documents in the collection.

# Create Unigram Models

In [3]:
def unigram_model(document):
    words = document.split()
    total_words = len(words)
    unigram_counts = defaultdict(int)
    for word in words:
        unigram_counts[word] += 1
    unigram_model = {word: count / total_words for word, count in unigram_counts.items()}
    return unigram_model

In [4]:
# Create unigram models for all documents
unigram_models = [unigram_model(doc) for doc in documents]

In [5]:
unigram_models

[{'I': 0.125,
  'love': 0.125,
  'cats': 0.25,
  '.': 0.125,
  'are': 0.125,
  'cute': 0.125,
  'pets.': 0.125},
 {'Dogs': 0.2857142857142857,
  'are': 0.2857142857142857,
  'loyal.': 0.14285714285714285,
  'good': 0.14285714285714285,
  'friends.': 0.14285714285714285},
 {'Birds': 0.25,
  'can': 0.125,
  'sing.': 0.125,
  'fly': 0.125,
  'in': 0.125,
  'the': 0.125,
  'sky.': 0.125},
 {'Fish': 0.25,
  'live': 0.125,
  'underwater.': 0.125,
  'come': 0.125,
  'in': 0.125,
  'many': 0.125,
  'colors.': 0.125}]

In [6]:
#we have a query 
query = "I like cats and dogs"

In [7]:
def calculate_query_probability(query, document_model):
    # Tokenize the query into words
    query_words = query.split()
    
    # Initialize the probability for the entire query
    query_probability = 1.0
    
    # Calculate the probability for each term in the query
    for word in query_words:
        if word in document_model:
            query_probability *= document_model[word]
        else:
            query_probability = 0.0
            break
    
    return query_probability

In [20]:
query_probability = calculate_query_probability(query, unigram_models)

In [21]:
query_probability

0.0

## Your task

In [1]:
# Use Laplace Smoothing for this problem

# https://www.exploredatabase.com/2020/10/explain-add-1-laplace-smoothing-with-example.html
def calculate_query_probabilities(query, document_models):
    query_probabilities = []
    
    for document_model in document_models:
        query_probability = calculate_query_probability(query, document_model)
        query_probabilities.append(query_probability)
    
    return query_probabilities

In [10]:
document_models = create_document_models(documents)

query_probabilities = calculate_query_probabilities(query, document_models)

for i, probability in enumerate(query_probabilities):
    print("Document", i+1, "Query Probability:", probability)

Document 1 Query Probability: Ellipsis


In [9]:
# Define the documents
documents = [
    "I love cats . cats are cute pets.",
    "Dogs are loyal. Dogs are good friends.",
    "Birds can sing. Birds fly in the sky.",
    "Fish live underwater. Fish come in many colors."
]

def preprocess(document):
    # Preprocess the document by tokenizing and removing punctuation
    # You can customize this function based on your requirements
    document = document.lower()
    document = document.replace(".", " .")
    tokens = document.split()
    return tokens

def create_bigram_model(documents, k=1):
    # Step 2: Count the bigram occurrences
    bigram_counts = defaultdict(lambda: defaultdict(int))
    for document in documents:
        tokens = preprocess(document)
        for i in range(len(tokens) - 1):
            current_word = tokens[i]
            next_word = tokens[i + 1]
            bigram_counts[current_word][next_word] += 1

    # Step 3: Compute the bigram probabilities
    bigram_probabilities = defaultdict(lambda: defaultdict(float))
    for current_word, next_words in bigram_counts.items():
        total_count = sum(next_words.values())
        for next_word, count in next_words.items():
            bigram_probabilities[current_word][next_word] = (count + k) / (total_count + k * len(next_words))

    return bigram_probabilities

# Example usage
bigram_model = create_bigram_model(documents, k=0.5)

# Print the bigram probabilities
for current_word, next_words in bigram_model.items():
    for next_word, probability in next_words.items():
        print(f"Bigram: {current_word} {next_word}, Probability: {probability}")

Bigram: i love, Probability: 1.0
Bigram: love cats, Probability: 1.0
Bigram: cats ., Probability: 0.5
Bigram: cats are, Probability: 0.5
Bigram: . cats, Probability: 0.25
Bigram: . dogs, Probability: 0.25
Bigram: . birds, Probability: 0.25
Bigram: . fish, Probability: 0.25
Bigram: are cute, Probability: 0.3333333333333333
Bigram: are loyal, Probability: 0.3333333333333333
Bigram: are good, Probability: 0.3333333333333333
Bigram: cute pets, Probability: 1.0
Bigram: pets ., Probability: 1.0
Bigram: dogs are, Probability: 1.0
Bigram: loyal ., Probability: 1.0
Bigram: good friends, Probability: 1.0
Bigram: friends ., Probability: 1.0
Bigram: birds can, Probability: 0.5
Bigram: birds fly, Probability: 0.5
Bigram: can sing, Probability: 1.0
Bigram: sing ., Probability: 1.0
Bigram: fly in, Probability: 1.0
Bigram: in the, Probability: 0.5
Bigram: in many, Probability: 0.5
Bigram: the sky, Probability: 1.0
Bigram: sky ., Probability: 1.0
Bigram: fish live, Probability: 0.5
Bigram: fish come, P