In [1]:
# Import necessary libraries
from collections import defaultdict

# Define the documents
documents = [
    "I love cats . cats are cute pets.",
    "Dogs are loyal. Dogs are good friends.",
    "Birds can sing. Birds fly in the sky.",
    "Fish live underwater. Fish come in many colors."
]

In this section, we define and create unigram models for the documents. Unigrams are single words or terms, and a unigram model represents the probability distribution of individual terms in the document. The unigram_model function counts the occurrences of each term in a document, calculates the probabilities, and returns the unigram model. We create unigram models for all documents in the collection.

# Create Unigram Models

In [2]:
def unigram_model(document):
    words = document.split()
    total_words = len(words)
    unigram_counts = defaultdict(int)
    for word in words:
        unigram_counts[word] += 1
    unigram_model = {word: count / total_words for word, count in unigram_counts.items()}
    return unigram_model

In [3]:
# Create unigram models for all documents
unigram_models = [unigram_model(doc) for doc in documents]

In [4]:
unigram_models

[{'I': 0.125,
  'love': 0.125,
  'cats': 0.25,
  '.': 0.125,
  'are': 0.125,
  'cute': 0.125,
  'pets.': 0.125},
 {'Dogs': 0.2857142857142857,
  'are': 0.2857142857142857,
  'loyal.': 0.14285714285714285,
  'good': 0.14285714285714285,
  'friends.': 0.14285714285714285},
 {'Birds': 0.25,
  'can': 0.125,
  'sing.': 0.125,
  'fly': 0.125,
  'in': 0.125,
  'the': 0.125,
  'sky.': 0.125},
 {'Fish': 0.25,
  'live': 0.125,
  'underwater.': 0.125,
  'come': 0.125,
  'in': 0.125,
  'many': 0.125,
  'colors.': 0.125}]

In [5]:
#we have a query 
query = "I like cats and dogs"

In [6]:
def calculate_query_probability(query, document_model):
    # Tokenize the query into words
    query_words = query.split()
    
    # Initialize the probability for the entire query
    query_probability = 1.0
    
    # Calculate the probability for each term in the query
    for word in query_words:
        if word in document_model:
            query_probability *= document_model[word]
        else:
            query_probability = 0.0
            break
    
    return query_probability

In [26]:
query_probability = calculate_query_probability(query, unigram_models)

In [8]:
query_probability

0.0

## Your task

In [43]:
# Use Laplace Smoothing for this problem

# https://www.exploredatabase.com/2020/10/explain-add-1-laplace-smoothing-with-example.html
def unigram_model(document):
    words = document.split()
    total_words = len(words)
    unigram_counts = defaultdict(int)
    for word in words:
        unigram_counts[word] += 1
    unigram_model = {word: (count + 1) / (total_words + len(unigram_counts)) for word, count in unigram_counts.items()}
    return unigram_model
unigram_models = [unigram_model(doc) for doc in documents]



In [44]:
unigram_models

[{'I': 0.13333333333333333,
  'love': 0.13333333333333333,
  'cats': 0.2,
  '.': 0.13333333333333333,
  'are': 0.13333333333333333,
  'cute': 0.13333333333333333,
  'pets.': 0.13333333333333333},
 {'Dogs': 0.25,
  'are': 0.25,
  'loyal.': 0.16666666666666666,
  'good': 0.16666666666666666,
  'friends.': 0.16666666666666666},
 {'Birds': 0.2,
  'can': 0.13333333333333333,
  'sing.': 0.13333333333333333,
  'fly': 0.13333333333333333,
  'in': 0.13333333333333333,
  'the': 0.13333333333333333,
  'sky.': 0.13333333333333333},
 {'Fish': 0.2,
  'live': 0.13333333333333333,
  'underwater.': 0.13333333333333333,
  'come': 0.13333333333333333,
  'in': 0.13333333333333333,
  'many': 0.13333333333333333,
  'colors.': 0.13333333333333333}]

In [45]:
query = "I like cats and dogs"

In [46]:
def calculate_query_probability(query, document_models, smoothing_factor=1):
    query_probability = 1.0
    
    for document_model in document_models:
        document_probability = 1.0
        total_words = sum(document_model.values())  # Total word count in the document_model
        for word in query.split():
            if word in document_model:
                document_probability *= document_model[word]
            else:
                vocabulary_size = len(document_model)
                document_probability *= smoothing_factor / (total_words + smoothing_factor * vocabulary_size)
        query_probability *= document_probability

    return query_probability


In [47]:
query_probability = calculate_query_probability(query, unigram_models, smoothing_factor=1)
print("Query Probability with Laplace Smoothing:", query_probability)

Query Probability with Laplace Smoothing: 6.237960917509796e-18


In [52]:
from collections import defaultdict

def create_bigram_model(documents, smoothing_factor=1):
    # Tokenize the documents and create a list of words
    words = [document.split() for document in documents]
    
    # Initialize counts for bigrams and vocabulary size
    bigram_counts = defaultdict(int)
    vocab_size = 0
    
    # Count the bigrams and vocabulary size
    for document in words:
        for i in range(len(document) - 1):
            bigram = (document[i], document[i + 1])
            bigram_counts[bigram] += 1
            vocab_size += 1

    bigram_model = {}
    for bigram, count in bigram_counts.items():
        preceding_word = bigram[0]
        bigram_model[bigram] = (count + smoothing_factor) / (words.count(preceding_word) + (smoothing_factor * vocab_size))
    
    return bigram_model

# Create a bigram model for the documents
bigram_model = create_bigram_model(documents)

# Print the probability of a specific bigram, e.g., ("I", "love")
probability = bigram_model.get(("I", "love"), 0.0)
print("Probability of ('I', 'love'): {:.4f}".format(probability))


Probability of ('I', 'love'): 0.0741
