In [1]:
# Import necessary libraries
from collections import defaultdict

# Define the documents
documents = [
    "I love cats . cats are cute pets.",
    "Dogs are loyal. Dogs are good friends.",
    "Birds can sing. Birds fly in the sky.",
    "Fish live underwater. Fish come in many colors."
]

In this section, we define and create unigram models for the documents. Unigrams are single words or terms, and a unigram model represents the probability distribution of individual terms in the document. The unigram_model function counts the occurrences of each term in a document, calculates the probabilities, and returns the unigram model. We create unigram models for all documents in the collection.

# Create Unigram Models

In [2]:
def unigram_model(document):
    words = document.split()
    total_words = len(words)
    unigram_counts = defaultdict(int)
    for word in words:
        unigram_counts[word] += 1
    unigram_model = {word: count / total_words for word, count in unigram_counts.items()}
    return unigram_model

In [3]:
# Create unigram models for all documents
unigram_models = [unigram_model(doc) for doc in documents]

In [4]:
unigram_models

[{'I': 0.125,
  'love': 0.125,
  'cats': 0.25,
  '.': 0.125,
  'are': 0.125,
  'cute': 0.125,
  'pets.': 0.125},
 {'Dogs': 0.2857142857142857,
  'are': 0.2857142857142857,
  'loyal.': 0.14285714285714285,
  'good': 0.14285714285714285,
  'friends.': 0.14285714285714285},
 {'Birds': 0.25,
  'can': 0.125,
  'sing.': 0.125,
  'fly': 0.125,
  'in': 0.125,
  'the': 0.125,
  'sky.': 0.125},
 {'Fish': 0.25,
  'live': 0.125,
  'underwater.': 0.125,
  'come': 0.125,
  'in': 0.125,
  'many': 0.125,
  'colors.': 0.125}]

In [5]:
#we have a query 
query = "I like cats and dogs"

In [6]:
def calculate_query_probability(query, document_model):
    # Tokenize the query into words
    query_words = query.split()
    
    # Initialize the probability for the entire query
    query_probability = 1.0
    
    # Calculate the probability for each term in the query
    for word in query_words:
        if word in document_model:
            query_probability *= document_model[word]
        else:
            query_probability = 0.0
            break
    
    return query_probability

In [7]:
query_probability = calculate_query_probability(query, unigram_models)

In [8]:
query_probability

0.0

## Your task

In [24]:
# Use Laplace Smoothing for this problem
def laplace_smoothing(documents, alpha=1):
    # Define a small constant for smoothing
    alpha = 1

    # Create a vocabulary set to keep track of unique words
    vocab = set()

    # Count the occurrences of each word in the documents
    word_counts = {}
    total_word_count = 0

    for document in documents:
        for word in document.split():
            word_counts[word] = word_counts.get(word, 0) + 1
            total_word_count += 1
            vocab.add(word)

    # Calculate probabilities with Laplace smoothing
    word_probabilities = {}

    for word in vocab:
        word_probabilities[word] = (word_counts.get(word, 0) + alpha) / (total_word_count + (alpha * len(vocab)))

    return word_probabilities


def search_documents(query, documents, word_probabilities):
    query_words = query.split()
    document_scores = {}

    for i, document in enumerate(documents):
        score = 1.0
        for word in query_words:
            if word in word_probabilities:
                score *= word_probabilities[word]
            else:
                # Laplace smoothing for unseen words
                score *= alpha / (total_word_count + (alpha * len(vocab)))

        document_scores[i] = score

    # Sort documents by score in descending order
    sorted_documents = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_documents


# Example usage
documents = [
    "I love cats . cats are cute pets.",
    "Dogs are loyal. Dogs are good friends.",
    "Birds can sing. Birds fly in the sky.",
    "Fish live underwater. Fish come in many colors."
]

# Build word probabilities using Laplace smoothing
word_probabilities = laplace_smoothing(documents)

# Search for documents relevant to a query
query = "I love cats"
results = search_documents(query, documents, word_probabilities)

# Print the results
for doc_index, score in results:
    print(f'Document {doc_index+1} - Score: {score:.4f} - Content: {documents[doc_index]}')

# https://www.exploredatabase.com/2020/10/explain-add-1-laplace-smoothing-with-example.html


Document 1 - Score: 0.0001 - Content: I love cats . cats are cute pets.
Document 2 - Score: 0.0001 - Content: Dogs are loyal. Dogs are good friends.
Document 3 - Score: 0.0001 - Content: Birds can sing. Birds fly in the sky.
Document 4 - Score: 0.0001 - Content: Fish live underwater. Fish come in many colors.


In [26]:
# create a bigram model & apply smoothing method
def laplace_bigram_smoothing(documents, alpha=1):
    # Define a small constant for smoothing
    alpha = 1

    # Create a vocabulary set to keep track of unique words
    vocab = set()

    # Count the occurrences of each bigram in the documents
    bigram_counts = {}
    unigram_counts = {}

    for document in documents:
        words = document.split()
        for i in range(len(words)-1):
            bigram = (words[i], words[i+1])
            bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1
            unigram_counts[words[i]] = unigram_counts.get(words[i], 0) + 1
            vocab.add(words[i])

    # Calculate probabilities with Laplace smoothing
    bigram_probabilities = {}

    for bigram in bigram_counts:
        prev_word = bigram[0]
        bigram_probabilities[bigram] = (bigram_counts.get(bigram, 0) + alpha) / (unigram_counts.get(prev_word, 0) + (alpha * len(vocab)))

    return bigram_probabilities, vocab


def search_documents_bigram(query, documents, bigram_model):
    bigram_probabilities, vocab = bigram_model
    query_words = query.split()
    relevant_documents = []

    for i, document in enumerate(documents):
        document_probability = 1.0
        words = document.split()
        for j in range(len(words)-1):
            bigram = (words[j], words[j+1])
            if bigram in bigram_probabilities:
                document_probability *= bigram_probabilities[bigram]
            else:
                # Laplace smoothing for unseen bigrams
                document_probability *= alpha / (unigram_counts.get(prev_word, 0) + (alpha * len(vocab)))

        relevant_documents.append((document, document_probability))

    # Sort documents by probability in descending order
    relevant_documents.sort(key=lambda x: x[1], reverse=True)

    return relevant_documents


# Example usage
documents = [
    "I love cats . cats are cute pets.",
    "Dogs are loyal. Dogs are good friends.",
    "Birds can sing. Birds fly in the sky.",
    "Fish live underwater. Fish come in many colors."
]

# Build the bigram model with Laplace smoothing
bigram_model = laplace_bigram_smoothing(documents)

# Search for documents relevant to a query using bigram model
query = "I love cats"
results = search_documents_bigram(query, documents, bigram_model)

# Print the results
for document, probability in results:
    print(f'Probability: {probability:.4f} - Document: {document}')


Probability: 0.0000 - Document: Dogs are loyal. Dogs are good friends.
Probability: 0.0000 - Document: Birds can sing. Birds fly in the sky.
Probability: 0.0000 - Document: Fish live underwater. Fish come in many colors.
Probability: 0.0000 - Document: I love cats . cats are cute pets.


1