In [28]:
# Import necessary libraries
from collections import defaultdict

# Define the documents
documents = [
    "I love cats . cats are cute pets.",
    "Dogs are loyal. Dogs are good friends.",
    "Birds can sing. Birds fly in the sky.",
    "Fish live underwater. Fish come in many colors."
]

In this section, we define and create unigram models for the documents. Unigrams are single words or terms, and a unigram model represents the probability distribution of individual terms in the document. The unigram_model function counts the occurrences of each term in a document, calculates the probabilities, and returns the unigram model. We create unigram models for all documents in the collection.

# Create Unigram Models

In [29]:
def unigram_model(document):
    words = document.split()
    total_words = len(words)
    unigram_counts = defaultdict(int)
    for word in words:
        unigram_counts[word] += 1
    unigram_model = {word: count / total_words for word, count in unigram_counts.items()}
    return unigram_model

In [30]:
# Create unigram models for all documents
unigram_models = [unigram_model(doc) for doc in documents]

In [31]:
unigram_models

[{'I': 0.125,
  'love': 0.125,
  'cats': 0.25,
  '.': 0.125,
  'are': 0.125,
  'cute': 0.125,
  'pets.': 0.125},
 {'Dogs': 0.2857142857142857,
  'are': 0.2857142857142857,
  'loyal.': 0.14285714285714285,
  'good': 0.14285714285714285,
  'friends.': 0.14285714285714285},
 {'Birds': 0.25,
  'can': 0.125,
  'sing.': 0.125,
  'fly': 0.125,
  'in': 0.125,
  'the': 0.125,
  'sky.': 0.125},
 {'Fish': 0.25,
  'live': 0.125,
  'underwater.': 0.125,
  'come': 0.125,
  'in': 0.125,
  'many': 0.125,
  'colors.': 0.125}]

In [32]:
unigram_model

<function __main__.unigram_model(document)>

In [33]:
#we have a query 
query = "I like cats and dogs"

In [34]:
def calculate_query_probability(query, document_model):
    # Tokenize the query into words
    query_words = query.split()
    
    # Initialize the probability for the entire query
    query_probability = 1.0
    
    # Calculate the probability for each term in the query
    for word in query_words:
        if word in document_model:
            query_probability *= document_model[word]
        else:
            query_probability = 0.0
            break
    
    return query_probability

In [35]:
query_probability = calculate_query_probability(query, unigram_models)

In [36]:
query_probability

0.0

## Your task

In [37]:
#find the vocabulary size

def vocab(documents):
    words = documents.split()
    vocab = []
    for word in words:
        if words not in vocab:
            vocab.append(word)
    
    return vocab


unique_words = [vocab(doc) for doc in documents]
    

In [38]:
unique_words

[['I', 'love', 'cats', '.', 'cats', 'are', 'cute', 'pets.'],
 ['Dogs', 'are', 'loyal.', 'Dogs', 'are', 'good', 'friends.'],
 ['Birds', 'can', 'sing.', 'Birds', 'fly', 'in', 'the', 'sky.'],
 ['Fish', 'live', 'underwater.', 'Fish', 'come', 'in', 'many', 'colors.']]

In [39]:
# Use Laplace Smoothing for this problem

#https://www.exploredatabase.com/2020/10/explain-add-1-laplace-smoothing-with-example.html

def unigram_laplace(document):
    words = document.split()
    total_words = len(words)
    unigram_counts = defaultdict(int)
    for word in words:
        unigram_counts[word] += 1
    unigram_model = {word: ((count + 1) / (total_words + len(unigram_counts))) for word, count in unigram_counts.items()}
    return unigram_model


unigram_laplace = [unigram_laplace(doc) for doc in documents]

In [40]:
unigram_laplace

[{'I': 0.13333333333333333,
  'love': 0.13333333333333333,
  'cats': 0.2,
  '.': 0.13333333333333333,
  'are': 0.13333333333333333,
  'cute': 0.13333333333333333,
  'pets.': 0.13333333333333333},
 {'Dogs': 0.25,
  'are': 0.25,
  'loyal.': 0.16666666666666666,
  'good': 0.16666666666666666,
  'friends.': 0.16666666666666666},
 {'Birds': 0.2,
  'can': 0.13333333333333333,
  'sing.': 0.13333333333333333,
  'fly': 0.13333333333333333,
  'in': 0.13333333333333333,
  'the': 0.13333333333333333,
  'sky.': 0.13333333333333333},
 {'Fish': 0.2,
  'live': 0.13333333333333333,
  'underwater.': 0.13333333333333333,
  'come': 0.13333333333333333,
  'in': 0.13333333333333333,
  'many': 0.13333333333333333,
  'colors.': 0.13333333333333333}]

In [41]:
unigram_laplace[0].keys()
unique_word_len = sum(unigram_laplace[0].values())
# unique_word_len = (unigram_laplace[0].values())
print(unique_word_len)

0.9999999999999999


In [50]:


def calculate_query_probab_laplace(query, document_model, document):
    # Tokenize the query into words
    query_words = query.split()

    N = len(document.split())
    # print(N)
    
    unique_word_len = len(document_model.keys())
    
    # Initialize the probability for the entire query
    query_probability = 1.0
    
    # Calculate the probability for each term in the query
    for word in query_words:
        if word in document_model:
            query_probability *= document_model[word]
        else:
            query_probability = 1/(N + unique_word_len)
            break
    
    return query_probability

# query1 = calculate_query_probab_laplace(query, unigram_laplace[0], documents[0])

In [51]:
# query1
for i in range(len(documents)-1):
    query1 = calculate_query_probab_laplace(query, unigram_laplace[i], documents[i])
    print("Document: ", i+1, ": ", query1)

Document:  1 :  0.06666666666666667
Document:  2 :  0.08333333333333333
Document:  3 :  0.06666666666666667


In [44]:
words = documents[0].split()
count = words.count('cats')
count

2

In [45]:
# create a bigram model & apply smoothing method
def bigram_laplace(document):
    words = document.split()
    total_words = len(words)
    bigram_counts = defaultdict(int)
    for i in range(len(words)-1):
        bigram_counts[(words[i], words[i+1])] += 1
        #print(bigram_counts)
    for i in range(len(words)-1):
        word =[words[i], words[i+1]]
        # word_count = words.count(word[1])
        bigram_model = {word: (count + 1) / (words.count(word[1]) + len(bigram_counts)) for word, count in bigram_counts.items()}
    return bigram_model

In [46]:
bigrams = [bigram_laplace(doc) for doc in documents]
# bigrams = [bigram_laplace(documents[0])]
bigrams

[{('I', 'love'): 0.25,
  ('love', 'cats'): 0.2222222222222222,
  ('cats', '.'): 0.25,
  ('.', 'cats'): 0.2222222222222222,
  ('cats', 'are'): 0.25,
  ('are', 'cute'): 0.25,
  ('cute', 'pets.'): 0.25},
 {('Dogs', 'are'): 0.42857142857142855,
  ('are', 'loyal.'): 0.3333333333333333,
  ('loyal.', 'Dogs'): 0.2857142857142857,
  ('are', 'good'): 0.3333333333333333,
  ('good', 'friends.'): 0.3333333333333333},
 {('Birds', 'can'): 0.25,
  ('can', 'sing.'): 0.25,
  ('sing.', 'Birds'): 0.2222222222222222,
  ('Birds', 'fly'): 0.25,
  ('fly', 'in'): 0.25,
  ('in', 'the'): 0.25,
  ('the', 'sky.'): 0.25},
 {('Fish', 'live'): 0.25,
  ('live', 'underwater.'): 0.25,
  ('underwater.', 'Fish'): 0.2222222222222222,
  ('Fish', 'come'): 0.25,
  ('come', 'in'): 0.25,
  ('in', 'many'): 0.25,
  ('many', 'colors.'): 0.25}]

In [47]:
def calc_query_probab_bigram_laplace(query, document_model,document):
    # Tokenize the query into words
    query_words = query.split()
    docs = document.split()
    word_count = len(docs)

    # Initialize the probability for the entire query
    query_probability = 1.0
    
    # Calculate the probability for each term in the query
    for i in range(len(query_words)-1):
        word = [query_words[i], query_words[i+1]]
        if word in document_model:
            query_probability *= document_model[word]
        else:
            count = docs.count(word[1])
            query_probability *= 1/(count + len(document_model)) 
    
    return query_probability

query2 = calculate_query_probab_laplace(query, bigrams[0], documents[0])
print(query2)

0.06666666666666667


In [48]:
for i in range(len(documents)-1):
    query2 = calculate_query_probab_laplace(query, bigrams[0], documents[0])
    print("Document ", i + 1, ": ", query2)

Document  1 :  0.06666666666666667
Document  2 :  0.06666666666666667
Document  3 :  0.06666666666666667
