# No Laplace ngram

In [26]:
def sentence_probability(corpus, sentence):
    list_len = len(corpus)
    print(f"Corpus length: {list_len}")
    print("===== STEP 1: TOKENIZE CORPUS =====")
    tokenized_corpus = []
    for sent in corpus:
        tokens = sent.lower().split()
        tokenized_corpus.append(tokens)
        print(f"Sentence: '{sent}' -> Tokens: {tokens}")

    print("\n===== STEP 2: COUNT UNIGRAMS =====")
    unigram_counts = {}
    total_unigrams = 0

    for sent in tokenized_corpus:
        for word in sent:
            unigram_counts[word] = unigram_counts.get(word, 0) + 1
            total_unigrams += 1

    for word, count in unigram_counts.items():
        print(f"Unigram '{word}': {count}")

    print(f"Total unigrams in corpus: {total_unigrams}")

    print("\n===== STEP 3: COUNT BIGRAMS =====")
    bigram_counts = {}

    for sent in tokenized_corpus:
        for i in range(len(sent) - 1):
            bigram = (sent[i], sent[i + 1])
            bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

    for bigram, count in bigram_counts.items():
        print(f"Bigram {bigram}: {count}")

    print("\n===== STEP 4: TOKENIZE INPUT SENTENCE =====")
    sentence_tokens = sentence.lower().split()
    print(f"Input sentence tokens: {sentence_tokens}")

    print("\n===== STEP 5: CALCULATE PROBABILITY =====")

    probability = 1.0

    # P(w1)
    first_word = sentence_tokens[0]
    if first_word not in unigram_counts:
        print(f"P({first_word}) = 0 (word not in corpus)")
        return 0.0

    p_first = unigram_counts[first_word] / list_len
    probability *= p_first

    print(f"P({first_word}) = {unigram_counts[first_word]} / {list_len} = {p_first}")

    # Conditional probabilities
    for i in range(len(sentence_tokens) - 1):
        prev_word = sentence_tokens[i]
        curr_word = sentence_tokens[i + 1]
        bigram = (prev_word, curr_word)

        if bigram not in bigram_counts:
            print(f"P({curr_word} | {prev_word}) = 0 (bigram not found)")
            return 0.0

        p_conditional = bigram_counts[bigram] / unigram_counts[prev_word]
        probability *= p_conditional

        print(
            f"P({curr_word} | {prev_word}) = "
            f"{bigram_counts[bigram]} / {unigram_counts[prev_word]} = {p_conditional}"
        )

    print("\n===== FINAL RESULT =====")
    print(f"Final sentence probability: {probability}")

    return probability


In [28]:
corpus = [
    "I love comp sci ",
    "We love comp vis ",
    "They love comp sci ",
    "We hate comp sci ",
]

# sentence = "we love comp sci "
sentence = "WE LOVE MATHS  "

sentence_probability(corpus, sentence)


Corpus length: 4
===== STEP 1: TOKENIZE CORPUS =====
Sentence: 'I love comp sci ' -> Tokens: ['i', 'love', 'comp', 'sci']
Sentence: 'We love comp vis ' -> Tokens: ['we', 'love', 'comp', 'vis']
Sentence: 'They love comp sci ' -> Tokens: ['they', 'love', 'comp', 'sci']
Sentence: 'We hate comp sci ' -> Tokens: ['we', 'hate', 'comp', 'sci']

===== STEP 2: COUNT UNIGRAMS =====
Unigram 'i': 1
Unigram 'love': 3
Unigram 'comp': 4
Unigram 'sci': 3
Unigram 'we': 2
Unigram 'vis': 1
Unigram 'they': 1
Unigram 'hate': 1
Total unigrams in corpus: 16

===== STEP 3: COUNT BIGRAMS =====
Bigram ('i', 'love'): 1
Bigram ('love', 'comp'): 3
Bigram ('comp', 'sci'): 3
Bigram ('we', 'love'): 1
Bigram ('comp', 'vis'): 1
Bigram ('they', 'love'): 1
Bigram ('we', 'hate'): 1
Bigram ('hate', 'comp'): 1

===== STEP 4: TOKENIZE INPUT SENTENCE =====
Input sentence tokens: ['we', 'love', 'maths']

===== STEP 5: CALCULATE PROBABILITY =====
P(we) = 2 / 4 = 0.5
P(love | we) = 1 / 2 = 0.5
P(maths | love) = 0 (bigram not fou

0.0

# Laplace

In [35]:
def sentence_probability_lap(corpus, sentence):
    list_len = len(corpus)
    print(f"Corpus length (number of sentences): {list_len}")

    print("===== STEP 1: TOKENIZE CORPUS =====")
    tokenized_corpus = []
    for sent in corpus:
        tokens = sent.lower().split()
        tokenized_corpus.append(tokens)
        print(f"Sentence: '{sent}' -> Tokens: {tokens}")

    print("\n===== STEP 2: COUNT UNIGRAMS =====")
    unigram_counts = {}
    total_unigrams = 0

    for sent in tokenized_corpus:
        for word in sent:
            unigram_counts[word] = unigram_counts.get(word, 0) + 1
            total_unigrams += 1

    for word, count in unigram_counts.items():
        print(f"Unigram '{word}': {count}")

    print(f"Total unigrams in corpus: {total_unigrams}")

    # -------- LAPLACE CHANGE 1 --------
    # Vocabulary size needed for Laplace smoothing
    V = len(unigram_counts)
    print(f"Vocabulary size (V): {V}")
    # ---------------------------------

    print("\n===== STEP 3: COUNT BIGRAMS =====")
    bigram_counts = {}

    for sent in tokenized_corpus:
        for i in range(len(sent) - 1):
            bigram = (sent[i], sent[i + 1])
            bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

    for bigram, count in bigram_counts.items():
        print(f"Bigram {bigram}: {count}")

    print("\n===== STEP 4: TOKENIZE INPUT SENTENCE =====")
    sentence_tokens = sentence.lower().split()
    print(f"Input sentence tokens: {sentence_tokens}")

    print("\n===== STEP 5: CALCULATE PROBABILITY (WITH LAPLACE) =====")

    probability = 1.0

    # -------- LAPLACE CHANGE 2 --------
    # P(w1) with Laplace smoothing
    first_word = sentence_tokens[0]
    first_word_count = unigram_counts.get(first_word, 0)

    p_first = (first_word_count + 1) / ( list_len)
    probability *= p_first

    print(
        f"P({first_word}) = ({first_word_count} + 1) / "
        f"({ list_len} ) = {p_first}"
    )
    # ---------------------------------

    # -------- LAPLACE CHANGE 3 --------
    # Conditional probabilities with Laplace smoothing
    for i in range(len(sentence_tokens) - 1):
        prev_word = sentence_tokens[i]
        curr_word = sentence_tokens[i + 1]
        bigram = (prev_word, curr_word)

        bigram_count = bigram_counts.get(bigram, 0)
        prev_word_count = unigram_counts.get(prev_word, 0)

        p_conditional = (bigram_count + 1) / (prev_word_count + V)
        probability *= p_conditional

        print(
            f"P({curr_word} | {prev_word}) = "
            f"({bigram_count} + 1) / ({prev_word_count} + {V}) = {p_conditional}"
        )
    # ---------------------------------

    print("\n===== FINAL RESULT =====")
    print(f"Final sentence probability (with Laplace smoothing): {probability}")

    return probability


In [36]:
corpus = [
    "I love comp sci ",
    "We love comp vis ",
    "They love comp sci ",
    "We hate comp sci ",
]

# sentence = "we love comp sci "
sentence = "WE LOVE MATHS  "

sentence_probability_lap(corpus, sentence)

Corpus length (number of sentences): 4
===== STEP 1: TOKENIZE CORPUS =====
Sentence: 'I love comp sci ' -> Tokens: ['i', 'love', 'comp', 'sci']
Sentence: 'We love comp vis ' -> Tokens: ['we', 'love', 'comp', 'vis']
Sentence: 'They love comp sci ' -> Tokens: ['they', 'love', 'comp', 'sci']
Sentence: 'We hate comp sci ' -> Tokens: ['we', 'hate', 'comp', 'sci']

===== STEP 2: COUNT UNIGRAMS =====
Unigram 'i': 1
Unigram 'love': 3
Unigram 'comp': 4
Unigram 'sci': 3
Unigram 'we': 2
Unigram 'vis': 1
Unigram 'they': 1
Unigram 'hate': 1
Total unigrams in corpus: 16
Vocabulary size (V): 8

===== STEP 3: COUNT BIGRAMS =====
Bigram ('i', 'love'): 1
Bigram ('love', 'comp'): 3
Bigram ('comp', 'sci'): 3
Bigram ('we', 'love'): 1
Bigram ('comp', 'vis'): 1
Bigram ('they', 'love'): 1
Bigram ('we', 'hate'): 1
Bigram ('hate', 'comp'): 1

===== STEP 4: TOKENIZE INPUT SENTENCE =====
Input sentence tokens: ['we', 'love', 'maths']

===== STEP 5: CALCULATE PROBABILITY (WITH LAPLACE) =====
P(we) = (2 + 1) / (4 +

0.004545454545454546

In [37]:
sentence = "i love comp sci"

sentence_probability_lap(corpus, sentence)

Corpus length (number of sentences): 4
===== STEP 1: TOKENIZE CORPUS =====
Sentence: 'I love comp sci ' -> Tokens: ['i', 'love', 'comp', 'sci']
Sentence: 'We love comp vis ' -> Tokens: ['we', 'love', 'comp', 'vis']
Sentence: 'They love comp sci ' -> Tokens: ['they', 'love', 'comp', 'sci']
Sentence: 'We hate comp sci ' -> Tokens: ['we', 'hate', 'comp', 'sci']

===== STEP 2: COUNT UNIGRAMS =====
Unigram 'i': 1
Unigram 'love': 3
Unigram 'comp': 4
Unigram 'sci': 3
Unigram 'we': 2
Unigram 'vis': 1
Unigram 'they': 1
Unigram 'hate': 1
Total unigrams in corpus: 16
Vocabulary size (V): 8

===== STEP 3: COUNT BIGRAMS =====
Bigram ('i', 'love'): 1
Bigram ('love', 'comp'): 3
Bigram ('comp', 'sci'): 3
Bigram ('we', 'love'): 1
Bigram ('comp', 'vis'): 1
Bigram ('they', 'love'): 1
Bigram ('we', 'hate'): 1
Bigram ('hate', 'comp'): 1

===== STEP 4: TOKENIZE INPUT SENTENCE =====
Input sentence tokens: ['i', 'love', 'comp', 'sci']

===== STEP 5: CALCULATE PROBABILITY (WITH LAPLACE) =====
P(i) = (1 + 1) / 

0.0044893378226711555

In [42]:
corpus = [
'patient has fever',
'patient has cough',
'patient has headache',
'doctor treats patient',
'doctor gives medicine',
'medicine reduces fever' ]

In [43]:
#no laplace
sentence_probability(corpus, "patient has fever")


Corpus length: 6
===== STEP 1: TOKENIZE CORPUS =====
Sentence: 'patient has fever' -> Tokens: ['patient', 'has', 'fever']
Sentence: 'patient has cough' -> Tokens: ['patient', 'has', 'cough']
Sentence: 'patient has headache' -> Tokens: ['patient', 'has', 'headache']
Sentence: 'doctor treats patient' -> Tokens: ['doctor', 'treats', 'patient']
Sentence: 'doctor gives medicine' -> Tokens: ['doctor', 'gives', 'medicine']
Sentence: 'medicine reduces fever' -> Tokens: ['medicine', 'reduces', 'fever']

===== STEP 2: COUNT UNIGRAMS =====
Unigram 'patient': 4
Unigram 'has': 3
Unigram 'fever': 2
Unigram 'cough': 1
Unigram 'headache': 1
Unigram 'doctor': 2
Unigram 'treats': 1
Unigram 'gives': 1
Unigram 'medicine': 2
Unigram 'reduces': 1
Total unigrams in corpus: 18

===== STEP 3: COUNT BIGRAMS =====
Bigram ('patient', 'has'): 3
Bigram ('has', 'fever'): 1
Bigram ('has', 'cough'): 1
Bigram ('has', 'headache'): 1
Bigram ('doctor', 'treats'): 1
Bigram ('treats', 'patient'): 1
Bigram ('doctor', 'gives'

0.16666666666666666

In [44]:
0#laplace
sentence_probability_lap(corpus, "patient has fever")

Corpus length (number of sentences): 6
===== STEP 1: TOKENIZE CORPUS =====
Sentence: 'patient has fever' -> Tokens: ['patient', 'has', 'fever']
Sentence: 'patient has cough' -> Tokens: ['patient', 'has', 'cough']
Sentence: 'patient has headache' -> Tokens: ['patient', 'has', 'headache']
Sentence: 'doctor treats patient' -> Tokens: ['doctor', 'treats', 'patient']
Sentence: 'doctor gives medicine' -> Tokens: ['doctor', 'gives', 'medicine']
Sentence: 'medicine reduces fever' -> Tokens: ['medicine', 'reduces', 'fever']

===== STEP 2: COUNT UNIGRAMS =====
Unigram 'patient': 4
Unigram 'has': 3
Unigram 'fever': 2
Unigram 'cough': 1
Unigram 'headache': 1
Unigram 'doctor': 2
Unigram 'treats': 1
Unigram 'gives': 1
Unigram 'medicine': 2
Unigram 'reduces': 1
Total unigrams in corpus: 18
Vocabulary size (V): 10

===== STEP 3: COUNT BIGRAMS =====
Bigram ('patient', 'has'): 3
Bigram ('has', 'fever'): 1
Bigram ('has', 'cough'): 1
Bigram ('has', 'headache'): 1
Bigram ('doctor', 'treats'): 1
Bigram ('tr

0.013736263736263736

In [48]:
sentence_probability_lap(corpus, "fever has patient")

Corpus length (number of sentences): 6
===== STEP 1: TOKENIZE CORPUS =====
Sentence: 'patient has fever' -> Tokens: ['patient', 'has', 'fever']
Sentence: 'patient has cough' -> Tokens: ['patient', 'has', 'cough']
Sentence: 'patient has headache' -> Tokens: ['patient', 'has', 'headache']
Sentence: 'doctor treats patient' -> Tokens: ['doctor', 'treats', 'patient']
Sentence: 'doctor gives medicine' -> Tokens: ['doctor', 'gives', 'medicine']
Sentence: 'medicine reduces fever' -> Tokens: ['medicine', 'reduces', 'fever']

===== STEP 2: COUNT UNIGRAMS =====
Unigram 'patient': 4
Unigram 'has': 3
Unigram 'fever': 2
Unigram 'cough': 1
Unigram 'headache': 1
Unigram 'doctor': 2
Unigram 'treats': 1
Unigram 'gives': 1
Unigram 'medicine': 2
Unigram 'reduces': 1
Total unigrams in corpus: 18
Vocabulary size (V): 10

===== STEP 3: COUNT BIGRAMS =====
Bigram ('patient', 'has'): 3
Bigram ('has', 'fever'): 1
Bigram ('has', 'cough'): 1
Bigram ('has', 'headache'): 1
Bigram ('doctor', 'treats'): 1
Bigram ('tr

0.001201923076923077

In [45]:
sentences = [
"patient has fever",
'fever has patient',
'doctor is medicine',
'fever is patient'

]

K smoothning

In [46]:
def sentence_probability_k(corpus, sentence , k ):
    list_len = len(corpus)
    print(f"Corpus length (number of sentences): {list_len}")

    print("===== STEP 1: TOKENIZE CORPUS =====")
    tokenized_corpus = []
    for sent in corpus:
        tokens = sent.lower().split()
        tokenized_corpus.append(tokens)
        print(f"Sentence: '{sent}' -> Tokens: {tokens}")

    print("\n===== STEP 2: COUNT UNIGRAMS =====")
    unigram_counts = {}
    total_unigrams = 0

    for sent in tokenized_corpus:
        for word in sent:
            unigram_counts[word] = unigram_counts.get(word, 0) + 1
            total_unigrams += 1

    for word, count in unigram_counts.items():
        print(f"Unigram '{word}': {count}")

    print(f"Total unigrams in corpus: {total_unigrams}")

    # -------- LAPLACE CHANGE 1 --------
    # Vocabulary size needed for Laplace smoothing
    V = len(unigram_counts)
    print(f"Vocabulary size (V): {V}")
    # ---------------------------------

    print("\n===== STEP 3: COUNT BIGRAMS =====")
    bigram_counts = {}

    for sent in tokenized_corpus:
        for i in range(len(sent) - 1):
            bigram = (sent[i], sent[i + 1])
            bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

    for bigram, count in bigram_counts.items():
        print(f"Bigram {bigram}: {count}")

    print("\n===== STEP 4: TOKENIZE INPUT SENTENCE =====")
    sentence_tokens = sentence.lower().split()
    print(f"Input sentence tokens: {sentence_tokens}")

    print("\n===== STEP 5: CALCULATE PROBABILITY (WITH LAPLACE) =====")

    probability = 1.0

    # -------- LAPLACE CHANGE 2 --------
    # P(w1) with Laplace smoothing
    first_word = sentence_tokens[0]
    first_word_count = unigram_counts.get(first_word, 0)

    p_first = (first_word_count + k) / ( list_len + (k*V))
    probability *= p_first

    print(
        f"P({first_word}) = ({first_word_count} + 1) / "
        f"({ list_len} ) = {p_first}"
    )
    # ---------------------------------

    # -------- LAPLACE CHANGE 3 --------
    # Conditional probabilities with Laplace smoothing
    for i in range(len(sentence_tokens) - 1):
        prev_word = sentence_tokens[i]
        curr_word = sentence_tokens[i + 1]
        bigram = (prev_word, curr_word)

        bigram_count = bigram_counts.get(bigram, 0)
        prev_word_count = unigram_counts.get(prev_word, 0)

        p_conditional = (bigram_count + k) / (prev_word_count + (k*V))
        probability *= p_conditional

        print(
            f"P({curr_word} | {prev_word}) = "
            f"({bigram_count} + 1) / ({prev_word_count} + {k*V}) = {p_conditional}"
        )
    # ---------------------------------

    print("\n===== FINAL RESULT =====")
    print(f"Final sentence probability (with Laplace smoothing): {probability}")

    return probability


In [47]:
sentence_probability_k(corpus, "patient has fever" , 0.1)

Corpus length (number of sentences): 6
===== STEP 1: TOKENIZE CORPUS =====
Sentence: 'patient has fever' -> Tokens: ['patient', 'has', 'fever']
Sentence: 'patient has cough' -> Tokens: ['patient', 'has', 'cough']
Sentence: 'patient has headache' -> Tokens: ['patient', 'has', 'headache']
Sentence: 'doctor treats patient' -> Tokens: ['doctor', 'treats', 'patient']
Sentence: 'doctor gives medicine' -> Tokens: ['doctor', 'gives', 'medicine']
Sentence: 'medicine reduces fever' -> Tokens: ['medicine', 'reduces', 'fever']

===== STEP 2: COUNT UNIGRAMS =====
Unigram 'patient': 4
Unigram 'has': 3
Unigram 'fever': 2
Unigram 'cough': 1
Unigram 'headache': 1
Unigram 'doctor': 2
Unigram 'treats': 1
Unigram 'gives': 1
Unigram 'medicine': 2
Unigram 'reduces': 1
Total unigrams in corpus: 18
Vocabulary size (V): 10

===== STEP 3: COUNT BIGRAMS =====
Bigram ('patient', 'has'): 3
Bigram ('has', 'fever'): 1
Bigram ('has', 'cough'): 1
Bigram ('has', 'headache'): 1
Bigram ('doctor', 'treats'): 1
Bigram ('tr

0.09986428571428571