In [2]:
# Word-frequency table: each row is (word, positive-class count, negative-class count).
dataset = [
    # word        pos  neg
    ("I",         3,   3),
    ("am",        3,   3),
    ("happy",     2,   1),
    ("because",   1,   0),
    ("learning",  1,   1),
    ("nlp",       1,   1),
    ("sad",       1,   2),
    ("not",       1,   2),
]

In [3]:
# Class totals: sum the positive-count and negative-count columns of `dataset`.
n_pos = sum(row[1] for row in dataset)
n_neg = sum(row[2] for row in dataset)

n_pos, n_neg

(13, 13)

In [6]:
# Unsmoothed P(word_i | class): each class count divided by that class's total count.
# A word with a zero count in a class gets probability 0.0 here.

dataset_prob = [
    (word, pos_count / n_pos, neg_count / n_neg)
    for word, pos_count, neg_count in dataset
]

dataset_prob

[('I', 0.23076923076923078, 0.23076923076923078),
 ('am', 0.23076923076923078, 0.23076923076923078),
 ('happy', 0.15384615384615385, 0.07692307692307693),
 ('because', 0.07692307692307693, 0.0),
 ('learning', 0.07692307692307693, 0.07692307692307693),
 ('nlp', 0.07692307692307693, 0.07692307692307693),
 ('sad', 0.07692307692307693, 0.15384615384615385),
 ('not', 0.07692307692307693, 0.15384615384615385)]

In [9]:
# P(word_i | class) with Laplacian (add-one) smoothing:
#   (count(word, class) + 1) / (N_class + V)
# so that no word ends up with probability zero in either class.

v_class = len(dataset)  # V: number of rows (words) in the table
print(v_class)

pos_denominator = n_pos + v_class
neg_denominator = n_neg + v_class

dataset_prob_lap = [
    (word, (pos_count + 1) / pos_denominator, (neg_count + 1) / neg_denominator)
    for word, pos_count, neg_count in dataset
]

dataset_prob_lap

8


[('I', 0.19047619047619047, 0.19047619047619047),
 ('am', 0.19047619047619047, 0.19047619047619047),
 ('happy', 0.14285714285714285, 0.09523809523809523),
 ('because', 0.09523809523809523, 0.047619047619047616),
 ('learning', 0.09523809523809523, 0.09523809523809523),
 ('nlp', 0.09523809523809523, 0.09523809523809523),
 ('sad', 0.09523809523809523, 0.14285714285714285),
 ('not', 0.09523809523809523, 0.14285714285714285)]

In [16]:
# Likelihood ratios P(word | pos) / P(word | neg); their logarithms are the log-likelihood terms summed at inference

# Sentiment by ratio: > 1 positive | = 1 neutral | < 1 negative

def ratio(triple):
    """Return ``(word, P_pos / P_neg)`` for one ``(word, P_pos, P_neg)`` entry.

    The function only divides the two values it is given; it applies no
    smoothing itself. Pass Laplacian-smoothed probabilities: if the negative
    value is 0 the division raises ZeroDivisionError.
    """
    word = triple[0]
    likelihood_ratio = triple[1] / triple[2]
    return word, likelihood_ratio


# One (word, likelihood ratio) pair per word, computed from the smoothed probabilities.
ratios = [ratio(entry) for entry in dataset_prob_lap]
ratios


[('I', 1.0),
 ('am', 1.0),
 ('happy', 1.5),
 ('because', 2.0),
 ('learning', 1.0),
 ('nlp', 1.0),
 ('sad', 0.6666666666666666),
 ('not', 0.6666666666666666)]

In [None]:
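# Classification with Naive Bayes (pseudocode)

# prior = P_pos / P_neg
# product = 1
# for each word in the tweet:
#   product *= ratio[word]
# score = prior * product

# if score > 1 -> positive, otherwise negative

# Multiplying many small ratios can underflow numerically;
# summing their logarithms (log likelihood) avoids this.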



In [25]:
# Classify one tweet

tweet = "I am happy because I am learning"
words = tweet.split(" ")
# De-duplicate while keeping first-occurrence order. dict keys preserve insertion
# order, whereas iterating a set of strings can change order between interpreter
# runs (string hash randomization), which made the printed list non-reproducible.
unique_words = list(dict.fromkeys(words))

print(unique_words)

def get_word_in_tuple_dict(key, tuple_dict):
    """Look up `key` in a sequence of tuples whose first element is the key.

    Returns the remainder of the first matching tuple (everything after the
    key, as a tuple), or None when no entry matches.
    """
    return next((entry[1:] for entry in tuple_dict if key == entry[0]), None)

# get_word_in_tuple_dict('I', dataset_prob_lap)

import math

# Inference: the score is log(prior ratio) plus the sum of per-word log likelihood ratios.
prior = n_pos / n_neg
# Start from the log prior right away; it matters for imbalanced datasets.
inference_result = math.log(prior)
# NOTE(review): iterating `unique_words` counts a repeated word only once; a
# multinomial Naive Bayes over raw counts would sum over every occurrence in
# `words` instead — confirm which is intended.
for word in unique_words:
    # Named `word_ratio` so the module-level function `ratio` is not overwritten.
    word_ratio = get_word_in_tuple_dict(word, ratios)
    if word_ratio is None:
        # Out-of-vocabulary word: there is no ratio for it, so it adds no evidence.
        continue
    inference_result += math.log(word_ratio[0])

print(inference_result)
if inference_result > 0:  # NOTE: > 0, not > 1, because these are log values
    print("Positive")
else:
    print("Negative")

['happy', 'am', 'I', 'learning', 'because']
1.0986122886681096
Positive
