Madhur Bhalkar/ April 2025/ NLP/ Text Classifier using Markov Model concepts

We will leverage Bayes decision rule 

Create a Text Classifier to predict if lines of Poem belong to Edgar Allen Por or Robert Frost 

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split

In [4]:
input_files = [
  'edgar_allan_poe.txt',
  'robert_frost.txt',
]

In [5]:
!head edgar_allan_poe.txt

LO! Death hath rear'd himself a throne
In a strange city, all alone,
Far down within the dim west
Where the good, and the bad, and the worst, and the best,
Have gone to their eternal rest.
â
There shrines, and palaces, and towers
Are not like any thing of ours
Oh no! O no! ours never loom
To heaven with that ungodly gloom!


In [6]:
!head robert_frost.txt


Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth; 

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there


In [7]:
# collect data into lists
input_texts = []
labels = []

for label, f in enumerate(input_files):
  print(f"{f} corresponds to label {label}")

  for line in open(f):
    line = line.rstrip().lower()
    if line:
      # remove punctuation
      line = line.translate(str.maketrans('', '', string.punctuation))

      input_texts.append(line)
      labels.append(label)

edgar_allan_poe.txt corresponds to label 0
robert_frost.txt corresponds to label 1


In [None]:
#split into test and train 

In [8]:
train_text, test_text, Ytrain, Ytest = train_test_split(input_texts, labels)


In [9]:
len(Ytrain), len(Ytest)


(1618, 540)

In [10]:
train_text[:5]


['and hold the door but the bones didnt try',
 'of the bells bells bells',
 'one ought not to be thrown into confusion',
 'toffile agreed to that and sure enough',
 'henceforth i hold thy flowerenameled shore']

In [11]:
Ytrain[:5]

[1, 0, 1, 1, 0]

In [12]:
idx = 1
word2idx = {'<unk>': 0}

In [13]:
# populate word2idx
for text in train_text:
    tokens = text.split()
    for token in tokens:
      if token not in word2idx:
        word2idx[token] = idx
        idx += 1

In [14]:
word2idx


{'<unk>': 0,
 'and': 1,
 'hold': 2,
 'the': 3,
 'door': 4,
 'but': 5,
 'bones': 6,
 'didnt': 7,
 'try': 8,
 'of': 9,
 'bells': 10,
 'one': 11,
 'ought': 12,
 'not': 13,
 'to': 14,
 'be': 15,
 'thrown': 16,
 'into': 17,
 'confusion': 18,
 'toffile': 19,
 'agreed': 20,
 'that': 21,
 'sure': 22,
 'enough': 23,
 'henceforth': 24,
 'i': 25,
 'thy': 26,
 'flowerenameled': 27,
 'shore': 28,
 'hed': 29,
 'say': 30,
 'she': 31,
 'does': 32,
 'it': 33,
 'more': 34,
 'because': 35,
 'likes': 36,
 'ecstasies': 37,
 'above': 38,
 'detail': 39,
 'burned': 40,
 'dissolved': 41,
 'broken': 42,
 'off': 43,
 'come': 44,
 'up': 45,
 'in': 46,
 'despite': 47,
 'lion': 48,
 'a': 49,
 'piercing': 50,
 'little': 51,
 'star': 52,
 'was': 53,
 'through': 54,
 'truth': 55,
 'virtue': 56,
 'humanity': 57,
 'whole': 58,
 'edition': 59,
 'packing': 60,
 'case': 61,
 'realms': 62,
 'boreal': 63,
 'pole': 64,
 'he': 65,
 'must': 66,
 'have': 67,
 'changed': 68,
 'his': 69,
 'mind': 70,
 'gone': 71,
 'garlands': 72,


In [15]:
len(word2idx)


2506

In [16]:
# convert data into integer format
train_text_int = []
test_text_int = []

for text in train_text:
  tokens = text.split()
  line_as_int = [word2idx[token] for token in tokens]
  train_text_int.append(line_as_int)

for text in test_text:
  tokens = text.split()
  line_as_int = [word2idx.get(token, 0) for token in tokens]
  test_text_int.append(line_as_int)

In [17]:
train_text_int[100:105]


[[107, 377, 378, 264, 3, 374],
 [49, 379, 136, 380, 129, 69, 381, 53, 382],
 [253, 383, 384, 385, 386, 3, 387],
 [11, 388, 389, 390, 3, 391],
 [14, 30, 392, 393, 177, 394, 1, 392, 177, 395]]

In [18]:
# initialize A and pi matrices - for both classes
V = len(word2idx)

A0 = np.ones((V, V))
pi0 = np.ones(V)

A1 = np.ones((V, V))
pi1 = np.ones(V)

In [19]:
# compute counts for A and pi
def compute_counts(text_as_int, A, pi):
  for tokens in text_as_int:
    last_idx = None
    for idx in tokens:
      if last_idx is None:
        # it's the first word in a sentence
        pi[idx] += 1
      else:
        # the last word exists, so count a transition
        A[last_idx, idx] += 1

      # update last idx
      last_idx = idx


compute_counts([t for t, y in zip(train_text_int, Ytrain) if y == 0], A0, pi0)
compute_counts([t for t, y in zip(train_text_int, Ytrain) if y == 1], A1, pi1)

In [20]:
# normalize A and pi so they are valid probability matrices
# convince yourself that this is equivalent to the formulas shown before
A0 /= A0.sum(axis=1, keepdims=True)
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1, keepdims=True)
pi1 /= pi1.sum()

In [21]:
# log A and pi since we don't need the actual probs
logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)

In [22]:
# compute priors
count0 = sum(y == 0 for y in Ytrain)
count1 = sum(y == 1 for y in Ytrain)
total = len(Ytrain)
p0 = count0 / total
p1 = count1 / total
logp0 = np.log(p0)
logp1 = np.log(p1)
p0, p1


(0.323238566131026, 0.676761433868974)

In [23]:
# build a classifier
class Classifier:
  def __init__(self, logAs, logpis, logpriors):
    self.logAs = logAs
    self.logpis = logpis
    self.logpriors = logpriors
    self.K = len(logpriors) # number of classes

  def _compute_log_likelihood(self, input_, class_):
    logA = self.logAs[class_]
    logpi = self.logpis[class_]

    last_idx = None
    logprob = 0
    for idx in input_:
      if last_idx is None:
        # it's the first token
        logprob += logpi[idx]
      else:
        logprob += logA[last_idx, idx]
      
      # update last_idx
      last_idx = idx
    
    return logprob
  
  def predict(self, inputs):
    predictions = np.zeros(len(inputs))
    for i, input_ in enumerate(inputs):
      posteriors = [self._compute_log_likelihood(input_, c) + self.logpriors[c] \
             for c in range(self.K)]
      pred = np.argmax(posteriors)
      predictions[i] = pred
    return predictions

In [24]:
# each array must be in order since classes are assumed to index these lists
clf = Classifier([logA0, logA1], [logpi0, logpi1], [logp0, logp1])


#MAIN PREDICTION

In [25]:
Ptrain = clf.predict(train_text_int)
print(f"Training Model Accuracy is : {np.mean(Ptrain == Ytrain)}")

Training Model Accuracy is : 0.992583436341162


In [26]:
Ptest = clf.predict(test_text_int)
print(f"Testing Model Accuracy is : {np.mean(Ptest == Ytest)}")

Testing Model Accuracy is : 0.7962962962962963


#cheking for imbalance via confusion matrix & F score

In [30]:
from sklearn.metrics import confusion_matrix, f1_score



In [31]:
cm = confusion_matrix(Ytrain, Ptrain)
cm

array([[ 536,    7],
       [   0, 1075]])

In [32]:
cm_test = confusion_matrix(Ytest, Ptest)
cm_test

array([[106,  73],
       [ 18, 343]])

In [33]:
f1_score(Ytrain, Ptrain)


0.9967547519703291

In [34]:
f1_score(Ytest, Ptest)


0.8828828828828829