In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string

In [3]:
input_files = [
    'edgar_allan_poe.txt',
    'robert_frost.txt'
]

In [4]:
input_text = []
labels = []

for label, file in enumerate(input_files):
  for line in open(file):
    line = line.rstrip().lower()
    if line:
      line = line.translate(str.maketrans('', '', string.punctuation)) # Removes all Punctuations from the line
      input_text.append(line)
      labels.append(label)

In [5]:
input_text

['lo death hath reard himself a throne',
 'in a strange city all alone',
 'far down within the dim west',
 'where the good and the bad and the worst and the best',
 'have gone to their eternal rest',
 'there shrines and palaces and towers',
 'are not like any thing of ours',
 'oh no o no ours never loom',
 'to heaven with that ungodly gloom',
 'timeeaten towers that tremble not',
 'resemble nothing that is ours',
 'around by lifting winds forgot',
 'resignedly beneath the sky',
 'the melancholy waters lie',
 'no holy rays from heaven come down',
 'on the long nighttime of that town',
 'but light from out the lurid sea',
 'streams up the turrets silently',
 'up thrones up longforgotten bowers',
 'of sculturd ivy and stone flowers',
 'up domes up spires up kingly halls',
 'up fanes up babylonlike walls',
 'up many a melancholy shrine',
 'whose entablatures intertwine',
 'the mask the viol and the vine',
 'there open temples open graves',
 'are on a level with the waves',
 'but not the ri

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
Xtrain, Xtest, ytrain, ytest = train_test_split(input_text, labels)

In [8]:
word2idx = {'<unknown>' : 0}
idx = 1
for line in Xtrain:
  tokens = line.split()
  for token in tokens:
    if token not in word2idx:
      word2idx[token] = idx
      idx+=1

In [9]:
word2idx

{'<unknown>': 0,
 'the': 1,
 'hours': 2,
 'of': 3,
 'daylight': 4,
 'gather': 5,
 'atmosphere': 6,
 'for': 7,
 'gods': 8,
 'sake': 9,
 'arent': 10,
 'you': 11,
 'fond': 12,
 'viewing': 13,
 'nature': 14,
 'and': 15,
 'love': 16,
 'a': 17,
 'simple': 18,
 'duty': 19,
 'if': 20,
 'i': 21,
 'could': 22,
 'see': 23,
 'it': 24,
 'or': 25,
 'else': 26,
 'mow': 27,
 'room': 28,
 'all': 29,
 'way': 30,
 'home': 31,
 'kept': 32,
 'remembering': 33,
 'its': 34,
 'looking': 35,
 'another': 36,
 'door': 37,
 'to': 38,
 'try': 39,
 'provision': 40,
 'there': 41,
 'had': 42,
 'been': 43,
 'just': 44,
 'such': 45,
 'meeting': 46,
 'springs': 47,
 'that': 48,
 'neer': 49,
 'did': 50,
 'flow': 51,
 'threw': 52,
 'myself': 53,
 'wetelbowed': 54,
 'wetkneed': 55,
 'open': 56,
 'temples': 57,
 'graves': 58,
 'about': 59,
 'where': 60,
 'henry': 61,
 'hudsons': 62,
 'gone': 63,
 'lo': 64,
 'death': 65,
 'hath': 66,
 'reard': 67,
 'himself': 68,
 'throne': 69,
 'jupiter': 70,
 'he': 71,
 'might': 72,
 'not'

In [10]:
train_as_int = []
test_as_int = []

# for line in Xtrain:
#   tokens = line.split()
#   for token in tokens:
#     line_as_int = word2idx[token]
#     train_as_int.append(line_as_int)

# for line in Xtest:
#   tokens = line.split()
#   for token in tokens:
#     if token not in word2idx:
#       token = '<unknown>'
#     line_as_int = word2idx[token]
#     test_as_int.append(line_as_int)

for text in Xtrain:
  tokens = text.split()
  line_as_int = [word2idx[token] for token in tokens]
  train_as_int.append(line_as_int)

for text in Xtest:
  tokens = text.split()
  line_as_int = [word2idx.get(token, 0) for token in tokens]
  test_as_int.append(line_as_int)

In [11]:
train_as_int[100:105]

[[80, 274, 234, 387, 181, 134, 388, 389],
 [206, 390, 267, 17, 391, 150, 17, 392],
 [3, 393, 394, 15, 395, 16],
 [101, 102, 103, 104, 17, 218, 105],
 [1, 396, 1, 397, 15, 1, 398]]

In [13]:
test_as_int[100:105]

[[0, 104, 0, 249, 0, 94, 0],
 [21, 318, 32, 373, 104, 1, 0, 0],
 [48, 0, 112, 680, 935, 1, 0, 2063],
 [272, 157, 158, 159, 160, 38, 34, 161],
 [48, 165, 24, 550, 10, 1154, 3, 254]]

## Create a Markov Model

In [14]:
V = len(word2idx)

In [15]:
A0 = np.ones(shape = (V,V)) # Probability Distribution for Label '0': Edgar Allan Poe
Pi0 = np.ones(shape = (V))

A1 = np.ones(shape = (V,V)) # Probability Distribution for Label '1': Robert Frost
Pi1 = np.ones(shape = (V))

In [16]:
def counts(text_as_int, A, Pi):
  for tokens in text_as_int: # Tokens refer to each line as int
    print(tokens)

In [18]:
counts([t for t, y in zip(train_as_int, ytrain) if y==0], A0, Pi0) # t(train_as_int) is passed as parameter when it's label is 0
counts([t for t, y in zip(test_as_int, ytest) if y==1], A1, Pi1) # t(test_as_int) is passed as parameter when it's label is 1

[15, 16, 17, 18, 19]
[38, 47, 48, 49, 50, 51]
[41, 56, 57, 56, 58]
[64, 65, 66, 67, 68, 17, 69]
[71, 72, 73, 74, 75, 76, 77]
[90, 1, 91, 3, 92, 93]
[65, 122, 123, 109]
[124, 125, 126, 48, 127, 128, 129]
[38, 138, 17, 139, 104, 140, 141, 142]
[1, 143, 144, 15, 124, 125]
[150, 17, 151, 141, 152]
[157, 158, 159, 160, 38, 17, 161]
[162, 90, 29, 1, 163]
[186, 187, 188, 189, 190]
[90, 213, 82, 60, 214]
[1, 220, 221, 155, 222, 15, 223]
[1, 237, 238, 186, 239, 240, 241]
[157, 242, 73, 1, 243, 244, 3, 245]
[104, 262, 263, 264, 229, 71, 90, 24]
[15, 134, 269, 48, 11, 270, 24, 271]
[272, 148, 273, 274, 275, 267, 224]
[48, 21, 291, 21, 291, 109, 224]
[1, 292, 293, 3, 1, 294, 295, 239]
[296, 254, 297, 104, 298, 299, 300]
[307, 308, 309, 17, 152, 3, 310]
[60, 1, 317, 3, 146, 318, 187, 31]
[307, 319, 320, 3, 17, 321, 48, 149]
[307, 325, 157, 326, 29, 269, 276, 327]
[1, 328, 302, 15, 1, 329, 330]
[181, 298, 354, 355, 3, 356, 357]
[374, 375, 73, 376, 377, 186, 234, 378]
[380, 34, 246, 108, 381, 229, 17

In [20]:
# Counts the number of words for Pi and A
def counts(text_as_int, A, Pi):
  previous_word = None
  for tokens in text_as_int: # Tokens refer to each line as int
    for ongoing_word in tokens:
      if previous_word == None:
        Pi[ongoing_word] += 1
      else:
        A[previous_word][ongoing_word] +=1
      previous_word = ongoing_word

In [21]:
counts([t for t, y in zip(train_as_int, ytrain) if y==0], A0, Pi0)
counts([t for t, y in zip(test_as_int, ytest) if y==1], A1, Pi1)

In [28]:
Pi0  /= Pi0.sum()
Pi1 /= Pi1.sum()

A0 /= A0.sum(axis=1, keepdims = True)
A1 /= A1.sum(axis=1, keepdims = True)

In [31]:
loppi0 = np.log(Pi0)
logpi1 = np.log(Pi1)

logA0 = np.log(A0)
logA1 = np.log(A1)

In [32]:
counts0 = sum(y == 0 for y in ytrain)
counts1 = sum(y == 1 for y in ytrain)

prior0 = counts0 / len(ytrain)
prior1 = counts1 / len(ytrain)

logPrior0 = np.log(prior0)
logPrior1 = np.log(prior1)

In [33]:
print(prior0)
print(prior1)
print(logPrior0)
print(logPrior1)

0.3281733746130031
0.6718266253869969
-1.1142132291105893
-0.39775496968219703


In [47]:
class Classifier:
  def __init__(self, logA, logPi, logPrior):
    self.logA = logA
    self.logPi = logPi
    self.logPrior = logPrior
    self.K = len(logPrior)

  def compute_log_probabilities(self, input_, class_):
    logA = self.logA[class_]
    logPi = self.logPi[class_]

    previous_token = None
    log_probs = 0
    for token in input_:
      if previous_token == None:
        log_probs += logPi[token]
      else:
        log_probs += logA[previous_token][token]

      previous_token = token
    return log_probs

  def predict(self, input):
    predictions = np.zeros((len(input)))
    for i, input_ in enumerate(input):
      posteriors = [self.compute_log_probabilities(input_, class_) + self.logPrior[class_] for class_ in range(self.K)]
      predictions[i] = np.argmax(posteriors)
    return predictions

In [48]:
classifier = Classifier([logA0, logA1], [loppi0, logpi1], [logPrior0, logPrior1])

In [49]:
Probs_trainset = classifier.predict(train_as_int)
Probs_testset = classifier.predict(test_as_int)

In [50]:
print(f'Training Accuracy: {np.mean(Probs_trainset == ytrain)}')
print(f'Test Accuracy: {np.mean(Probs_testset == ytest)}')

Training Accuracy: 0.9219814241486068
Test Accuracy: 0.8070500927643784


In [51]:
from sklearn.metrics import confusion_matrix, f1_score

In [52]:
cm = confusion_matrix(ytrain, Probs_trainset)
cm

array([[527,   3],
       [123, 962]])

In [54]:
cm = confusion_matrix(ytest, Probs_testset)
cm

array([[ 84, 104],
       [  0, 351]])

In [55]:
f1 = f1_score( ytrain, Probs_trainset)
f1

0.9385365853658537

In [56]:
f1 = f1_score( ytest, Probs_testset)
f1

0.870967741935484