# Hidden Markov Model (HMM)

## Setup

In [1]:
# TODO: replace words with word buckets
import os
import sys
import json
import pandas as pd
import matplotlib.pyplot as plt
import re
from collections import OrderedDict, Counter
import itertools as it
from functools import reduce 
import time
import numpy as np

In [2]:
data=pd.read_json('../data/Sarcasm_Headlines_Dataset.json', lines=True)
data[['headline','is_sarcastic']].head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [3]:
# remove upper case, weird white space and punctuation
data['headline'] = data['headline'].apply(lambda x: x.lower())
data['headline'] = data['headline'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
data[['headline','is_sarcastic']].head()
data = data[['headline','is_sarcastic']]
x, y = (data['headline'].values, data['is_sarcastic'].values)

## Characterize the Data

In [4]:
# This is our entire dictionary of words
c=Counter(" ".join(x).split(" ")) 
len(c)

28406

In [5]:
# Characterization of the two classes, the first is sarcastic
csarcastic=Counter(" ".join(x[y==1]).split(" "))
print("Words missing from the sarcastic dictionary: ", len(x)-len(csarcastic),"\n")
print("Most common sarcastic words: ", csarcastic.most_common(20),"\n")
unique_serious = [word for word in c.most_common() if word[0] not in csarcastic]
print("Most common words unique to the serious dictionary: ", unique_serious[:20],"\n")
singleton_sarcastic = [word[0] for word in csarcastic.most_common() if word[1]==1]
print("Number of singletons in the sarcastic dictionary: ", len(singleton_sarcastic))

Words missing from the sarcastic dictionary:  7557 

Most common sarcastic words:  [('to', 4145), ('of', 3132), ('in', 1757), ('for', 1419), ('on', 1056), ('man', 1032), ('with', 855), ('new', 839), ('the', 651), ('by', 581), ('from', 561), ('at', 546), ('a', 525), ('area', 477), ('out', 468), ('up', 430), ('report', 426), ('about', 410), ('after', 383), ('it', 383)] 

Most common words unique to the serious dictionary:  [('huffpost', 58), ('queer', 54), ('trans', 49), ('kardashian', 47), ('jenner', 40), ('lgbt', 40), ('lgbtq', 32), ('roundup', 32), ('instagram', 30), ('trevor', 30), ('noah', 29), ('funniest', 28), ('schumer', 27), ('kimmel', 25), ('huffpollster', 24), ('hill', 24), ('chrissy', 22), ('veterans', 21), ('uk', 21), ('conservatives', 21)] 

Number of singletons in the sarcastic dictionary:  9895


In [6]:
cserious=Counter(" ".join(x[y==0]).split(" "))
print("Words missing from the serious dictionary: ",len(x)-len(cserious),"\n")
print("Most common serious words: ", cserious.most_common(20),"\n")
unique_sarcastic = [word for word in c.most_common() if word[0] not in cserious]
print("Most common words unique to the sarcastic dictionary: ", unique_sarcastic[:20],"\n")
singleton_serious = [word[0] for word in cserious.most_common() if word[1]==1]
print("Number of singletons in the sarcastic dictionary: ", len(singleton_serious))

Words missing from the serious dictionary:  7656 

Most common serious words:  [('the', 4741), ('to', 4074), ('a', 2475), ('of', 2472), ('in', 2429), ('for', 1886), ('and', 1659), ('is', 1508), ('on', 1336), ('trump', 1046), ('with', 946), ('you', 779), ('this', 704), ('new', 677), ('from', 664), ('how', 649), ('about', 646), ('at', 645), ('your', 576), ('are', 563)] 

Most common words unique to the sarcastic dictionary:  [('fucking', 67), ('shit', 61), ('clearly', 43), ('fuck', 42), ('unable', 33), ('realizes', 33), ('archives', 33), ('recommends', 28), ('per', 27), ('asshole', 26), ('currently', 25), ('relieved', 21), ('unsure', 20), ('recommend', 18), ('remaining', 17), ('capable', 16), ('panicked', 15), ('frantically', 15), ('shitty', 15), ('stares', 15)] 

Number of singletons in the sarcastic dictionary:  9592


In [7]:
# get a list of the top words
top=list(it.chain(*c.most_common(1000)))[::2] 
# How many headlines only have those words?
only=[all([(word in top) for word in data['headline'][i].split(" ")]) for i in range(len(data))]
sum(only)

382

In [8]:
small_data=data[only]
small_data.head()
# We shall split the input and output into two np arrays
x, y = (small_data['headline'].values, small_data['is_sarcastic'].values)

In [9]:
csmall=Counter(" ".join(x).split(" "))
smalltop=list(it.chain(*csmall.most_common(1000)))[::2] # get a list of the top words
numwords=len(smalltop)
numwords

695

# Toy Data

In [10]:
# We need to test and make sure that this method works on a toy example
x = ["my name is sam", "my name is not jerry", 'his name is john', 'his dogs name is jeff', 'his dogs name is not simon']
ctest=Counter(" ".join(x).split(" "))
smalltest=list(it.chain(*ctest.most_common()))[::2] # get a list of the top words
numwords=len(smalltest)
print(numwords)
print(smalltest)

11
['name', 'is', 'his', 'my', 'not', 'dogs', 'sam', 'jerry', 'john', 'jeff', 'simon']


In [11]:
# Produce the transition matrix
embedding={word:i for i, word in enumerate(smalltest)}
test_matrix=np.zeros((numwords+1,numwords+1))+0.001
print(test_matrix)

[[0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001]]


In [12]:
for i,headline in enumerate(x):
    which = test_matrix
    prev = numwords
    for word in headline.split(" "):
        which[prev, embedding[word]] += 1
        prev = embedding[word]
    which[prev, numwords] += 1

In [13]:
print(smalltest)
np.round(test_matrix)

['name', 'is', 'his', 'my', 'not', 'dogs', 'sam', 'jerry', 'john', 'jeff', 'simon']


array([[0., 5., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 2., 0., 1., 0., 1., 1., 0., 0.],
       [1., 0., 0., 0., 0., 2., 0., 0., 0., 0., 0., 0.],
       [2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0.],
       [2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 3., 2., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [14]:
# Make the two np arrays into probability rows
normalize = lambda x: x/np.sum(x,1,keepdims=True)
test_matrix = normalize(test_matrix)
np.round(test_matrix,1)

array([[0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0.4, 0. , 0.2, 0. , 0.2, 0.2, 0. , 0. ],
       [0.3, 0. , 0. , 0. , 0. , 0.7, 0. , 0. , 0. , 0. , 0. , 0. ],
       [1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0.5, 0. ],
       [1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. ],
       [0. , 0. , 0.6, 0.4, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])

If this works, each row should add up to 1.
Each row corresponds to a word, and each value in that row corresponds to the frequency of the word associated with that column to come after it. The last row represents the beginning of a statement, not a word. The last column represents the end of a statement, not a word either.

In [15]:
for i in range(10):
    end = numwords
    word = int(np.random.choice(np.arange(numwords+1),1,p=test_matrix[numwords,]))
    headline = ''
    while word != end:
        # append the word
        headline = headline + " " + smalltest[word] 
        # get word index, it's originally a 1-size list, not int
        word = int(np.random.choice(np.arange(numwords+1),1,p=test_matrix[word,]))
    print(headline)

 his dogs name is not simon
 my name is not jerry
 his name is sam
 his name is jeff
 my name is jeff
 his name is not simon
 his dogs name is not jerry
 his dogs name is not jerry
 my name is john
 his dogs name is john


# HMM Functions

In [59]:
# Get the word list for the sarcastic
print(len(csarcastic))
# get a list of the top words
top=list(it.chain(*csarcastic.most_common(1000)))[::2] 
# How many headlines only have those words?
x, y = (data['headline'].values, data['is_sarcastic'].values)
only = [all([(word in top) for word in x[i].split(" ")]) for i in range(len(data))]
print(sum(only)) # How many sentences only contain sarcastic words
print(sum(only&y)) # How many sentences are sarcastic and only contain sarcastic words

19152
278
136


In [60]:
sarcastic_data=data[np.logical_and(only,y)]
sarcastic_data.head()
# We shall split the input and output into two np arrays
xsarcastic, ysarcastic = (sarcastic_data['headline'].values, sarcastic_data['is_sarcastic'].values)
len(xsarcastic)

136

In [61]:
# Produce the transition matrix and embedding
embed = lambda counter: {word[0]:i for i, word in enumerate(counter)}
csmall_sarcastic = Counter(" ".join(xsarcastic).split(" "))
# This embedding is string:int
sarcastic_embedding = embed(csmall_sarcastic.most_common())
# This is just a list of the words
sarcastic_words = list(it.chain(*csmall_sarcastic.most_common()))[::2] 
print(len(sarcastic_embedding))
numwords = len(sarcastic_embedding)
sarcastic_matrix = np.zeros((numwords+1,numwords+1))+0.001

484


In [62]:
def train_HMM(headlines, matrix, embedding):
    """
    This function will in-place train an HMM transition matrix
    params:
        headlines: pandas dataframe of string sentences
        matrix: an empty numpy matrix that has a row and column for each word plus another one for the end and start tokens
        embedding: a dictionary from words to unique integer values
    output:
        None
    """
    numwords = len(embedding)
    for i,headline in enumerate(headlines):
        prev = numwords
        for word in headline.split(" "):
            matrix[prev, embedding[word]] += 1
            prev = embedding[word]
        matrix[prev, numwords] += 1

In [63]:
train_HMM(xsarcastic, sarcastic_matrix, sarcastic_embedding)
sarcastic_matrix[1:5,1:5]

array([[1.0000e-03, 1.0000e-03, 1.0000e-03, 2.0010e+00],
       [1.0000e-03, 1.0000e-03, 1.0000e-03, 1.0000e-03],
       [1.3001e+01, 1.0000e-03, 1.0000e-03, 1.0000e-03],
       [1.0000e-03, 1.0000e-03, 1.0000e-03, 1.0000e-03]])

In [64]:
# Make the np array into probability rows
normalize = lambda x: x/np.sum(x,1,keepdims=True)
sarcastic_matrix = normalize(sarcastic_matrix)

485

In [101]:
def generate_headline(matrix, corpus):
    """
    A function to generate headlines given a word transition matrix and a corpus
    params:
        matrix: numpy matrix of word transition proportions. The last row and column refer to the start and end of headline tokens
        corpus: a python list of all words in the corpus. The words should be in the same order as for the matrix
    output:
        headline: a string that is our generated headline
    """
    numwords = matrix.shape[1]-1
    end = numwords
    word = int(np.random.choice(np.arange(numwords+1),1,p=matrix[numwords,]))
    headline = ''
    while word != end:
        # append the word
        headline = headline + " " + corpus[word]
        # get word index, it's originally returned as a 1-size list, so we must convert to int
        word = int(np.random.choice(np.arange(numwords+1),1,p=matrix[word,])) 
    return(headline)

In [83]:
# Now let's generate the headlines
num_gen = 1000
sarcastic_headlines = [generate_headline(sarcastic_matrix, sarcastic_words) for _ in range(num_gen)]

In [89]:
# Now that we have headlines, let's see if our classifiers can deal recognize it
from Simple_Classifiers import run_algorithm, NB, NB_rank, accuracy

In [87]:
sarcastic_test = pd.DataFrame({'headline':sarcastic_headlines, 'is_sarcastic':np.ones(num_gen)})
sarcastic_test.head()

Unnamed: 0,headline,is_sarcastic
0,report well here we go right for help,1.0
1,pope francis admits god debate desk doesnt ha...,1.0
2,royal baby has pretty sure he goes to see a p...,1.0
3,breaking still nothing unveils president bush...,1.0
4,area man in his life,1.0


In [91]:
confusion_matrix, duration = run_algorithm(data, sarcastic_test, NB)
acc = accuracy(confusion_matrix)
acc

0.792

So it seems that around 80% of the time, the classifier can correctly classify these generated strings. Let's compare that to using the entire data itself.

In [92]:
confusion_matrix, duration = run_algorithm(data, data[data['is_sarcastic']==1], NB)
acc = accuracy(confusion_matrix)
acc

0.9607642442852269

With the entire data to train on, 96% of the sarcastic headlines are correctly classified. This does not compare favorable to the 80% accuracy of the generated data.

Now let's continue this thread and perform the same operation on the serious examples.

In [95]:
# Get the word list for the serious
print(len(cserious))
# get a list of the top words
top=list(it.chain(*cserious.most_common(1000)))[::2] 
# How many headlines only have those words?
x, y = (data['headline'].values, data['is_sarcastic'].values)
only = [all([(word in top) for word in x[i].split(" ")]) for i in range(len(data))]
print(sum(only)) # How many sentences only contain serious words
print(sum(only&(y==0))) # How many sentences are serious and only contain serious words

19053
401
367


In [96]:
serious_data=data[np.logical_and(only,y==0)]
serious_data.head()
# We shall split the input and output into two np arrays
xserious, yserious = (serious_data['headline'].values, serious_data['is_sarcastic'].values)
len(xserious)

367

In [97]:
# Produce the transition matrix and embedding
csmall_serious = Counter(" ".join(xserious).split(" "))
# This embedding is string:int
serious_embedding = embed(csmall_serious.most_common())
# This is just a list of the words
serious_words = list(it.chain(*csmall_serious.most_common()))[::2] 
print(len(serious_embedding))
numwords = len(serious_embedding)
serious_matrix = np.zeros((numwords+1,numwords+1))+0.001

654


In [98]:
train_HMM(xserious, serious_matrix, serious_embedding)
serious_matrix[1:5,1:5]

array([[1.000e-03, 3.001e+00, 1.000e-03, 1.000e-03],
       [1.000e-03, 1.000e-03, 1.000e-03, 1.000e-03],
       [1.000e-03, 1.000e-03, 1.000e-03, 1.000e-03],
       [1.000e-03, 1.001e+00, 1.000e-03, 1.000e-03]])

In [99]:
# Make the np array into probability rows
serious_matrix = normalize(serious_matrix)

In [102]:
# Now let's generate the serious headlines
serious_headlines = [generate_headline(serious_matrix, serious_words) for _ in range(num_gen)]

In [111]:
serious_test = pd.DataFrame({'headline':serious_headlines, 'is_sarcastic':np.zeros(num_gen)})
serious_test.head()

Unnamed: 0,headline,is_sarcastic
0,power party in washington,0.0
1,why chinese parents this incredible reason pe...,0.0
2,11 life change climate change climate talks,0.0
3,our wish you happy,0.0
4,chicago on health care bill refugees but this...,0.0


In [112]:
confusion_matrix, duration = run_algorithm(data, serious_test, NB)
acc = accuracy(confusion_matrix)
acc

0.999

In [108]:
confusion_matrix, duration = run_algorithm(data, data[data['is_sarcastic']==0], NB)
acc = accuracy(confusion_matrix)
acc

0.9648314981648315

It seems that the generator is even worse at producing serious headlines. The generator only makes 1 out of 1000 headlines to be actually taken as serious. This is compared to the overall data where validation on the entire serious data yields 96% accuracy.