    Components of a Hidden Markov Model:
    1. States - POS
    2. Observations - headlines
    
    Probablities required that relate the above two components:
    1. Initial Probability: An initial probability distribution over states
    2. Final Probability: A final probability distribution over states
    3. Transition Probability: a matrix A with the probabilities from going from one state to another
    4. Emission probability: a matrix B with the probabilities of an observation being generated from a state

In [178]:
# -- Importing Libraries --
import nltk
import pandas as pd
from collections import defaultdict
import pprint
from nltk.tokenize import TweetTokenizer
import string
import pprint
import numpy as np

# --Reading the dataset in --
ds = pd.read_csv('india-news-headlines.csv') 
ds = ds.drop(columns=['publish_date', 'headline_category'])


In [179]:
ds = ds.head(1000)

# -- Text Processing --
tknzr = TweetTokenizer()

ds["headline_text"] = ds["headline_text"].apply(lambda x: tknzr.tokenize(x))
ds["headline_text"] = ds["headline_text"].apply(lambda word: [word for word in word if word.isalpha()])


#Dropping rows that contain empty list
drop_list = []
for idx in range(0, len(ds)):
    if(ds['headline_text'].loc[idx] == []):
        drop_list.append(idx)

ds = ds.drop(drop_list)
ds = ds.reset_index(drop=True)

ds['headline_tags'] = ds['headline_text'].apply(nltk.pos_tag)
ds['headline_tags'] = ds['headline_tags'].apply(lambda x: [x[1] for x in x])



In [180]:
# -- Building Text and Tags Dictionaries --
tag_freq = defaultdict(int)
words_by_tag = defaultdict(list)
emi_freq = defaultdict(list)


for idx in range(0, len(ds)):
    text = ds.iloc[idx]['headline_text']
    tags = ds.iloc[idx]['headline_tags']

    for index, tag in enumerate(tags):
        tag_freq[tag] += 1
        words_by_tag[tag].append(text[index])

In [181]:
# -- All headline text (observations) in a list --
l = ds['headline_text'].tolist()
headline_text_list = []
for sublist in l:
    for item in sublist:
        headline_text_list.append(item)

In [182]:
# -- Building Probability Matrices --
tags_list = sorted(tag_freq.keys())
tags_tuple = tuple(tags_list)

trans_prob_matrix = (pd.DataFrame(columns=tags_list, index=tags_list)).fillna(0)
initial_prob_matrix = pd.DataFrame(columns=['Count'], index=tags_list).fillna(0)


In [183]:
# -- Building Initial Probability Matrix -- 

for idx in ds['headline_tags']:
    initial_prob_matrix.loc[idx[0]] =  initial_prob_matrix.loc[idx[0]]+1
initial_prob_matrix = initial_prob_matrix['Count'].divide(len(ds))


In [184]:
# -- Building Transition Probability Matrix -- 
import operator


for tags in ds['headline_tags']: 
    for tag_idx in range(0, len(tags)-1):
        trans_prob_matrix[tags[tag_idx+1]][tags[tag_idx]] = trans_prob_matrix[tags[tag_idx+1]][tags[tag_idx]]+1
for index, row in trans_prob_matrix.iterrows():
    trans_prob_matrix.loc[index] /= trans_prob_matrix.loc[index].sum()


In [185]:
# -- Supporting Data Structures --
# -- word_freq = {'tag' : ['word1', 'word2'...]}
# -- 

all_words_l = []

# Getting a list of all unique words in the headlines. This list will be used to get columns for the emission probability matrix
for key in words_by_tag: #Get the list of words for each tag
    l = words_by_tag[key]
    u_e = sorted(set(l)) #Get unique words from that list
    all_words_l.append(u_e)
    words_by_tag[key] = u_e
all_words_l = [item for sublist in all_words_l for item in sublist]


In [186]:
# -- Building Transition Probability Matrix -- 

emission_prob_matrix = pd.DataFrame(columns=all_words_l, index=tags_list).fillna(0)

for key in words_by_tag: #Get the list of words for each tag
    l_words = words_by_tag[key]
    for idx in range(0, len(l_words)):
        word = l_words[idx]
        emission_prob_matrix.loc[key][word] = emission_prob_matrix.loc[key][word]+1
for index, row in emission_prob_matrix.iterrows():
    emission_prob_matrix.loc[index] /= emission_prob_matrix.loc[index].sum()


In [187]:
N = tags_list
M = ds["headline_text"]
pi = initial_prob_matrix
A = trans_prob_matrix
B = emission_prob_matrix
O = ''

In [198]:
# -- T : number of observations in the sequence --
T = 10

# -- STEP 1: Choosing an initial state according to the initial state distribution - pi --
q1 = np.random.choice(N, 1, p=pi)
# -- STEP 2: set t=1
t = 1

# -- STEP 3: Choose observation according to the emission probability
p = np.array(list(B.loc[q1[0]]))
p /=  p.sum()
O_t = np.random.choice(B.columns.values, 1, p=p)
O = O_t[0]+' '

# -- STEP 4: Transit to new state according to the state transition probability

t=t+1
qt = q1
print(qt)
while t < T:
    print(qt[0])
    qt = np.random.choice(N, 1, p=A.loc[qt[0]])
    p = np.array(list(B.loc[qt[0]]))
    p /=  p.sum()
    O_t = np.random.choice(B.columns.values, 1, p=p)
    O = O + O_t + ' '
    t+=1
print(O)

['DT']
DT
NNP
NNP
NNP
TO
VB
IN
NNP
['this D CTBT Gaya To put if Drumming Angolan ']
