In [207]:
import pandas as pd
import numpy as np
import re

In [None]:
# Colab-only: upload the data files from the local machine into the runtime.
# The recorded output shows three files being saved: dev, test and train.
from google.colab import files
uploads = files.upload()

Saving dev to dev (1)
Saving test to test (1)
Saving train to train (1)


In [208]:
# The three data files are tab-separated and have no header row, so columns are
# labelled 0, 1, 2 (test only has columns 0 and 1 in later cells' usage).
# Tokens must be kept verbatim:
#   keep_default_na=False - otherwise words such as "NA", "null" or "nan" are parsed as float NaN
#   quoting=3 (csv.QUOTE_NONE) - otherwise a '"' token is treated as the start of a quoted field
read_options = dict(sep='\t', header=None, engine='python', keep_default_na=False, quoting=3)

train_data = pd.read_csv("train", **read_options)
test_data = pd.read_csv("test", **read_options)
dev_data = pd.read_csv('dev', **read_options)

In [209]:
# Work on an independent copy of the word column (column 1). Depending on the
# pandas version, a frame built directly from a column can share memory with
# train_data, and later cells overwrite words in `vocab` in place.
vocab = train_data[[1]].copy()
print(vocab.head())

        1
0  Pierre
1  Vinken
2       ,
3      61
4   years


In [210]:
# Report how many rows (one per token) the training vocabulary frame holds.
# A single print with a newline separator emits the same text as three prints.
print("Number of rows in the given training data: ", len(vocab), "\n", sep="\n")

912095


In [211]:
# Add a constant column of ones (labelled 2); summing it per word in the next
# cell yields each word's frequency.
vocab[2] = 1

In [212]:
# Per-word frequency: group rows by the word (column 1), sum the ones in
# column 2, and use transform so the total is written back onto every row.

vocab['sum'] = vocab.groupby([1])[2].transform('sum')

# Row count is unchanged by transform; only the 'sum' column was added.
print(vocab.shape)

(912095, 3)


In [213]:
# Replace every word (column 1) whose frequency is below 3 with the placeholder 'unk'.
# Only `vocab` is rewritten here; this cell does not touch train_data.

vocab.loc[vocab['sum'] < 3, 1] = 'unk'

In [214]:
# Recompute the per-word totals: all rows now labelled 'unk' form one group,
# so each of them receives the combined count of the replaced rare words.

vocab['sum'] = vocab.groupby([1])[2].transform('sum')

In [215]:
# Keep one row per distinct word. Column 2 is the constant 1, so de-duplicating
# on [1, 2] is equivalent to de-duplicating on the word alone.

vocab = vocab.drop_duplicates([1, 2])

In [216]:
# Report the shape of the de-duplicated vocabulary (rows = distinct words incl. 'unk').
# A single print with a newline separator emits the same text as three prints.
print("Final vocab size:", vocab.shape, "\n", sep="\n")

(16920, 3)


In [217]:
# Split the vocabulary into the 'unk' row and everything else; the remaining
# words are ordered from most to least frequent.
is_unk = vocab[1] == 'unk'
unk_row = vocab[is_unk]
remaining_vocab = vocab[~is_unk].sort_values(by='sum', ascending=False)

In [218]:
# Write vocab.txt, one "word<TAB>index<TAB>count" line per entry.
# 'unk' always takes index 0; the other words follow in descending frequency
# order with indices starting at 1.

vocab_output_file = 'vocab.txt'
with open(vocab_output_file, 'w') as file:
  unk_word = unk_row[1].values[0]
  unk_count = unk_row['sum'].values[0]
  file.write("\t".join([str(unk_word), "0", str(unk_count)]) + "\n")
  for j, (_, entry) in enumerate(remaining_vocab.iterrows(), start=1):
    file.write("\t".join([str(entry[1]), str(j), str(entry['sum'])]) + '\n')

In [219]:
# vocab.txt was written inside the Colab runtime; files.download sends it to the local machine.
from google.colab import files
files.download(vocab_output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Model Learning


In [220]:
# Transition probabilities P(tag | previous tag), estimated by counting tag
# bigrams in the training data. '<S>' is the pseudo-tag before the first word
# of a sentence and '<E>' the pseudo-tag after its last word.
from collections import defaultdict

prev_index = 0      # not used by this cell; name kept in case another cell refers to it
prev_tag = '<S>'
end_tag = '<E>'
transition = defaultdict(int)   # (previous tag, tag) -> count, later -> probability
tag_count = defaultdict(int)    # tag -> occurrences (including one '<S>' per sentence)
belief_states = {}  # not used by this cell; name kept in case another cell refers to it

# Column 0 is the word's position in its sentence (1 = sentence start),
# column 2 is the gold tag. Iterating the columns directly avoids building a
# Series per row.
first_row = True
for current_index, current_tag in zip(train_data[0], train_data[2]):
  tag_count[current_tag] += 1
  if first_row:
    # The very first sentence starts here: count its '<S>'.
    tag_count[prev_tag] += 1
    first_row = False
  elif current_index == 1:
    # A new sentence starts: close the previous one, then reset to '<S>'.
    transition[(prev_tag, end_tag)] += 1
    prev_tag = '<S>'
    tag_count[prev_tag] += 1

  transition[(prev_tag, current_tag)] += 1
  prev_tag = current_tag

# Close the last sentence as well; without this its final tag never
# contributes a transition to '<E>'.
if not first_row:
  transition[(prev_tag, end_tag)] += 1

# Normalise counts into probabilities. Only existing keys are reassigned, so
# iterating the dict while updating it is safe.
for states in transition:
  transition[states] = transition[states]/tag_count[states[0]]


In [221]:
# Number of distinct (previous tag, tag) pairs observed, including pairs with '<S>' or '<E>'.
print(len(transition))

1416


In [222]:
# Emission probabilities P(word | tag), estimated by counting (tag, word)
# pairs in the training data and dividing by the tag's total count.
emission = defaultdict(int)   # (tag, word) -> count, later -> probability
tags = defaultdict(int)       # tag -> occurrences; its key order is the tag order used by the decoders

# Column 1 is the word, column 2 the gold tag. Iterating the columns directly
# avoids building a Series per row.
for word, tag in zip(train_data[1], train_data[2]):
  tags[tag] += 1
  emission[(tag, word)] += 1

# Normalise counts into probabilities. Only existing keys are reassigned, so
# iterating the dict while updating it is safe.
for em in emission:
  emission[em] = emission[em]/tags[em[0]]


In [223]:
# Number of distinct (tag, word) pairs observed in the training data.
print(len(emission))

50286


In [224]:
# Persist the learned model to hmm.json with two top-level sections,
# 'transition' and 'emission'.

import json

def _with_string_keys(table):
  """Return a dict with the same values whose tuple keys are replaced by str(key), since JSON object keys must be strings."""
  return {str(key): table[key] for key in table}

transition_dict = _with_string_keys(transition)
emission_dict = _with_string_keys(emission)

model_dict = dict(transition=transition_dict, emission=emission_dict)

with open('hmm.json', 'w') as f:
  f.write(json.dumps(model_dict, indent=4))

In [225]:
# Send hmm.json from the Colab runtime to the local machine.
files.download('hmm.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Greedy Decoding

In [226]:
def greedy(input_data):
  """Tag every row of `input_data` by greedy left-to-right HMM decoding.

  For each word the tag maximising transition(previous tag -> tag) *
  emission(tag -> word) is chosen, where the previous tag is the tag just
  predicted ('<S>' at the start of a sentence). Reads the module-level
  `tags`, `transition` and `emission` tables; unseen transitions score 0.01
  and unseen emissions 0.0000001.

  Args:
    input_data: DataFrame whose column 0 is the word's position inside its
      sentence (1 marks a sentence start) and whose column 1 is the word.

  Returns:
    None. The predictions are stored in a 'predict' column of `input_data`.
  """
  predictions = []
  # Start from '<S>' so that data not beginning at position 1 does not hit an
  # unbound previous tag.
  prev_tag = '<S>'
  for current_index, current_word in zip(input_data[0], input_data[1]):
    if current_index == 1:
      prev_tag = '<S>'
    max_prob = -1
    max_tag = None
    for tag in tags:
      prob = transition.get((prev_tag, tag), 0.01)*emission.get((tag, current_word), 0.0000001)
      # Strict comparison: on ties the tag that comes first in `tags` wins.
      if prob > max_prob:
        max_prob = prob
        max_tag = tag
    prev_tag = max_tag
    predictions.append(max_tag)
  # One column assignment instead of one .loc write per row.
  input_data['predict'] = predictions

def get_accuracy(input_data):
  """Return the fraction of rows whose 'predict' value equals the gold tag in column 2.

  Args:
    input_data: DataFrame with the gold tag in column 2 and a 'predict'
      column (as filled in by `greedy`).

  Returns:
    float: number of matching rows divided by the total number of rows.

  Raises:
    ZeroDivisionError: if `input_data` has no rows.
  """
  # Compare the two columns in one vectorised step instead of row by row.
  matches = input_data[2] == input_data['predict']
  positives = int(matches.sum())
  total = len(matches)
  return positives/total


In [227]:
# Greedy-decode the dev set; the predictions are stored in dev_data['predict'].
greedy(dev_data)

In [228]:
# Share of dev rows whose greedy prediction equals the gold tag in column 2.
get_accuracy(dev_data)

0.9335802319227734

In [229]:
# Write the greedy dev predictions as "position<TAB>word<TAB>tag" lines,
# with a blank line before every sentence except the first.
greedy_dev_output = 'greedy_dev.out'

with open(greedy_dev_output, 'w') as out:
  for label, record in dev_data.iterrows():
    starts_sentence = record[0] == 1
    if starts_sentence and label != 0:
      out.write('\n')
    fields = [str(record[0]), record[1], record['predict']]
    out.write("\t".join(fields) + '\n')

In [230]:
# Greedy-decode the training set itself; predictions go to train_data['predict'].
greedy(train_data)

In [231]:
# Share of training rows whose greedy prediction equals the gold tag in column 2.
get_accuracy(train_data)

0.9604185967470493

In [232]:
# Greedy-decode the test set; the predictions are stored in test_data['predict'].
greedy(test_data)

In [233]:
# Write the greedy test predictions to greedy.out as
# "position<TAB>word<TAB>tag" lines, with a blank line before every sentence
# except the first.

greedy_test_output = 'greedy.out'

with open(greedy_test_output, 'w') as out:
  for label, record in test_data.iterrows():
    starts_sentence = record[0] == 1
    if starts_sentence and label != 0:
      out.write('\n')
    fields = [str(record[0]), record[1], record['predict']]
    out.write("\t".join(fields) + '\n')

In [234]:
# Download both greedy output files from the Colab runtime:
# greedy.out (test predictions) and greedy_dev.out (dev predictions, for local checking).

from google.colab import files
files.download(greedy_test_output)
files.download(greedy_dev_output)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Viterbi Algorithm

In [235]:
def viterbi(input_data):
  """Tag every sentence of `input_data` with its highest-scoring tag sequence (Viterbi).

  Reads the module-level `tags`, `transition` and `emission` tables. Unseen
  transitions score 0.01; unseen emissions score 0.0001 for the first word of
  a sentence and 0.00001 for later words (values kept as they were).
  Scores are plain probability products, not log-probabilities.

  Args:
    input_data: DataFrame whose column 0 is the word's position inside its
      sentence (1 marks a sentence start) and whose column 1 is the word.

  Returns:
    list: predicted tags in row order, with the marker '<E>' inserted between
    consecutive sentences (no marker after the last sentence).
  """
  start_tag = '<S>'
  end_tag = '<E>'
  tag_list = list(tags)

  def best_complete_path(scores, paths):
    # Pick the path whose score, multiplied by the transition into '<E>', is
    # largest. Strict comparison: on ties the earliest tag in `tags` wins.
    best_index = 0
    best_prob = -1
    for idx, tag in enumerate(tag_list):
      prob = scores.get(tag, 0.0)*transition.get((tag, end_tag), 0.01)
      if prob > best_prob:
        best_index = idx
        best_prob = prob
    return paths[best_index] if paths else []

  predictions = []
  current_dp = {}                              # tag -> best score of a path ending in that tag
  current_sequence = [[] for _ in tag_list]    # best path ending in each tag, same order as tag_list
  first_row = True

  for current_index, current_word in zip(input_data[0], input_data[1]):
    if current_index == 1:
      if not first_row:
        # A new sentence starts: emit the finished one followed by the separator.
        predictions.extend(best_complete_path(current_dp, current_sequence))
        predictions.append("<E>")

      for j, tag in enumerate(tag_list):
        current_dp[tag] = transition.get((start_tag, tag), 0.01)*emission.get((tag, current_word), 0.0001)
        current_sequence[j] = [tag]
    else:
      prev_dp = dict(current_dp)
      prev_sequence = current_sequence
      next_sequence = []
      for c_tag in tag_list:
        best_index = 0
        best_prob = -1
        for k, p_tag in enumerate(tag_list):
          prob = prev_dp.get(p_tag, 0.0)*transition.get((p_tag, c_tag), 0.01)
          if prob > best_prob:
            best_index = k
            best_prob = prob
        current_dp[c_tag] = best_prob*emission.get((c_tag, current_word), 0.00001)
        # Build a new list so the paths of the previous step stay untouched.
        next_sequence.append(prev_sequence[best_index] + [c_tag])
      current_sequence = next_sequence
    first_row = False

  # The last sentence is closed like every other one, i.e. including the
  # transition into '<E>'.
  predictions.extend(best_complete_path(current_dp, current_sequence))

  return predictions

In [236]:
def get_viterbi_accuracy(input_data, predictions):
  """Print the accuracy of `predictions` against the gold tags in column 2 of `input_data`.

  `predictions` is walked in parallel with the rows; whenever the entry at
  the current position is the separator '<E>' it is skipped before comparing.

  Prints two lines: the final position reached in `predictions` (rows plus
  skipped separators), then matching rows divided by total rows.
  Returns None.
  """
  cursor = 0
  correct = 0
  row_count = 0
  for _, record in input_data.iterrows():
    row_count += 1
    # Step over the sentence separator so cursor points at a real tag.
    if predictions[cursor] == '<E>':
      cursor += 1
    if record[2] == predictions[cursor]:
      correct += 1
    cursor += 1

  print(cursor)

  accuracy = correct/row_count
  print(accuracy)

In [237]:
# Viterbi-decode the dev set. The result is a flat list of tags with '<E>'
# between sentences.

dev_predictions = viterbi(dev_data)

In [238]:
# Print the Viterbi accuracy on the dev set (first line printed is the final
# position reached in the prediction list, second line the accuracy).
get_viterbi_accuracy(dev_data, dev_predictions)

137294
0.9282982211159007


In [239]:
# Viterbi-decode the training set itself.

train_predictions = viterbi(train_data)

In [240]:
# Print the Viterbi accuracy on the training set.

get_viterbi_accuracy(train_data, train_predictions)

950312
0.9415817431298275


In [241]:
# Save the Viterbi dev predictions to viterbi_dev.out as
# "position<TAB>word<TAB>tag" lines. An '<E>' entry in dev_predictions marks
# a sentence boundary: it becomes a blank line and is then skipped.

viterbi_dev_output = 'viterbi_dev.out'

with open(viterbi_dev_output, 'w') as out:
  cursor = 0
  for _, record in dev_data.iterrows():
    tag = dev_predictions[cursor]
    if tag == '<E>':
      out.write('\n')
      cursor += 1
      tag = dev_predictions[cursor]
    out.write("\t".join([str(record[0]), record[1], tag]) + '\n')
    cursor += 1

In [242]:
# Download viterbi_dev.out from the Colab runtime to the local machine.

files.download(viterbi_dev_output)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [243]:
# Viterbi-decode the test set. The result is a flat list of tags with '<E>'
# between sentences.

test_predictions = viterbi(test_data)

In [244]:
# Save the Viterbi test predictions to viterbi.out as
# "position<TAB>word<TAB>tag" lines, with a blank line between sentences.

viterbi_test_output = 'viterbi.out'

with open(viterbi_test_output, 'w') as file:
  j = 0
  # Walk the test rows and the test predictions together (not the dev ones).
  for index, row in test_data.iterrows():
    # '<E>' marks a sentence boundary in the prediction list: emit a blank
    # line and step over the marker.
    if(test_predictions[j] == '<E>'):
      file.write('\n')
      j = j+1
    file.write(str(row[0]) + "\t" + row[1] + "\t" + test_predictions[j] + '\n')
    j = j+1

In [245]:
# Download viterbi.out (the test predictions) from the Colab runtime to the local machine.

files.download(viterbi_test_output)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>