# JLouback/nlp-viterbi

Switch branches/tags
Nothing to show
Fetching contributors…
Cannot retrieve contributors at this time
92 lines (81 sloc) 2.58 KB
 # viterbi.py -- greedy trigram-HMM tagger, run as a command-line script.
"""
Usage:
python viterbi.py ner.counts ngram.counts ner_dev.dat > [output_file]

Calculate emission e(x|y) and trigram probability q(y | y_i-2, y_i-1) from the
two counts files, read ner_dev.dat (one word per line, blank line between
sentences) and write one "word label log_probability" line per word to stdout.

Note: decoding is greedy and left-to-right. Each word's label is fixed as soon
as it is chosen and then serves as context for the following words; no
dynamic-programming table over alternative earlier labels is kept.
"""

__author__ = "Juliana Louback "

import sys
from collections import defaultdict
import math
import logging

# Padding label used for the two positions before the first word of a sentence.
START_SYMBOL = '*'
# Pseudo-word whose emission counts stand in for any word absent from count_xy.
RARE_WORD = '_RARE_'
# Transition probability assumed for a trigram that has no recorded count.
UNSEEN_TRIGRAM_PROBABILITY = 0.0000000001


def best_label(word, y_2, y_1, count_xy, count_y, bigram_counts,
               trigram_counts):
    """Return the highest-scoring ``(label, probability)`` for one word.

    The score of a label is ``q(label | y_2, y_1) * e(word | label)``.

    Args:
        word: the observed token.
        y_2: label chosen two positions back (START_SYMBOL at sentence start).
        y_1: label chosen one position back (START_SYMBOL at sentence start).
        count_xy: mapping word -> {label: count}. Must hold a RARE_WORD entry,
            which is used for any word that is not a key of the mapping.
        count_y: mapping label -> count.
        bigram_counts: mapping "y_2 y_1" -> count.
        trigram_counts: mapping "y_2 y_1 label" -> count.

    Raises:
        ValueError: if no candidate label has a positive score.
    """
    observed = word if word in count_xy else RARE_WORD
    label_counts = count_xy[observed]
    bigram = y_2 + ' ' + y_1

    # Both trackers start fresh on every call. The original script reset them
    # only for known words, so an unseen word was compared against (and
    # usually kept) the previous word's score and label.
    winner = None
    winner_probability = 0
    for label in label_counts:
        emission = float(label_counts[label]) / float(count_y[label])
        trigram = bigram + ' ' + label
        if trigram in trigram_counts:
            parameter = (float(trigram_counts[trigram])
                         / float(bigram_counts[bigram]))
        else:
            parameter = UNSEEN_TRIGRAM_PROBABILITY
        probability = parameter * emission
        # Strict '>' keeps the first label encountered when scores tie.
        if probability > winner_probability:
            winner = label
            winner_probability = probability

    if winner is None:
        raise ValueError(
            "no label with positive probability for word {!r}".format(word))
    return winner, winner_probability


def tag_sentences(lines, count_xy, count_y, bigram_counts, trigram_counts):
    """Yield the output text for every input line, in order.

    A blank (or whitespace-only) line ends the current sentence: a bare
    newline is yielded and both the label context and the running
    log-probability are reset. Any other line is treated as a single word and
    yields ``"word label log_probability\\n"``, where log_probability is the
    sum of the natural logs of the winning scores so far in the sentence.

    Args:
        lines: iterable of input lines (an open file works).
        count_xy, count_y, bigram_counts, trigram_counts: see best_label().
    """
    y_2 = y_1 = START_SYMBOL
    log_probability = 0
    for line in lines:
        word = line.strip()
        if word == '':
            yield '\n'
            y_2 = y_1 = START_SYMBOL
            log_probability = 0
            continue
        label, probability = best_label(
            word, y_2, y_1, count_xy, count_y, bigram_counts, trigram_counts)
        log_probability = log_probability + math.log(probability)
        yield "{} {} {}\n".format(word, label, log_probability)
        # Arrange next round of y_i-2, y_i-1
        y_2, y_1 = y_1, label


def main(argv=None):
    """Load both counts files, tag the dev file, write the result to stdout.

    Args:
        argv: argument vector laid out like sys.argv; defaults to sys.argv.

    Returns:
        2 when fewer than three file arguments were supplied, otherwise None.
    """
    # Imported here so that the tagging functions above stay importable
    # without the project's count-file parser modules.
    import emission_counts
    import transition_counts

    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        sys.stderr.write(
            "Usage: python viterbi.py ner.counts ngram.counts ner_dev.dat"
            " > [output_file]\n")
        return 2

    # Get Count(y), Count(x~>y), Count(bigram), Count(trigram)
    count_xy, count_y = emission_counts.count(argv[1])
    bigram_counts, trigram_counts = transition_counts.count(argv[2])

    # open() replaces the Python-2-only file() builtin; the with-block closes
    # the handle even if tagging raises.
    with open(argv[3], 'r') as dev_data:
        for chunk in tag_sentences(dev_data, count_xy, count_y,
                                   bigram_counts, trigram_counts):
            sys.stdout.write(chunk)


if __name__ == "__main__":
    sys.exit(main())