In [1]:
import random
from conlleval import evaluate as conllevaluate
from tqdm import tqdm

directory = 'results_features/'

def decode(input_length, tagset, score, debug=False):
    """
    Compute the highest scoring sequence according to the scoring function.
    """
    viterbi = [[0 for _ in range(input_length)] for _ in range(len(tagset))]
    backpointer = [[0 for _ in range(input_length)] for _ in range(len(tagset))]
    best_path = []

    for i, tag in enumerate(tagset):
        viterbi[i][1] = score(tag, "<START>", 1)
    
    if debug: print(viterbi)
        
    for t in range(2, input_length - 1):
        for s, tag in enumerate(tagset):
            max_val = 0
            max_index = 0
            for b, prev_tag in enumerate(tagset):
                curr_val = viterbi[b][t - 1] + score(tag, prev_tag, t)
                if curr_val > max_val:
                    max_val = curr_val
                    max_index = b
            viterbi[s][t] = max_val
            backpointer[s][t] = max_index

    for i, tag in enumerate(tagset):
        viterbi[i][input_length - 1] = viterbi[i][input_length - 2] + score("<STOP>", tag, input_length - 1)

    best_path_prob = 0
    index_to_best_path = 0
    for i in range(len(tagset)):
        if viterbi[i][-1] > best_path_prob:
            best_path_prob = viterbi[i][-1]
            index_to_best_path = i

    best_path = ["<STOP>"]
    for i in range(input_length - 1, 0, -1):
        if i == 1:
            best_path.insert(0, "<START>")
        else:
            index_to_best_path = backpointer[index_to_best_path][i]
            best_path.insert(0, tagset[index_to_best_path])
            if debug: print(best_path)
    return best_path

In [2]:
def compute_score(tag_seq, input_length, score):
    """
    Computes the total score of a tag sequence 
    """
    total_score = 0
    for i in range(1, input_length):
        total_score += score(tag_seq[i], tag_seq[i - 1], i)
    return total_score


def compute_features(tag_seq, input_length, features):
    """
    Compute f(xi, yi)
    """
    feats = FeatureVector({})
    for i in range(1, input_length):
        feats.times_plus_equal(1, features.compute_features(tag_seq[i], tag_seq[i - 1], i))
    return feats

In [3]:
import os 

os.path.exists('ner.train')

True

In [5]:
def sgd(training_size, epochs, gradient, parameters, training_observer):
    """
    Stochastic gradient descent
    """
    for i in range(epochs):
        print("epoch: ", i)
        indices = [i for i in range(training_size)]
        random.shuffle(indices)
        for t in tqdm(indices):
            parameters.times_plus_equal(-1, gradient(t))
        print("Running the training observer")
        training_observer(i, parameters)
    return parameters


def train(data, feature_names, tagset, epochs):
    """
    Trains the model on the data and returns the parameters
    """
    parameters = FeatureVector({})   

    def perceptron_gradient(i):
        inputs = data[i]
        input_len = len(inputs['tokens'])
        gold_labels = inputs['gold_tags']
        features = Features(inputs, feature_names)

        
        def score(cur_tag, pre_tag, i):
            return parameters.dot_product(features.compute_features(cur_tag, pre_tag, i))

        tags = decode(input_len, tagset, score)

        fvector = compute_features(tags, input_len, features)           
        fvector.times_plus_equal(-1, compute_features(gold_labels, input_len, features))   
        return fvector

    def training_observer(epoch, parameters):
        """
        Evaluates the parameters on the development data, and writes out the parameters to a 'model.iter'+epoch and
        the predictions to 'ner.dev.out'+epoch.
        """
        dev_data = read_data('ner.dev')[:100]
        (_, _, f1) = evaluate(dev_data, parameters, feature_names, tagset)
        write_predictions(os.path.join(directory,'ner.dev.out'+str(epoch)), dev_data, parameters, feature_names, tagset)
        parameters.write_to_file(os.path.join(directory, 'model.iter'+str(epoch)))
        return f1

    
    return sgd(len(data), epochs, perceptron_gradient, parameters, training_observer)


def predict(inputs, input_len, parameters, feature_names, tagset):
    features = Features(inputs, feature_names)

    def score(cur_tag, pre_tag, i):
        return parameters.dot_product(features.compute_features(cur_tag, pre_tag, i))

    return decode(input_len, tagset, score)


def make_data_point(sent):
    dic = {}
    sent = [s.strip().split() for s in sent]
    dic['tokens'] = ['<START>'] + [s[0] for s in sent] + ['<STOP>']
    dic['pos'] = ['<START>'] + [s[1] for s in sent] + ['<STOP>']
    dic['NP_chunk'] = ['<START>'] + [s[2] for s in sent] + ['<STOP>']
    dic['gold_tags'] = ['<START>'] + [s[3] for s in sent] + ['<STOP>']
    return dic

def read_data(filename):
    data = []
    
    with open(filename, 'r') as f:
        sent = []
        for line in f.readlines():
            if line.strip():
                sent.append(line)
            else:
                data.append(make_data_point(sent))
                sent = []
        data.append(make_data_point(sent))

    return data


def write_predictions(out_filename, all_inputs, parameters, feature_names, tagset):
    with open(out_filename, 'w', encoding='utf-8') as f:
        for inputs in all_inputs:
            input_len = len(inputs['tokens'])
            tag_seq = predict(inputs, input_len, parameters, feature_names, tagset)
            for i, tag in enumerate(tag_seq[1:-1]):  
                f.write(' '.join([inputs['tokens'][i+1], inputs['pos'][i+1], inputs['NP_chunk'][i+1], inputs['gold_tags'][i+1], tag])+'\n') # i + 1 because of <START>
            f.write('\n')


def evaluate(data, parameters, feature_names, tagset):
    all_gold_tags = [ ]
    all_predicted_tags = [ ]
    for inputs in tqdm(data):
        all_gold_tags.extend(inputs['gold_tags'][1:-1])  
        input_len = len(inputs['tokens'])
        all_predicted_tags.extend(predict(inputs, input_len, parameters, feature_names, tagset)[1:-1]) # deletes <START> and <STOP>
    return conllevaluate(all_gold_tags, all_predicted_tags)

def test_decoder():
    
    tagset = ['NN', 'VB']    

    def score_wrap(cur_tag, pre_tag, i):
        retval = score(cur_tag, pre_tag, i)
        print('Score('+cur_tag+','+pre_tag+','+str(i)+') returning '+str(retval))
        return retval

    def score(cur_tag, pre_tag, i):
        if i == 0:
            print("ERROR: Don't call score for i = 0 (that points to <START>, with nothing before it)")
        if i == 1:
            if pre_tag != '<START>':
                print("ERROR: Previous tag should be <START> for i = 1. Previous tag = "+pre_tag)
            if cur_tag == 'NN':
                return 6
            if cur_tag == 'VB':
                return 4
        if i == 2:
            if cur_tag == 'NN' and pre_tag == 'NN':
                return 4
            if cur_tag == 'NN' and pre_tag == 'VB':
                return 9
            if cur_tag == 'VB' and pre_tag == 'NN':
                return 5
            if cur_tag == 'VB' and pre_tag == 'VB':
                return 0
        if i == 3:
            if cur_tag != '<STOP>':
                print('ERROR: Current tag at i = 3 should be <STOP>. Current tag = '+cur_tag)
            if pre_tag == 'NN':
                return 1
            if pre_tag == 'VB':
                return 1

    predicted_tag_seq = decode(4, tagset, score_wrap)
    print('Predicted tag sequence should be = <START> VB NN <STOP>')
    print('Predicted tag sequence = '+' '.join(predicted_tag_seq))
    print("Score of ['<START>','VB','NN','<STOP>'] = "+str(compute_score(['<START>','VB','NN','<STOP>'], 4, score)))
    print('Max score should be = 14')
    print('Max score = '+str(compute_score(predicted_tag_seq, 4, score)))

In [6]:
def main_predict(data_filename, model_filename, use_four_features=False):
    data = read_data(data_filename)
    parameters = FeatureVector({})
    parameters.read_from_file(model_filename)

    tagset = ['B-PER', 'B-LOC', 'B-ORG', 'B-MISC', 'I-PER', 'I-LOC', 'I-ORG', 'I-MISC', 'O']

    feature_names = ['tag', 'prev_tag', 'current_word', 'curr_pos_tag', 'shape_curr_word', 'len_k', 'in_gazetteer', 'start_cap']

    write_predictions(data_filename+'.out', data, parameters, feature_names, tagset)
    evaluate(data, parameters, feature_names, tagset)

    return


def main_train():
    print('Reading training data')
    train_data = read_data('ner.train')[:1100]
    tagset = ['B-PER', 'B-LOC', 'B-ORG', 'B-MISC', 'I-PER', 'I-LOC', 'I-ORG', 'I-MISC', 'O']
    feature_names = ['tag', 'prev_tag', 'current_word', 'curr_pos_tag', 'shape_curr_word', 'len_k', 'in_gazetteer', 'start_cap']
    
    print('Training...')
    parameters = train(train_data, feature_names, tagset, epochs=10)
    print('Training done')
    dev_data = read_data('ner.dev')[:1100]
    evaluate(dev_data, parameters, feature_names, tagset)
    test_data = read_data('ner.test')[:1100]
    
    evaluate(test_data, parameters, feature_names, tagset)
    parameters.write_to_file('model')

    return 



In [7]:
class Features(object):
    def __init__(self, inputs, feature_names):
        self.feature_names = feature_names
        self.inputs = inputs
        self.gazette_dict = {}

        with open('gazetteer.txt', 'r') as file:
            for row in file:
                words = row.split(' ')
                value = words[0]
                for w in words[1:]:
                    if (w in self.gazette_dict.keys()):
                        self.gazette_dict[w].append(value)
                    else:
                        self.gazette_dict[w] = [value]

    def compute_features(self, cur_tag, pre_tag, i):

        feats = FeatureVector({})
        curr_word = self.inputs['tokens'][i]
        len_curr_word = len(self.inputs['tokens'][i])
        
        if 'tag' in self.feature_names:
            feats.times_plus_equal(1, FeatureVector({'t='+cur_tag: 1}))
        if 'prev_tag' in self.feature_names:
            feats.times_plus_equal(1, FeatureVector({'ti='+cur_tag+"+ti-1="+pre_tag: 1}))
        if 'current_word' in self.feature_names:
            feats.times_plus_equal(1, FeatureVector({'t='+cur_tag+'+w='+self.inputs['tokens'][i]: 1}))
        if 'curr_pos_tag' in self.feature_names:
            feats.times_plus_equal(1, FeatureVector({'t='+cur_tag+'+pi='+self.inputs['pos'][i]: 1}))
        
        if 'shape_curr_word' in self.feature_names:
            word_shape = ''.join(['a' if c.isalpha() else 'A' if c.isupper() else 'd' for c in curr_word])
            feats.times_plus_equal(1, FeatureVector({'t='+cur_tag+'si'+word_shape: 1}))


        if 'len_k' in self.feature_names:
            for j in range(1, min(5, len(curr_word) + 1)): 
                feats.times_plus_equal(1, FeatureVector({'t='+cur_tag+'+PRE'+str(j)+'='+curr_word[:j]: 1}))


       
        if 'in_gazetteer' in self.feature_names:
            if (curr_word) in self.gazette_dict.keys():
                if self.gazette_dict[curr_word] == cur_tag:
                    feats.times_plus_equal(1, FeatureVector({'t='+cur_tag+'+GAZ='+'True': 1}))
            

        if 'start_cap' in self.feature_names:
            if(curr_word[0].isupper()):
                feats.times_plus_equal(1, FeatureVector({'t='+cur_tag+'+CAP='+'True': 1}))
        
        return feats



In [12]:
class FeatureVector(object):

    def __init__(self, fdict):
        self.fdict = fdict

    def times_plus_equal(self, scalar, v2):
        
        try: 
            for key, value in v2.fdict.items():
                self.fdict[key] = scalar * value + self.fdict.get(key, 0)
        except:
            print(v2)

    def dot_product(self, v2):
        retval = 0
        for key, value in v2.fdict.items():
            retval += value * self.fdict.get(key, 0)
        return retval

    def write_to_file(self, filename):
        print('Writing to ' + filename)
        with open(filename, 'w', encoding='utf-8') as f:
            for key, value in self.fdict.items():
                f.write('{} {}\n'.format(key, value))


    def read_from_file(self, filename):
        self.fdict = {}
        with open(filename, 'r') as f:
            for line in f.readlines():
                txt = line.split()
                self.fdict[txt[0]] = float(txt[1])

main_train()   
main_predict('ner.dev', 'model')  # Uncomment to predict on 'dev.ner' using the model 'model' (need to implement 'decode' function)


Reading training data
Training...
epoch:  0


100%|██████████| 1100/1100 [00:52<00:00, 20.83it/s]


Running the training observer


100%|██████████| 100/100 [00:04<00:00, 20.24it/s]


processed 1314 tokens with 189 phrases; found: 313 phrases; correct: 67.
accuracy:  40.38%; (non-O)
accuracy:  77.32%; precision:  21.41%; recall:  35.45%; FB1:  26.69
              LOC: precision:  17.80%; recall:  39.62%; FB1:  24.56  118
             MISC: precision:  15.38%; recall:  18.18%; FB1:  16.67  13
              ORG: precision:  50.00%; recall:  22.86%; FB1:  31.37  32
              PER: precision:  18.67%; recall:  50.91%; FB1:  27.32  150
Writing to results_features/model.iter0
epoch:  1


100%|██████████| 1100/1100 [00:53<00:00, 20.65it/s]


Running the training observer


100%|██████████| 100/100 [00:04<00:00, 20.36it/s]


processed 1314 tokens with 189 phrases; found: 276 phrases; correct: 53.
accuracy:  31.92%; (non-O)
accuracy:  77.02%; precision:  19.20%; recall:  28.04%; FB1:  22.80
              LOC: precision:  20.22%; recall:  33.96%; FB1:  25.35  89
             MISC: precision:  33.33%; recall:  18.18%; FB1:  23.53  6
              ORG: precision:  37.84%; recall:  20.00%; FB1:  26.17  37
              PER: precision:  13.19%; recall:  34.55%; FB1:  19.10  144
Writing to results_features/model.iter1
epoch:  2


100%|██████████| 1100/1100 [00:53<00:00, 20.59it/s]


Running the training observer


100%|██████████| 100/100 [00:04<00:00, 20.30it/s]


processed 1314 tokens with 189 phrases; found: 277 phrases; correct: 42.
accuracy:  26.54%; (non-O)
accuracy:  75.65%; precision:  15.16%; recall:  22.22%; FB1:  18.03
              LOC: precision:  18.52%; recall:  37.74%; FB1:  24.84  108
             MISC: precision:  16.67%; recall:  27.27%; FB1:  20.69  18
              ORG: precision:  15.38%; recall:   2.86%; FB1:   4.82  13
              PER: precision:  12.32%; recall:  30.91%; FB1:  17.62  138
Writing to results_features/model.iter2
epoch:  3


100%|██████████| 1100/1100 [00:53<00:00, 20.44it/s]


Running the training observer


100%|██████████| 100/100 [00:05<00:00, 19.68it/s]


processed 1314 tokens with 189 phrases; found: 298 phrases; correct: 78.
accuracy:  48.08%; (non-O)
accuracy:  78.77%; precision:  26.17%; recall:  41.27%; FB1:  32.03
              LOC: precision:  29.03%; recall:  33.96%; FB1:  31.30  62
             MISC: precision:  28.57%; recall:  36.36%; FB1:  32.00  14
              ORG: precision:  36.21%; recall:  30.00%; FB1:  32.81  58
              PER: precision:  21.34%; recall:  63.64%; FB1:  31.96  164
Writing to results_features/model.iter3
epoch:  4


100%|██████████| 1100/1100 [00:53<00:00, 20.38it/s]


Running the training observer


100%|██████████| 100/100 [00:05<00:00, 18.93it/s]


processed 1314 tokens with 189 phrases; found: 271 phrases; correct: 77.
accuracy:  44.23%; (non-O)
accuracy:  79.83%; precision:  28.41%; recall:  40.74%; FB1:  33.48
              LOC: precision:  34.88%; recall:  28.30%; FB1:  31.25  43
             MISC: precision:  15.00%; recall:  27.27%; FB1:  19.35  20
              ORG: precision:  54.39%; recall:  44.29%; FB1:  48.82  57
              PER: precision:  18.54%; recall:  50.91%; FB1:  27.18  151
Writing to results_features/model.iter4
epoch:  5


100%|██████████| 1100/1100 [00:53<00:00, 20.54it/s]


Running the training observer


100%|██████████| 100/100 [00:04<00:00, 20.21it/s]


processed 1314 tokens with 189 phrases; found: 278 phrases; correct: 56.
accuracy:  36.15%; (non-O)
accuracy:  77.09%; precision:  20.14%; recall:  29.63%; FB1:  23.98
              LOC: precision:  25.71%; recall:  33.96%; FB1:  29.27  70
             MISC: precision:  13.33%; recall:  18.18%; FB1:  15.38  15
              ORG: precision:  21.05%; recall:  11.43%; FB1:  14.81  38
              PER: precision:  18.06%; recall:  50.91%; FB1:  26.67  155
Writing to results_features/model.iter5
epoch:  6


100%|██████████| 1100/1100 [00:53<00:00, 20.61it/s]


Running the training observer


100%|██████████| 100/100 [00:04<00:00, 20.33it/s]


processed 1314 tokens with 189 phrases; found: 285 phrases; correct: 63.
accuracy:  35.77%; (non-O)
accuracy:  76.64%; precision:  22.11%; recall:  33.33%; FB1:  26.58
              LOC: precision:  29.03%; recall:  33.96%; FB1:  31.30  62
             MISC: precision:  21.43%; recall:  27.27%; FB1:  24.00  14
              ORG: precision:  33.85%; recall:  31.43%; FB1:  32.59  65
              PER: precision:  13.89%; recall:  36.36%; FB1:  20.10  144
Writing to results_features/model.iter6
epoch:  7


100%|██████████| 1100/1100 [00:53<00:00, 20.66it/s]


Running the training observer


100%|██████████| 100/100 [00:04<00:00, 20.28it/s]


processed 1314 tokens with 189 phrases; found: 287 phrases; correct: 60.
accuracy:  39.23%; (non-O)
accuracy:  77.17%; precision:  20.91%; recall:  31.75%; FB1:  25.21
              LOC: precision:  19.15%; recall:  33.96%; FB1:  24.49  94
             MISC: precision:  40.00%; recall:  18.18%; FB1:  25.00  5
              ORG: precision:  38.10%; recall:  11.43%; FB1:  17.58  21
              PER: precision:  19.16%; recall:  58.18%; FB1:  28.83  167
Writing to results_features/model.iter7
epoch:  8


100%|██████████| 1100/1100 [00:53<00:00, 20.58it/s]


Running the training observer


100%|██████████| 100/100 [00:04<00:00, 20.44it/s]


processed 1314 tokens with 189 phrases; found: 272 phrases; correct: 55.
accuracy:  31.15%; (non-O)
accuracy:  76.03%; precision:  20.22%; recall:  29.10%; FB1:  23.86
              LOC: precision:  26.98%; recall:  32.08%; FB1:  29.31  63
             MISC: precision:  33.33%; recall:   9.09%; FB1:  14.29  3
              ORG: precision:  33.93%; recall:  27.14%; FB1:  30.16  56
              PER: precision:  12.00%; recall:  32.73%; FB1:  17.56  150
Writing to results_features/model.iter8
epoch:  9


100%|██████████| 1100/1100 [00:53<00:00, 20.60it/s]


Running the training observer


100%|██████████| 100/100 [00:04<00:00, 20.38it/s]


processed 1314 tokens with 189 phrases; found: 295 phrases; correct: 72.
accuracy:  41.15%; (non-O)
accuracy:  77.47%; precision:  24.41%; recall:  38.10%; FB1:  29.75
              LOC: precision:  31.67%; recall:  35.85%; FB1:  33.63  60
             MISC: precision:  22.22%; recall:  18.18%; FB1:  20.00  9
              ORG: precision:  36.00%; recall:  38.57%; FB1:  37.24  75
              PER: precision:  15.89%; recall:  43.64%; FB1:  23.30  151
Writing to results_features/model.iter9
Training done


100%|██████████| 1100/1100 [00:54<00:00, 20.09it/s]


processed 15257 tokens with 1808 phrases; found: 2903 phrases; correct: 722.
accuracy:  44.52%; (non-O)
accuracy:  81.39%; precision:  24.87%; recall:  39.93%; FB1:  30.65
              LOC: precision:  50.30%; recall:  57.02%; FB1:  53.45  662
             MISC: precision:  33.33%; recall:  11.06%; FB1:  16.60  66
              ORG: precision:  23.64%; recall:  32.41%; FB1:  27.34  495
              PER: precision:  14.88%; recall:  37.65%; FB1:  21.33  1680


100%|██████████| 1100/1100 [00:53<00:00, 20.43it/s]


processed 13063 tokens with 1888 phrases; found: 3058 phrases; correct: 799.
accuracy:  49.04%; (non-O)
accuracy:  77.64%; precision:  26.13%; recall:  42.32%; FB1:  32.31
              LOC: precision:  47.01%; recall:  56.57%; FB1:  51.35  568
             MISC: precision:  38.17%; recall:  24.39%; FB1:  29.76  131
              ORG: precision:  38.45%; recall:  34.62%; FB1:  36.43  489
              PER: precision:  15.72%; recall:  44.01%; FB1:  23.17  1870
Writing to model


100%|██████████| 3466/3466 [02:54<00:00, 19.85it/s]


processed 51578 tokens with 5917 phrases; found: 9216 phrases; correct: 2486.
accuracy:  46.20%; (non-O)
accuracy:  82.53%; precision:  26.97%; recall:  42.01%; FB1:  32.86
              LOC: precision:  52.03%; recall:  54.04%; FB1:  53.02  1901
             MISC: precision:  47.08%; recall:  25.60%; FB1:  33.17  497
              ORG: precision:  31.95%; recall:  35.72%; FB1:  33.73  1499
              PER: precision:  14.74%; recall:  42.79%; FB1:  21.93  5319


In [9]:
test_decoder()

Score(NN,<START>,1) returning 6
Score(VB,<START>,1) returning 4
Score(NN,NN,2) returning 4
Score(NN,VB,2) returning 9
Score(VB,NN,2) returning 5
Score(VB,VB,2) returning 0
Score(<STOP>,NN,3) returning 1
Score(<STOP>,VB,3) returning 1
Predicted tag sequence should be = <START> VB NN <STOP>
Predicted tag sequence = <START> VB NN <STOP>
Score of ['<START>','VB','NN','<STOP>'] = 14
Max score should be = 14
Max score = 14


## Training using SGD (trained on 1100 training examples)

In [None]:
main_predict('ner.dev', 'results_features/model.iter9')

100%|███████████████████████████████████████████████████████████████████████████████| 3466/3466 [04:18<00:00, 13.40it/s]

processed 51578 tokens with 5917 phrases; found: 7975 phrases; correct: 1949.
accuracy:  36.78%; (non-O)
accuracy:  81.79%; precision:  24.44%; recall:  32.94%; FB1:  28.06
              LOC: precision:  55.78%; recall:  46.17%; FB1:  50.52  1515
             MISC: precision:  30.63%; recall:  20.68%; FB1:  24.69  617
              ORG: precision:  37.97%; recall:  18.12%; FB1:  24.53  640
              PER: precision:  12.92%; recall:  36.68%; FB1:  19.10  5203





In [None]:
main_predict('ner.test', 'results_features/model.iter9')

100%|███████████████████████████████████████████████████████████████████████████████| 3684/3684 [04:46<00:00, 12.86it/s]

processed 46666 tokens with 5616 phrases; found: 8312 phrases; correct: 1757.
accuracy:  35.85%; (non-O)
accuracy:  79.24%; precision:  21.14%; recall:  31.29%; FB1:  25.23
              LOC: precision:  53.43%; recall:  46.34%; FB1:  49.63  1445
             MISC: precision:  19.22%; recall:  15.55%; FB1:  17.19  567
              ORG: precision:  35.65%; recall:  15.24%; FB1:  21.35  704
              PER: precision:  11.17%; recall:  39.01%; FB1:  17.37  5596





In [None]:
data = read_data('ner.test')[:4]
parameters = FeatureVector({})
parameters.read_from_file('results_features/model.iter9')

tagset = ['B-PER', 'B-LOC', 'B-ORG', 'B-MISC', 'I-PER', 'I-LOC', 'I-ORG', 'I-MISC', 'O']

feature_names = ['tag', 'prev_tag', 'current_word', 'curr_pos_tag', 'shape_curr_word', 'len_k', 'in_gazetteer', 'start_cap']

print(data[0])

{'tokens': ['<START>', 'SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.', '<STOP>'], 'pos': ['<START>', 'NN', ':', 'NNP', 'VB', 'NNP', 'NNP', ',', 'NNP', 'IN', 'DT', 'NN', '.', '<STOP>'], 'NP_chunk': ['<START>', 'I-NP', 'O', 'I-NP', 'I-VP', 'I-NP', 'I-NP', 'O', 'I-NP', 'I-PP', 'I-NP', 'I-NP', 'O', '<STOP>'], 'gold_tags': ['<START>', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'I-PER', 'O', 'O', 'O', 'O', '<STOP>']}


In [None]:
all_gold_tags = [ ]
all_predicted_tags = [ ]
for inputs in tqdm(data):
    all_gold_tags.append(inputs['gold_tags'][1:-1])  # deletes <START> and <STOP>
    input_len = len(inputs['tokens'])
    all_predicted_tags.append(predict(inputs, input_len, parameters, feature_names, tagset)[1:-1]) # deletes <START> and <STOP>

100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 10.08it/s]


In [None]:
display_id = 3

In [None]:
print(data[display_id]['tokens'])

['<START>', 'Japan', 'began', 'the', 'defence', 'of', 'their', 'Asian', 'Cup', 'title', 'with', 'a', 'lucky', '2-1', 'win', 'against', 'Syria', 'in', 'a', 'Group', 'C', 'championship', 'match', 'on', 'Friday', '.', '<STOP>']


In [None]:
print(all_gold_tags[display_id])

['I-LOC', 'O', 'O', 'O', 'O', 'O', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
print(all_predicted_tags[display_id])

['I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'I-MISC', 'B-PER']


In [1]:
!cat "results_features/model.iter9" | awk '{print $2, $1}' | sort -gr > "results_features/model.sorted.txt"

The file `model.sorted.txt` will be viewable in your Google Drive folder.