In [None]:
import random
from conlleval import evaluate as conllevaluate
from tqdm import tqdm
import math 

directory = 'adagrad_results/'

def decode(input_length, tagset, score, debug=False):
    """
    Compute the highest scoring sequence according to the scoring function.
    """
    viterbi = [[0 for _ in range(input_length)] for _ in range(len(tagset))]
    backpointer = [[0 for _ in range(input_length)] for _ in range(len(tagset))]
    best_path = []

    for i, tag in enumerate(tagset):
        viterbi[i][1] = score(tag, "<START>", 1)
    
    if debug: print(viterbi)
        
    for t in range(2, input_length - 1):
        for s, tag in enumerate(tagset):
            max_val = 0
            max_index = 0
            for b, prev_tag in enumerate(tagset):
                curr_val = viterbi[b][t - 1] + score(tag, prev_tag, t)
                if curr_val > max_val:
                    max_val = curr_val
                    max_index = b
            viterbi[s][t] = max_val
            backpointer[s][t] = max_index

    for i, tag in enumerate(tagset):
        viterbi[i][input_length - 1] = viterbi[i][input_length - 2] + score("<STOP>", tag, input_length - 1)

    best_path_prob = 0
    index_to_best_path = 0
    for i in range(len(tagset)):
        if viterbi[i][-1] > best_path_prob:
            best_path_prob = viterbi[i][-1]
            index_to_best_path = i

    best_path = ["<STOP>"]
    for i in range(input_length - 1, 0, -1):
        if i == 1:
            best_path.insert(0, "<START>")
        else:
            index_to_best_path = backpointer[index_to_best_path][i]
            best_path.insert(0, tagset[index_to_best_path])
    return best_path

In [None]:
def compute_score(tag_seq, input_length, score):
    """
    Computes the total score of a tag sequence 
    """
    total_score = 0
    for i in range(1, input_length):
        total_score += score(tag_seq[i], tag_seq[i - 1], i)
    return total_score


def compute_features(tag_seq, input_length, features):
    """
    Compute f(xi, yi)
    """
    feats = FeatureVector({})
    for i in range(1, input_length):
        feats.times_plus_equal(1, features.compute_features(tag_seq[i], tag_seq[i - 1], i))
    return feats

In [3]:
import os 

os.path.exists('ner.train')

True

In [None]:


def adagrad(training_size, epochs, gradient, parameters, training_observer, alpha=1.0, epsilon=1e-8, debug=False):
    parameters_list, temp  = [], FeatureVector({})
    data_indices = [i for i in range(training_size)]
    random.shuffle(data_indices)
    for i in range(epochs):
        print(f'EPOCH {i}')
        for t in tqdm(data_indices):
            if debug: print(t)
            temp.power_plus(2, gradient(t))
            parameters.times_plus_equal(-1, gradient(t).sqrt_div(temp, 1e-8))
        print(training_observer(i, parameters))




def train(data, feature_names, tagset, epochs):
    """
    Trains the model on the data and returns the parameters
    """
    parameters = FeatureVector({})  

    def perceptron_gradient(i):
        inputs = data[i]
        input_len = len(inputs['tokens'])
        gold_labels = inputs['gold_tags']
        features = Features(inputs, feature_names)

        def score(cur_tag, pre_tag, i):
            return parameters.dot_product(features.compute_features(cur_tag, pre_tag, i))
        tags = decode(input_len, tagset, score)
        fvector = compute_features(tags, input_len, features)           
        fvector.times_plus_equal(-1, compute_features(gold_labels, input_len, features))  
        return fvector

    def training_observer(epoch, parameters):
        """
        Evaluates the parameters on the development data, and writes out the parameters to a 'model.iter'+epoch and
        the predictions to 'ner.dev.out'+epoch.
        """
        dev_data = read_data('ner.dev')
        (_, _, f1) = evaluate(dev_data, parameters, feature_names, tagset)
        write_predictions('ner.dev.out'+str(epoch), dev_data, parameters, feature_names, tagset)
        parameters.write_to_file(os.path.join(directory,'model.iter'+str(epoch)))
        return f1

    return adagrad(len(data), epochs, perceptron_gradient, parameters, training_observer)



def predict(inputs, input_len, parameters, feature_names, tagset):
    features = Features(inputs, feature_names)

    def score(cur_tag, pre_tag, i):
        return parameters.dot_product(features.compute_features(cur_tag, pre_tag, i))

    return decode(input_len, tagset, score)


def make_data_point(sent):
    dic = {}
    sent = [s.strip().split() for s in sent]
    dic['tokens'] = ['<START>'] + [s[0] for s in sent] + ['<STOP>']
    dic['pos'] = ['<START>'] + [s[1] for s in sent] + ['<STOP>']
    dic['NP_chunk'] = ['<START>'] + [s[2] for s in sent] + ['<STOP>']
    dic['gold_tags'] = ['<START>'] + [s[3] for s in sent] + ['<STOP>']
    return dic

def read_data(filename):
    data = []
    
    with open(filename, 'r') as f:
        sent = []
        for line in f.readlines():
            if line.strip():
                sent.append(line)
            else:
                data.append(make_data_point(sent))
                sent = []
        data.append(make_data_point(sent))

    return data


def write_predictions(out_filename, all_inputs, parameters, feature_names, tagset):
    with open(out_filename, 'w', encoding='utf-8') as f:
        for inputs in all_inputs:
            input_len = len(inputs['tokens'])
            tag_seq = predict(inputs, input_len, parameters, feature_names, tagset)
            for i, tag in enumerate(tag_seq[1:-1]): 
                f.write(' '.join([inputs['tokens'][i+1], inputs['pos'][i+1], inputs['NP_chunk'][i+1], inputs['gold_tags'][i+1], tag])+'\n') # i + 1 because of <START>
            f.write('\n')


def evaluate(data, parameters, feature_names, tagset):
    all_gold_tags = [ ]
    all_predicted_tags = [ ]
    for inputs in tqdm(data):
        all_gold_tags.extend(inputs['gold_tags'][1:-1])  
        input_len = len(inputs['tokens'])
        all_predicted_tags.extend(predict(inputs, input_len, parameters, feature_names, tagset)[1:-1])
    return conllevaluate(all_gold_tags, all_predicted_tags)

def test_decoder():
    
    tagset = ['NN', 'VB']    

    def score_wrap(cur_tag, pre_tag, i):
        retval = score(cur_tag, pre_tag, i)
        print('Score('+cur_tag+','+pre_tag+','+str(i)+') returning '+str(retval))
        return retval

    def score(cur_tag, pre_tag, i):
        if i == 0:
            print("ERROR: Don't call score for i = 0 (that points to <START>, with nothing before it)")
        if i == 1:
            if pre_tag != '<START>':
                print("ERROR: Previous tag should be <START> for i = 1. Previous tag = "+pre_tag)
            if cur_tag == 'NN':
                return 6
            if cur_tag == 'VB':
                return 4
        if i == 2:
            if cur_tag == 'NN' and pre_tag == 'NN':
                return 4
            if cur_tag == 'NN' and pre_tag == 'VB':
                return 9
            if cur_tag == 'VB' and pre_tag == 'NN':
                return 5
            if cur_tag == 'VB' and pre_tag == 'VB':
                return 0
        if i == 3:
            if cur_tag != '<STOP>':
                print('ERROR: Current tag at i = 3 should be <STOP>. Current tag = '+cur_tag)
            if pre_tag == 'NN':
                return 1
            if pre_tag == 'VB':
                return 1

    predicted_tag_seq = decode(4, tagset, score_wrap)
    print('Predicted tag sequence should be = <START> VB NN <STOP>')
    print('Predicted tag sequence = '+' '.join(predicted_tag_seq))
    print("Score of ['<START>','VB','NN','<STOP>'] = "+str(compute_score(['<START>','VB','NN','<STOP>'], 4, score)))
    print('Max score should be = 14')
    print('Max score = '+str(compute_score(predicted_tag_seq, 4, score)))



In [None]:
def main_predict(data_filename, model_filename, use_four_features=False):
    """
    Main function to make predictions.
    Loads the model file and runs the NER tagger on the data, writing the output in CoNLL 2003 evaluation format to data_filename.out
    :param data_filename: String
    :param model_filename: String
    :return: None
    """
    data = read_data(data_filename)
    parameters = FeatureVector({})
    parameters.read_from_file(model_filename)

    tagset = ['B-PER', 'B-LOC', 'B-ORG', 'B-MISC', 'I-PER', 'I-LOC', 'I-ORG', 'I-MISC', 'O']

    feature_names = ['tag', 'prev_tag', 'current_word', 'curr_pos_tag', 'shape_curr_word', 'len_k', 'in_gazetteer', 'start_cap']

    write_predictions(os.path.join(directory, data_filename+'.out'), data, parameters, feature_names, tagset)
    evaluate(data, parameters, feature_names, tagset)

    return


def main_train():
    """
    Main function to train the model
    :return: None
    """
    print('Reading training data')
    train_data = read_data('ner.train')[:1100]
    
    tagset = ['B-PER', 'B-LOC', 'B-ORG', 'B-MISC', 'I-PER', 'I-LOC', 'I-ORG', 'I-MISC', 'O']
    
    feature_names = ['tag', 'prev_tag', 'current_word', 'curr_pos_tag', 'shape_curr_word', 'len_k', 'in_gazetteer', 'start_cap']

    
    print('Training...')
    parameters = train(train_data, feature_names, tagset, epochs=10)
    print('Training done')
    dev_data = read_data('ner.dev')[:1100]
    evaluate(dev_data, parameters, feature_names, tagset)
    test_data = read_data('ner.test')[:1100]
    
    evaluate(test_data, parameters, feature_names, tagset)
    parameters.write_to_file('model')

    return 



In [None]:
class Features(object):
    def __init__(self, inputs, feature_names):
        self.feature_names = feature_names
        self.inputs = inputs
        self.gazette_dict = {}

        with open('gazetteer.txt', 'r') as file:
            for row in file:
                words = row.split(' ')
                value = words[0]
                for w in words[1:]:
                    if (w in self.gazette_dict.keys()):
                        self.gazette_dict[w].append(value)
                    else:
                        self.gazette_dict[w] = [value]

    def compute_features(self, cur_tag, pre_tag, i):
        
        feats = FeatureVector({})
        curr_word = self.inputs['tokens'][i]
        len_curr_word = len(self.inputs['tokens'][i])
        
        if 'tag' in self.feature_names:
            feats.times_plus_equal(1, FeatureVector({'t='+cur_tag: 1}))
        if 'prev_tag' in self.feature_names:
            feats.times_plus_equal(1, FeatureVector({'ti='+cur_tag+"+ti-1="+pre_tag: 1}))
        if 'current_word' in self.feature_names:
            feats.times_plus_equal(1, FeatureVector({'t='+cur_tag+'+w='+self.inputs['tokens'][i]: 1}))

        # adding more features
        if 'curr_pos_tag' in self.feature_names:
            feats.times_plus_equal(1, FeatureVector({'t='+cur_tag+'+pi='+self.inputs['pos'][i]: 1}))
        
        if 'shape_curr_word' in self.feature_names:
            word_shape = ''.join(['a' if c.isalpha() else 'A' if c.isupper() else 'd' for c in curr_word])
            feats.times_plus_equal(1, FeatureVector({'t='+cur_tag+'si'+word_shape: 1}))


        if 'len_k' in self.feature_names:
            for j in range(1, min(5, len(curr_word) + 1)): 
                feats.times_plus_equal(1, FeatureVector({'t='+cur_tag+'+PRE'+str(j)+'='+curr_word[:j]: 1}))


       
        if 'in_gazetteer' in self.feature_names:
            if (curr_word) in self.gazette_dict.keys():
                if self.gazette_dict[curr_word] == cur_tag:
                    feats.times_plus_equal(1, FeatureVector({'t='+cur_tag+'+GAZ='+'True': 1}))
            

        if 'start_cap' in self.feature_names:
            if(curr_word[0].isupper()):
                feats.times_plus_equal(1, FeatureVector({'t='+cur_tag+'+CAP='+'True': 1}))
        
        return feats



In [7]:
class FeatureVector(object):

    def __init__(self, fdict):
        self.fdict = fdict

    def times_plus_equal(self, scalar, v2):
        """
        self += scalar * v2
        :param scalar: Double
        :param v2: FeatureVector
        :return: None
        """
        for key, value in v2.fdict.items():
            self.fdict[key] = scalar * value + self.fdict.get(key, 0)
        


    def power_plus(self, scalar, v2):
        """
        self += scalar * v2,
        :param scalar: Double
        :param v2: FeatureVector
        :return None
        """
        for key, value in v2.fdict.items():
            self.fdict[key] = pow(value, scalar) + self.fdict.get(key,0)

 
    

    def sqrt_div(self, v2, epsilon):
        """
        self += scalar * v2
        :param scalar: Double
        :param v2: FeatureVector
        :return: None
        """
        smoothing = 0.001

        for key, value in self.fdict.items():
            self.fdict[key] = (epsilon / (math.sqrt(v2.fdict[key]) + smoothing)) * self.fdict.get(key, 0)

        return self

    

    
    
    def dot_product(self, v2):
        """
        Computes the dot product between self and v2.  It is more efficient for v2 to be the smaller vector (fewer
        non-zero entries).
        :param v2: FeatureVector
        :return: Int
        """
        retval = 0
        for key, value in v2.fdict.items():
            retval += value * self.fdict.get(key, 0)
        return retval

        

    def write_to_file(self, filename):
        """
        Writes the feature vector to a file.
        :param filename: String
        :return: None
        """
        print('Writing to ' + filename)
        with open(filename, 'w', encoding='utf-8') as f:
            for key, value in self.fdict.items():
                f.write('{} {}\n'.format(key, value))


    def read_from_file(self, filename):
        """
        Reads a feature vector from a file.
        :param filename: String
        :return: None
        """
        self.fdict = {}
        with open(filename, 'r') as f:
            for line in f.readlines():
                txt = line.split()
                self.fdict[txt[0]] = float(txt[1])

#main_train()    # Uncomment to train a model (need to implement 'sgd' function)
#main_predict('ner.dev', 'model')  # Uncomment to predict on 'dev.ner' using the model 'model' (need to implement 'decode' function)


## Training using Adagrad (trained on 1100 training examples)

In [9]:
test_decoder()

Score(NN,<START>,1) returning 6
Score(VB,<START>,1) returning 4
Score(NN,NN,2) returning 4
Score(NN,VB,2) returning 9
Score(VB,NN,2) returning 5
Score(VB,VB,2) returning 0
Score(<STOP>,NN,3) returning 1
Score(<STOP>,VB,3) returning 1
Predicted tag sequence should be = <START> VB NN <STOP>
Predicted tag sequence = <START> VB NN <STOP>
Score of ['<START>','VB','NN','<STOP>'] = 14
Max score should be = 14
Max score = 14


In [10]:
main_train()

Reading training data
Training...
EPOCH 0


100%|██████████| 1100/1100 [01:51<00:00,  9.87it/s]
100%|██████████| 3466/3466 [03:09<00:00, 18.30it/s]


processed 51578 tokens with 5917 phrases; found: 8492 phrases; correct: 2357.
accuracy:  42.40%; (non-O)
accuracy:  82.61%; precision:  27.76%; recall:  39.83%; FB1:  32.72
              LOC: precision:  55.70%; recall:  55.03%; FB1:  55.36  1808
             MISC: precision:  41.56%; recall:  44.20%; FB1:  42.84  972
              ORG: precision:  46.32%; recall:  28.19%; FB1:  35.05  816
              PER: precision:  11.60%; recall:  31.00%; FB1:  16.88  4896
Writing to adagrad_results/model.iter0
32.71566382122285
EPOCH 1


100%|██████████| 1100/1100 [01:51<00:00,  9.84it/s]
100%|██████████| 3466/3466 [03:07<00:00, 18.47it/s]


processed 51578 tokens with 5917 phrases; found: 7969 phrases; correct: 2359.
accuracy:  44.32%; (non-O)
accuracy:  83.04%; precision:  29.60%; recall:  39.87%; FB1:  33.98
              LOC: precision:  62.37%; recall:  51.53%; FB1:  56.43  1512
             MISC: precision:  59.32%; recall:  36.21%; FB1:  44.97  558
              ORG: precision:  53.55%; recall:  24.76%; FB1:  33.86  620
              PER: precision:  14.26%; recall:  41.10%; FB1:  21.18  5279
Writing to adagrad_results/model.iter1
33.97666714676652
EPOCH 2


100%|██████████| 1100/1100 [01:51<00:00,  9.83it/s]
100%|██████████| 3466/3466 [03:08<00:00, 18.36it/s]


processed 51578 tokens with 5917 phrases; found: 7877 phrases; correct: 2256.
accuracy:  42.36%; (non-O)
accuracy:  82.71%; precision:  28.64%; recall:  38.13%; FB1:  32.71
              LOC: precision:  60.27%; recall:  50.49%; FB1:  54.95  1533
             MISC: precision:  59.96%; recall:  35.23%; FB1:  44.38  537
              ORG: precision:  49.69%; recall:  23.94%; FB1:  32.31  646
              PER: precision:  13.35%; recall:  37.61%; FB1:  19.71  5161
Writing to adagrad_results/model.iter2
32.70987385819922
EPOCH 3


100%|██████████| 1100/1100 [01:51<00:00,  9.86it/s]
100%|██████████| 3466/3466 [03:07<00:00, 18.44it/s]


processed 51578 tokens with 5917 phrases; found: 7609 phrases; correct: 2166.
accuracy:  39.94%; (non-O)
accuracy:  82.35%; precision:  28.47%; recall:  36.61%; FB1:  32.03
              LOC: precision:  63.17%; recall:  48.36%; FB1:  54.78  1401
             MISC: precision:  62.59%; recall:  36.43%; FB1:  46.06  532
              ORG: precision:  52.46%; recall:  23.04%; FB1:  32.02  589
              PER: precision:  12.56%; recall:  34.88%; FB1:  18.47  5087
Writing to adagrad_results/model.iter3
32.02720686086057
EPOCH 4


100%|██████████| 1100/1100 [01:50<00:00,  9.91it/s]
100%|██████████| 3466/3466 [03:08<00:00, 18.41it/s]


processed 51578 tokens with 5917 phrases; found: 7658 phrases; correct: 2136.
accuracy:  38.57%; (non-O)
accuracy:  82.11%; precision:  27.89%; recall:  36.10%; FB1:  31.47
              LOC: precision:  59.04%; recall:  51.04%; FB1:  54.75  1582
             MISC: precision:  56.04%; recall:  33.48%; FB1:  41.92  546
              ORG: precision:  60.49%; recall:  20.43%; FB1:  30.55  453
              PER: precision:  12.25%; recall:  33.95%; FB1:  18.01  5077
Writing to adagrad_results/model.iter4
31.46961325966851
EPOCH 5


100%|██████████| 1100/1100 [01:50<00:00,  9.99it/s]
100%|██████████| 3466/3466 [03:02<00:00, 18.98it/s]


processed 51578 tokens with 5917 phrases; found: 7596 phrases; correct: 2123.
accuracy:  38.59%; (non-O)
accuracy:  82.15%; precision:  27.95%; recall:  35.88%; FB1:  31.42
              LOC: precision:  61.58%; recall:  50.71%; FB1:  55.62  1507
             MISC: precision:  60.63%; recall:  33.70%; FB1:  43.32  508
              ORG: precision:  58.04%; recall:  22.07%; FB1:  31.98  510
              PER: precision:  11.65%; recall:  32.26%; FB1:  17.12  5071
Writing to adagrad_results/model.iter5
31.42159402057278
EPOCH 6


100%|██████████| 1100/1100 [01:49<00:00, 10.01it/s]
100%|██████████| 3466/3466 [03:03<00:00, 18.86it/s]


processed 51578 tokens with 5917 phrases; found: 7762 phrases; correct: 2094.
accuracy:  37.42%; (non-O)
accuracy:  81.93%; precision:  26.98%; recall:  35.39%; FB1:  30.62
              LOC: precision:  53.90%; recall:  51.75%; FB1:  52.80  1757
             MISC: precision:  59.47%; recall:  34.68%; FB1:  43.81  533
              ORG: precision:  59.60%; recall:  20.13%; FB1:  30.10  453
              PER: precision:  11.16%; recall:  30.57%; FB1:  16.35  5019
Writing to adagrad_results/model.iter6
30.61627311938008
EPOCH 7


100%|██████████| 1100/1100 [01:51<00:00,  9.89it/s]
100%|██████████| 3466/3466 [03:04<00:00, 18.81it/s]


processed 51578 tokens with 5917 phrases; found: 7630 phrases; correct: 2057.
accuracy:  36.58%; (non-O)
accuracy:  81.82%; precision:  26.96%; recall:  34.76%; FB1:  30.37
              LOC: precision:  56.79%; recall:  51.86%; FB1:  54.21  1671
             MISC: precision:  52.11%; recall:  32.49%; FB1:  40.03  570
              ORG: precision:  56.92%; recall:  21.77%; FB1:  31.50  513
              PER: precision:  10.64%; recall:  28.33%; FB1:  15.47  4876
Writing to adagrad_results/model.iter7
30.368347235550303
EPOCH 8


100%|██████████| 1100/1100 [01:48<00:00, 10.10it/s]
100%|██████████| 3466/3466 [03:02<00:00, 18.96it/s]


processed 51578 tokens with 5917 phrases; found: 7443 phrases; correct: 1986.
accuracy:  35.12%; (non-O)
accuracy:  81.60%; precision:  26.68%; recall:  33.56%; FB1:  29.73
              LOC: precision:  58.70%; recall:  49.40%; FB1:  53.65  1540
             MISC: precision:  56.55%; recall:  34.03%; FB1:  42.49  550
              ORG: precision:  57.58%; recall:  19.84%; FB1:  29.51  462
              PER: precision:  10.33%; recall:  27.57%; FB1:  15.02  4891
Writing to adagrad_results/model.iter8
29.730538922155684
EPOCH 9


100%|██████████| 1100/1100 [01:48<00:00, 10.18it/s]
100%|██████████| 3466/3466 [03:04<00:00, 18.82it/s]


processed 51578 tokens with 5917 phrases; found: 7488 phrases; correct: 1979.
accuracy:  35.07%; (non-O)
accuracy:  81.58%; precision:  26.43%; recall:  33.45%; FB1:  29.53
              LOC: precision:  57.29%; recall:  49.62%; FB1:  53.18  1585
             MISC: precision:  58.19%; recall:  33.04%; FB1:  42.15  519
              ORG: precision:  53.74%; recall:  20.88%; FB1:  30.08  521
              PER: precision:  10.06%; recall:  26.69%; FB1:  14.61  4863
Writing to adagrad_results/model.iter9
29.526296158149947
Training done


  0%|          | 0/1100 [00:00<?, ?it/s]


AttributeError: 'NoneType' object has no attribute 'dot_product'

In [11]:
main_predict('ner.dev', 'adagrad_results/model.iter9')

100%|██████████| 3466/3466 [03:12<00:00, 18.03it/s]

processed 51578 tokens with 5917 phrases; found: 7488 phrases; correct: 1979.
accuracy:  35.07%; (non-O)
accuracy:  81.58%; precision:  26.43%; recall:  33.45%; FB1:  29.53
              LOC: precision:  57.29%; recall:  49.62%; FB1:  53.18  1585
             MISC: precision:  58.19%; recall:  33.04%; FB1:  42.15  519
              ORG: precision:  53.74%; recall:  20.88%; FB1:  30.08  521
              PER: precision:  10.06%; recall:  26.69%; FB1:  14.61  4863





In [12]:
main_predict('ner.test', 'adagrad_results/model.iter9')

100%|██████████| 3684/3684 [03:21<00:00, 18.29it/s]

processed 46666 tokens with 5616 phrases; found: 7763 phrases; correct: 1711.
accuracy:  32.94%; (non-O)
accuracy:  79.03%; precision:  22.04%; recall:  30.47%; FB1:  25.58
              LOC: precision:  54.16%; recall:  51.62%; FB1:  52.86  1588
             MISC: precision:  40.44%; recall:  23.82%; FB1:  29.98  413
              ORG: precision:  46.77%; recall:  16.27%; FB1:  24.14  573
              PER: precision:   8.02%; recall:  25.97%; FB1:  12.25  5189





In [None]:
!cat "results/model.iter6" | awk '{print $2, $1}' | sort -gr > "results/model.sorted.txt"

The file `model.sorted.txt` will be viewable in your Google Drive folder.