In [1]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
def conll_sentences(conll_file):
    """Read a 4-column CoNLL file and group its tokens into sentences.

    Every non-blank line must hold exactly four whitespace-separated fields:
    word, POS tag, chunk tag and entity tag.  Blank lines separate sentences
    and '-DOCSTART-' lines are skipped.  Entity tags are collapsed to the
    coarse labels PER, LOC, ORG, MISC and O (anything that is not one of the
    I-/B- variants of those four types becomes 'O').

    Args:
        conll_file: path of the CoNLL file to read.

    Returns:
        A 4-tuple (sent, pos, chunk, entity).  Each element is a list with
        one inner list per sentence; the four inner lists of a sentence are
        aligned token for token.

    Raises:
        IndexError: if a non-blank line does not split into four fields.
    """
    # I-/B- prefixed tags -> coarse label; unknown tags fall back to 'O'
    entity_map = {
        'I-ORG': 'ORG', 'B-ORG': 'ORG',
        'I-LOC': 'LOC', 'B-LOC': 'LOC',
        'I-MISC': 'MISC', 'B-MISC': 'MISC',
        'I-PER': 'PER', 'B-PER': 'PER',
    }

    sent = []
    pos = []
    chunk = []
    entity = []
    temp_sent = []
    temp_pos = []
    temp_chunk = []
    temp_entity = []

    with open(conll_file) as f:
        conll_raw_data = [x.strip() for x in f]
    # Sentinel blank line: guarantees the final sentence is flushed even when
    # the file does not end with an empty line.
    conll_raw_data.append('')

    for line in conll_raw_data:
        if line != '':
            split_line = line.split()
            if len(split_line) == 4:
                if split_line[0] != '-DOCSTART-':
                    temp_sent.append(split_line[0])
                    temp_pos.append(split_line[1])
                    temp_chunk.append(split_line[2])
                    # Rename entity values as PER, LOC, ORG, MISC, O
                    temp_entity.append(entity_map.get(split_line[3], 'O'))
            else:
                raise IndexError('Line split length does not equal 4.')
        else:
            if len(temp_sent) > 0:
                # The per-sentence buffers must stay aligned token for token.
                assert len(temp_sent) == len(temp_pos) == len(temp_chunk) == len(temp_entity)
                sent.append(temp_sent)
                pos.append(temp_pos)
                chunk.append(temp_chunk)
                entity.append(temp_entity)
                temp_sent = []
                temp_pos = []
                temp_chunk = []
                temp_entity = []

    return sent, pos, chunk, entity

### Get CoNLL Features

In [3]:
def get_conll_features(index, sentence, pos, chunk):
  """Build the feature dictionary for one token of a CoNLL sentence.

  Key naming:
    'w' represents word feature
    't' represents POS tag feature
    'c' represents chunk tag feature
    '-n' represents previous 'n' feature
    '+n' represents posterior 'n' feature

  Positions before the sentence start are padded with "<B>" and positions
  past the sentence end with "<E>".  Two-tag context features join the tags
  with a single space.

  Args:
    index: position of the token inside `sentence`.
    sentence: list of word strings for one sentence.
    pos: POS tags aligned with `sentence`.
    chunk: chunk tags aligned with `sentence`.

  Returns:
    dict mapping feature name to a str, int or bool value.
  """

  features = {}
  last_index = len(sentence) - 1
  word = sentence[index]
  word_lc = word.lower()

  # features from current word:

  features["w"] = word
  features["t"] = pos[index]
  features["length"] = len(word)
  # key spelling "upperase" is kept as-is so feature names stay stable
  features["upperase"] = any(x.isupper() for x in word)
  # word[:1] instead of word[0] so an empty token yields False, not IndexError
  features["firstletter"] = word[:1].isupper() and (len(word)>1)
  features["hasdigits"] = any(x.isdigit() for x in word)
  features["c"] = chunk[index]
  # substrings checked against the lower-cased word
  features["log_flag"] = ("field" in word_lc) or ("land" in word_lc) or ("burgh" in word_lc) or ("shire" in word_lc)
  features["hasdot"] = ("." in word and len(word) > 1)
  features["endsinns"] = (len(word)>1 and word_lc[-2:]=='ns')


  # features from previous 2 words
  if index == 0: #first word in sentence
    features["t-2 t-1"] = "<B> <B>"
    features["t-1"] = "<B>"
    features["w-2"] = "<B>"
    features["w-1"] = "<B>"
    features["c-2 c-1"] = "<B> <B>"
    features["c-1"] = "<B>"

  elif index == 1: #second word in sentence
    features["t-2 t-1"] = "<B> "+pos[0]
    features["t-1"] = pos[0]
    features["w-2"] = "<B>"
    features["w-1"] = sentence[0]
    features["c-2 c-1"] = "<B> "+chunk[0]
    features["c-1"] = chunk[0]
  else:
    features["t-2 t-1"] = pos[index-2] + ' ' + pos[index-1]
    features["t-1"] = pos[index-1]
    features["w-2"] = sentence[index-2]
    features["w-1"] = sentence[index-1]
    features["c-2 c-1"] = chunk[index-2] + ' ' + chunk[index-1]
    features["c-1"] = chunk[index-1]

  # features from posterior 2 words
  if index == last_index: #last word in sentence
    features["t+1 t+2"] = "<E> <E>"
    features["t+1"] = "<E>"
    features["w+2"] = "<E>"
    features["w+1"] = "<E>"

  elif index == last_index - 1: #second last word in sentence
    # space-separated like every other tag bigram (was missing the space)
    features["t+1 t+2"] = pos[last_index] + " <E>"
    features["t+1"] = pos[last_index]
    features["w+2"] = "<E>"
    features["w+1"] = sentence[last_index]
  else:
    features["t+1 t+2"] = pos[index+1] + ' ' + pos[index+2]
    features["t+1"] = pos[index+1]
    features["w+2"] = sentence[index+2]
    features["w+1"] = sentence[index+1]

  return features


In [4]:
# CoNLL training split stored on the mounted Drive.
train_file = r"/content/drive/MyDrive/Dataset/NER_Dataset/CoNLL2003/eng.train"

# Four parallel per-sentence lists: words, POS tags, chunk tags, coarse entity labels.
train_sent, train_pos, train_chunk, train_entity = conll_sentences(train_file)


In [5]:
# Flatten the training sentences into one list of (feature dict, label) pairs,
# one pair per token.
train_data = []
for sent, pos, chunk, entity in zip(train_sent, train_pos, train_chunk, train_entity):
  # the four sequences of a sentence must have the same number of tokens
  if not (len(sent) == len(pos) == len(chunk) == len(entity)):
    raise ValueError("ERROR: CoNLL train length miss match")

  for i, ent in enumerate(entity):
    labelled_data = (get_conll_features(i, sent, pos, chunk), ent)
    train_data.append(labelled_data)


### Train the Conditional Markov Model (NLTK `MaxentClassifier` with the MEGAM backend) on the CoNLL training set

In [6]:
!wget http://hal3.name/megam/megam_i686.opt.gz

!gunzip /content/megam_i686.opt.gz 

!chmod ugo+rx /content/megam_i686.opt

--2022-07-29 18:24:30--  http://hal3.name/megam/megam_i686.opt.gz
Resolving hal3.name (hal3.name)... 64.98.135.72
Connecting to hal3.name (hal3.name)|64.98.135.72|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://users.umiacs.umd.edu/~hal/megam/megam_i686.opt.gz [following]
--2022-07-29 18:24:30--  http://users.umiacs.umd.edu/~hal/megam/megam_i686.opt.gz
Resolving users.umiacs.umd.edu (users.umiacs.umd.edu)... 128.8.120.33
Connecting to users.umiacs.umd.edu (users.umiacs.umd.edu)|128.8.120.33|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 333329 (326K) [application/x-gzip]
Saving to: ‘megam_i686.opt.gz’


2022-07-29 18:24:30 (3.36 MB/s) - ‘megam_i686.opt.gz’ saved [333329/333329]



In [7]:
import nltk
import os
from nltk import MaxentClassifier

# The path is absolute and contains no "~", so expanduser returns it unchanged.
megam_path = os.path.expanduser("/content/megam_i686.opt")
# Tell NLTK where the MEGAM binary lives, then fit the classifier on the
# (feature dict, label) pairs in train_data using the MEGAM algorithm.
nltk.config_megam(megam_path)
maxentClf = MaxentClassifier.train(train_data, algorithm='MEGAM')

In [8]:
# Paths of the three CoNLL evaluation files on the mounted Drive.
testa_file = r"/content/drive/MyDrive/Dataset/NER_Dataset/CoNLL2003/eng.testa"
testb_file = r"/content/drive/MyDrive/Dataset/NER_Dataset/CoNLL2003/eng.testb"
testc_file = r"/content/drive/MyDrive/Dataset/NER_Dataset/CoNLL2003/eng.testc"

# Parse each file into parallel per-sentence lists (words, POS, chunk, entity).
testa_sent, testa_pos, testa_chunk, testa_entity = conll_sentences(testa_file)
testb_sent, testb_pos, testb_chunk, testb_entity = conll_sentences(testb_file)
testc_sent, testc_pos, testc_chunk, testc_entity = conll_sentences(testc_file)



# Combined evaluation set: testa, testb and testc sentences concatenated in that order.
test_sent = testa_sent + testb_sent + testc_sent
test_pos = testa_pos + testb_pos + testc_pos
test_chunk = testa_chunk + testb_chunk + testc_chunk
test_entity = testa_entity + testb_entity + testc_entity


In [9]:
def get_entity_true_pred(test_sent, test_pos, test_chunk, test_entity, classifier=None):
  """Classify every token of the given CoNLL sentences.

  Args:
    test_sent: list of sentences, each a list of words.
    test_pos: POS tags, aligned with `test_sent`.
    test_chunk: chunk tags, aligned with `test_sent`.
    test_entity: gold entity labels, aligned with `test_sent`.
    classifier: object with a `classify(feature_dict)` method.  When omitted,
      the notebook-level `maxentClf` is used, as before.

  Returns:
    (test_truth, test_pred): two flat lists with one entry per token, the
    gold labels and the predicted labels, in sentence order.

  Raises:
    ValueError: if the four sequences of a sentence differ in length.
  """
  # Explicit argument wins; otherwise fall back to the global model.
  clf = maxentClf if classifier is None else classifier

  test_truth = []
  test_pred = []

  for sent, pos, chunk, entity in zip(test_sent, test_pos, test_chunk, test_entity):
    if len(sent) != len(pos) or len(pos) != len(chunk) or len(chunk) != len(entity):
      raise ValueError("ERROR: CoNLL test length miss match")

    for i, ent in enumerate(entity):
      test_truth.append(ent)
      pred = clf.classify(get_conll_features(i, sent, pos, chunk))
      test_pred.append(pred)

  return test_truth, test_pred

In [10]:
# Flat per-token gold labels and predictions for the combined test set.
test_truth, test_pred = get_entity_true_pred(test_sent, test_pos, test_chunk, test_entity)


In [11]:
def accuracy (expected, predicted):
    """Print and return the fraction of positions where the labels agree.

    Args:
        expected: sequence of gold labels.
        predicted: sequence of predicted labels, same length as `expected`.

    Returns:
        correct / total as a float; 0.0 when both sequences are empty.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(expected) != len(predicted):
        raise ValueError('expected and predicted differ in length: %d vs %d'
                         % (len(expected), len(predicted)))

    total = len(expected)
    correct = sum(1 for gold, guess in zip(expected, predicted) if gold == guess)
    # guard the division so an empty evaluation set does not raise
    ratio = correct / total if total else 0.0
    print('accuracy = %d / %d = %lf' % (correct, total, ratio))
    return ratio

In [12]:
# Token-level accuracy over the combined test set.
accuracy(test_truth, test_pred)


accuracy = 91110 / 97832 = 0.931290


In [13]:
from sklearn.metrics import classification_report

# Per-label precision / recall / F1 over the combined test set.
print(classification_report(test_truth, test_pred))

              precision    recall  f1-score   support

         LOC       0.66      0.67      0.66      4019
        MISC       0.70      0.51      0.59      2188
           O       0.98      0.98      0.98     81111
         ORG       0.64      0.57      0.60      4590
         PER       0.74      0.81      0.77      5924

    accuracy                           0.93     97832
   macro avg       0.74      0.71      0.72     97832
weighted avg       0.93      0.93      0.93     97832



In [14]:
# Evaluate on the testa split only.
testa_truth, testa_pred = get_entity_true_pred(testa_sent, testa_pos, testa_chunk, testa_entity)

# accuracy() prints its report itself, so it is called directly rather than
# being wrapped in print().
accuracy(testa_truth, testa_pred)

print(classification_report(testa_truth, testa_pred))

accuracy = 48065 / 51362 = 0.935809
None
              precision    recall  f1-score   support

         LOC       0.66      0.67      0.67      2094
        MISC       0.73      0.49      0.59      1268
           O       0.98      0.99      0.98     42759
         ORG       0.63      0.56      0.59      2092
         PER       0.76      0.82      0.79      3149

    accuracy                           0.94     51362
   macro avg       0.75      0.71      0.72     51362
weighted avg       0.93      0.94      0.93     51362



In [15]:
# Evaluate on the testb split only.
testb_truth, testb_pred = get_entity_true_pred(testb_sent, testb_pos, testb_chunk, testb_entity)

# accuracy() prints its report itself, so it is called directly rather than
# being wrapped in print().
accuracy(testb_truth, testb_pred)

print(classification_report(testb_truth, testb_pred))

accuracy = 43013 / 46435 = 0.926306
None
              precision    recall  f1-score   support

         LOC       0.65      0.67      0.66      1925
        MISC       0.67      0.54      0.60       918
           O       0.98      0.98      0.98     38323
         ORG       0.64      0.58      0.61      2496
         PER       0.71      0.80      0.75      2773

    accuracy                           0.93     46435
   macro avg       0.73      0.71      0.72     46435
weighted avg       0.93      0.93      0.93     46435



In [16]:
# Evaluate on the testc split only.
testc_truth, testc_pred = get_entity_true_pred(testc_sent, testc_pos, testc_chunk, testc_entity)

# accuracy() prints its report itself, so it is called directly rather than
# being wrapped in print().
accuracy(testc_truth, testc_pred)

# zero_division=0 reports 0.0 for labels with no true or predicted samples
# without emitting an undefined-metric warning for each of them.
print(classification_report(testc_truth, testc_pred, zero_division=0))

accuracy = 32 / 35 = 0.914286
None
              precision    recall  f1-score   support

         LOC       0.00      0.00      0.00         0
        MISC       0.00      0.00      0.00         2
           O       1.00      1.00      1.00        29
         ORG       0.50      0.50      0.50         2
         PER       1.00      1.00      1.00         2

    accuracy                           0.91        35
   macro avg       0.50      0.50      0.50        35
weighted avg       0.91      0.91      0.91        35



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
index = 1

# Pick the index-th testa sentence together with its aligned tag sequences.
sent = testa_sent[index]
pos = testa_pos[index]
chunk = testa_chunk[index]
entity = testa_entity[index]

# Classify each token, echoing "gold --> predicted" and collecting the predictions.
y_hat = []
for i, w in enumerate(sent):
  pred = maxentClf.classify(get_conll_features(i, sent, pos, chunk))
  print("{} --> {}".format(entity[i], pred))
  y_hat.append(pred)

LOC --> LOC
O --> O


In [None]:

def entity_count (expected):
    """Print how often each of ORG, PER, LOC, MISC and O occurs in `expected`.

    Items equal to none of the five labels are ignored.  Nothing is returned.
    """
    labels = ('ORG', 'PER', 'LOC', 'MISC', 'O')
    tally = [0] * len(labels)

    for tag in expected:
        # bump the counter of the first (and only) label this tag equals
        for slot, label in enumerate(labels):
            if tag == label:
                tally[slot] += 1
                break

    for label, count in zip(labels, tally):
        print(label + ':', count)

In [None]:
# Label distribution of the gold labels currently bound to test_truth.
entity_count(test_truth)


ORG: 4590
PER: 5924
LOC: 4019
MISC: 2188
O: 81111


### NEEL 2006 Dataset

In [None]:
def get_neel_features(index, sentence):
  """Build the feature dictionary for one token of a NEEL sentence.

  Key naming:
    'w' is a word feature
    '-n' refers to the n-th previous word
    '+n' refers to the n-th following word

  Context positions that fall before the sentence start are filled with
  "<B>", positions past the sentence end with "<E>".
  """
  final = len(sentence) - 1
  token = sentence[index]
  lowered = token.lower()
  longer_than_one = len(token) > 1

  # features from the current word
  feats = {
    "w": token,
    "length": len(token),
    "upperase": any(ch.isupper() for ch in token),
    "firstletter": token[0].isupper() and longer_than_one,
    "hasdigits": any(ch.isdigit() for ch in token),
    "log_flag": any(part in lowered for part in ("field", "land", "burgh", "shire")),
    "hasdot": "." in token and longer_than_one,
    "endsinns": longer_than_one and lowered.endswith('ns'),
  }

  # features from the previous 2 words
  feats["w-2"] = "<B>" if (index == 0 or index == 1) else sentence[index-2]
  feats["w-1"] = "<B>" if index == 0 else sentence[index-1]

  # features from the posterior 2 words
  feats["w+2"] = "<E>" if (index == final or index == final - 1) else sentence[index+2]
  feats["w+1"] = "<E>" if index == final else sentence[index+1]

  return feats


In [None]:
from nltk.tokenize import word_tokenize
# Fetch NLTK's 'punkt' package (tokenizer data used by word_tokenize).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import pandas as pd

def neel_sentences(gs_file, tsv_file):
    """Build tokenised, labelled sentences from a NEEL gs/tsv file pair.

    The gs file is read as tab-separated rows of
    (tweet_id, start, end, uri, confidence, entity); the tsv file is read as
    comma-separated rows of (tweet_id, text) whose values are wrapped in '|'.
    Each annotated tweet is tokenised once with `word_tokenize`; all tokens
    start as 'O'.  For every annotation, the text slice [start:end] is
    tokenised and every token of the tweet that equals one of those mention
    tokens receives the annotation's label (matching is by token equality
    anywhere in the tweet, not by character offset).  Entity types are
    renamed to PER (Character, Person), LOC (Location), ORG (Organization)
    and MISC (everything else).

    Args:
        gs_file: path of the gold-standard annotation file.
        tsv_file: path of the tweet text file.

    Returns:
        (sent, entity, unknown_ids): `sent` is a list of token lists,
        `entity` the aligned list of label lists (one pair per annotated
        tweet, in first-annotation order), and `unknown_ids` the set of
        tweet ids that are annotated in the gs file but absent from the
        tsv file.
    """
    gs_col_names = ['tweet_id', 'start', 'end', 'uri', 'confidence', 'entity']
    tsv_col_names = ['tweet_id', 'text']
    # NEEL entity type -> coarse label; anything else becomes MISC
    entity_map = {
        'Character': 'PER',
        'Person': 'PER',
        'Location': 'LOC',
        'Organization': 'ORG',
    }

    gs_df = pd.read_table(gs_file, sep='\t', header=None, names=gs_col_names)
    # fixes entity label at index 4805 that is incorrect
    if len(gs_df) > 4805 and gs_df.at[4805, 'entity'] == 'Organization373937812812615000':
        gs_df.at[4805, 'entity'] = 'Organization'

    tsv_df = pd.read_table(tsv_file, sep=',', header=None, names=tsv_col_names)
    # strip '|' character from the edges of tsv_df column values
    tweet_ids = tsv_df['tweet_id'].apply(lambda x: str(x).strip('|'))
    tweet_texts = tsv_df['text'].apply(lambda x: str(x).strip('|'))
    tweets_dict = dict(zip(tweet_ids, tweet_texts))

    data_dict = {}       # tweet_id -> {'words': [...], 'labels': [...]}
    unknown_ids = set()  # annotated ids with no text in the tsv file

    annotations = zip(gs_df['tweet_id'], gs_df['start'], gs_df['end'], gs_df['entity'])
    for raw_id, start, end, old_ent in annotations:
        tweet_id = str(raw_id)
        # Only a missing tweet text marks an id as unknown; any other error
        # is allowed to surface instead of being swallowed.
        if tweet_id not in tweets_dict:
            unknown_ids.add(tweet_id)
            continue
        text = tweets_dict[tweet_id]

        # Rename entity values as PER, LOC, ORG, MISC
        new_ent = entity_map.get(old_ent, 'MISC')

        if tweet_id not in data_dict:
            # first annotation for this tweet: tokenise and start from all-'O'
            words = word_tokenize(text)
            data_dict[tweet_id] = {'words': words, 'labels': ['O'] * len(words)}
        record = data_dict[tweet_id]
        words, labels = record['words'], record['labels']
        assert len(words) == len(labels)

        # set lookup replaces the nested mention-token x tweet-token loop
        ent_tokens = set(word_tokenize(text[start:end]))
        for i, w in enumerate(words):
            if w in ent_tokens:
                labels[i] = new_ent

    sent = [record['words'] for record in data_dict.values()]
    entity = [record['labels'] for record in data_dict.values()]

    return sent, entity, unknown_ids

In [None]:
# NEEL training annotations (gs) and tweet texts (tsv) on the mounted Drive.
train_gs_file = "/content/drive/MyDrive/Dataset/NER_Dataset/NEEL2006/training_neel.gs"
train_tsv_file = "/content/drive/MyDrive/Dataset/NER_Dataset/NEEL2006/training.tsv"

# Token lists, aligned label lists, and the set of annotated ids without text.
n_train_sent, n_train_ent, n_train_err = neel_sentences(train_gs_file, train_tsv_file)

# NOTE: rebinds train_data, replacing the CoNLL training pairs built earlier.
train_data = []
for sent, entity in zip(n_train_sent, n_train_ent):
  # words and labels of a sentence must be aligned one-to-one
  if len(sent) != len(entity):
    raise ValueError("ERROR: NEEL train length miss match")
  for i, ent in enumerate(entity):
    # one (feature dict, label) pair per token
    labelled_data = (get_neel_features(i, sent), ent)
    train_data.append(labelled_data)

In [None]:
import nltk
import os
from nltk import MaxentClassifier

# The path is absolute and contains no "~", so expanduser returns it unchanged.
megam_path = os.path.expanduser("/content/megam_i686.opt")
nltk.config_megam(megam_path)
# NOTE: rebinds maxentClf; from here on it is the model trained on the NEEL
# pairs in train_data, not the CoNLL model.
maxentClf = MaxentClassifier.train(train_data, algorithm='MEGAM')

In [None]:
# NEEL test annotations (gs) and tweet texts (tsv) on the mounted Drive.
test_gs_file = "/content/drive/MyDrive/Dataset/NER_Dataset/NEEL2006/test_neel.gs"
test_tsv_file = "/content/drive/MyDrive/Dataset/NER_Dataset/NEEL2006/test.tsv"

# NOTE: test_sent, test_truth and test_pred are rebound here and no longer
# refer to the CoNLL evaluation data.
test_sent, test_ent, test_err = neel_sentences(test_gs_file, test_tsv_file)

test_truth = []
test_pred = []

for sent, entity in zip(test_sent, test_ent):
  # words and labels of a sentence must be aligned one-to-one
  if len(sent) != len(entity):
    raise ValueError("ERROR: Neel test length miss match")

  # collect the gold label and the predicted label for every token
  for i, ent in enumerate(entity):
    test_truth.append(ent)
    pred = maxentClf.classify(get_neel_features(i, sent))
    test_pred.append(pred)


In [None]:
# Token-level accuracy on the NEEL test tweets.
accuracy(test_truth, test_pred)

accuracy = 4678 / 5410 = 0.864695


In [None]:
# Per-label precision / recall / F1 on the NEEL test tweets.
print(classification_report(test_truth, test_pred))

              precision    recall  f1-score   support

         LOC       0.69      0.24      0.36        37
        MISC       0.50      0.50      0.50       484
           O       0.91      0.99      0.95      4363
         ORG       0.36      0.03      0.05       146
         PER       0.62      0.32      0.42       380

    accuracy                           0.86      5410
   macro avg       0.62      0.41      0.46      5410
weighted avg       0.84      0.86      0.84      5410



In [None]:
# Label distribution of the NEEL gold labels.
entity_count(test_truth)

ORG: 146
PER: 380
LOC: 37
MISC: 484
O: 4363
