In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# !pip install patool

In [3]:
# import patoolib
# patoolib.extract_archive("/content/drive/MyDrive/Dataset/NER_Dataset.rar", 
#                          outdir="/content/drive/MyDrive/Dataset")

In [4]:
def conll_sentences(conll_file):
    sent = []
    pos = []
    chunk = []
    entity = []
    temp_sent = []
    temp_pos = []
    temp_chunk = []
    temp_entity = []
    
    with open(conll_file) as f:
        conll_raw_data = f.readlines()
    conll_raw_data = [x.strip() for x in conll_raw_data]

    for line in conll_raw_data:
        if line != '':
            split_line = line.split()
            if len(split_line) == 4:
                if split_line[0] != '-DOCSTART-':
                    temp_sent.append(split_line[0])
                    temp_pos.append(split_line[1])
                    temp_chunk.append(split_line[2])
                    
                    # Rename entity values as PER, LOC, ORG, MISC, O
                    old_ent = split_line[3]
                    if old_ent in ('I-ORG', 'B-ORG'):
                        new_ent = 'ORG'
                    elif old_ent in ('I-LOC', 'B-LOC'):
                        new_ent = 'LOC'
                    elif old_ent in ('I-MISC', 'B-MISC'):
                        new_ent = 'MISC'
                    elif old_ent in ('I-PER', 'B-PER'):
                        new_ent = 'PER'
                    else:
                        new_ent = 'O'
                    temp_entity.append(new_ent)
            else:
                raise IndexError('Line split length does not equal 4.')
        else:
            if len(temp_sent) > 0:
                assert(len(sent) == len(pos))
                assert(len(sent) == len(chunk))
                assert(len(sent) == len(entity))
                sent.append(temp_sent)
                pos.append(temp_pos)
                chunk.append(temp_chunk)
                entity.append(temp_entity)
                temp_sent = []
                temp_pos = []
                temp_chunk = []
                temp_entity = []
    
    return sent, pos, chunk, entity

In [5]:
def conll_words(conll_file):
    all_words = []
    all_pos = []
    all_chunk = []
    all_entities = []
    
    sent, pos, chunk, entity = conll_sentences(conll_file)

    for se in sent:
        for w in se:
            all_words.append(w)
    for po in pos:
        for p in po:
            all_pos.append(p)
    for ch in chunk:
        for c in ch:
            all_chunk.append(c)
    for en in entity:
        for e in en:
            all_entities.append(e)
            
    return all_words, all_pos, all_chunk, all_entities

### CoNLL dataset

In [6]:
train_file = r"/content/drive/MyDrive/Dataset/NER_Dataset/CoNLL2003/eng.train"
testa_file = r"/content/drive/MyDrive/Dataset/NER_Dataset/CoNLL2003/eng.testa"
testb_file = r"/content/drive/MyDrive/Dataset/NER_Dataset/CoNLL2003/eng.testb"
testc_file = r"/content/drive/MyDrive/Dataset/NER_Dataset/CoNLL2003/eng.testc"

train_words, _, _, train_entities = conll_words(train_file)
testa_words, _, _, testa_entities = conll_words(testa_file)
testb_words, _, _, testb_entities = conll_words(testb_file)
testc_words, _, _, testc_entities = conll_words(testc_file)

test_words = testa_words + testb_words + testc_words
test_entities = testa_entities + testb_entities + testc_entities

In [7]:
combined_words = train_words + testa_words + testb_words + testc_words
entity_set = set(train_entities + testa_entities + testb_entities + testc_entities)

word_set = set()

for line in combined_words:
    for word in line:
        word_set.add(word)

In [8]:
entity_set

{'LOC', 'MISC', 'O', 'ORG', 'PER'}

In [18]:
import nltk

iterations = 2

trainer = nltk.tag.hmm.HiddenMarkovModelTrainer(states=entity_set, symbols=word_set)

model = trainer.train_unsupervised(train_words, max_iterations=iterations)


iteration 0 logprob -5763881.357550944
iteration 1 logprob -4430261.852424948


In [27]:
test_result = model.tag(test_words)

In [28]:
test_result[0:10]

[('CRICKET', 'ORG'),
 ('-', 'ORG'),
 ('LEICESTERSHIRE', 'ORG'),
 ('TAKE', 'ORG'),
 ('OVER', 'ORG'),
 ('AT', 'ORG'),
 ('TOP', 'ORG'),
 ('AFTER', 'ORG'),
 ('INNINGS', 'ORG'),
 ('VICTORY', 'ORG')]

In [29]:
test_predicted = []

for word, entity in test_result:
  test_predicted.append(entity)

In [30]:
def accuracy (expected, predicted):
    total = 0
    correct = 0
    for i in range(len(expected)):
        total += 1
        if (expected[i] == predicted[i]):
            correct += 1
    print('accuracy = %d / %d = %lf' % (correct, total, correct/total))

In [31]:
accuracy(test_entities, test_predicted)


accuracy = 4590 / 97832 = 0.046917


In [32]:
from sklearn.metrics import classification_report

print(classification_report(test_entities, test_predicted))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         LOC       0.00      0.00      0.00      4019
        MISC       0.00      0.00      0.00      2188
           O       0.00      0.00      0.00     81111
         ORG       0.05      1.00      0.09      4590
         PER       0.00      0.00      0.00      5924

    accuracy                           0.05     97832
   macro avg       0.01      0.20      0.02     97832
weighted avg       0.00      0.05      0.00     97832



  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
def entity_count (expected):

    n_org = 0
    n_per = 0
    n_loc = 0
    n_misc = 0
    n_o = 0
    
    for e in expected:
        if e == 'ORG':
            n_org = n_org + 1
        elif e == 'PER':
            n_per = n_per + 1
        elif e == 'LOC':
            n_loc = n_loc + 1
        elif e == 'MISC':
            n_misc = n_misc + 1
        elif e == 'O':
            n_o = n_o + 1
    
    print('ORG:', n_org)
    print('PER:', n_per)
    print('LOC:', n_loc)
    print('MISC:', n_misc)
    print('O:', n_o)

In [34]:
entity_count(test_entities)


ORG: 4590
PER: 5924
LOC: 4019
MISC: 2188
O: 81111
