You need this part to run the scripts ** do not delete **

In [109]:
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics import confusion_matrix
import scipy
import sklearn
import pycrfsuite
from sklearn.grid_search import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics 
from features import *
from evaluate import *

from itertools import chain
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

# Task 1 Build a NER model (1.5 marks) <a id='Task1'></a>

* Building upon the tutorial [code](./Named_Entity_Extraction.ipynb), modify the code in the file  [features](./features.py) to achieve F1-score of more than 0.8 for categories B-PER, I-PER, B-LOC, I-LOC, B-ORG, I-ORG.
* The dataset you need to use for training is **esp.train ** and **esp.testb** for testing from the CoNLL 2002 sets.
* You can use the code provided in [tutorial sheet](./Named_Entity_Extraction.ipynb) 


In [116]:
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

In [127]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [119]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.06,
    c2=0.3,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)
print(bio_classification_report(y_test, y_pred))

             precision    recall  f1-score   support

      B-LOC       0.81      0.68      0.74      1084
      I-LOC       0.77      0.54      0.64       325
     B-MISC       0.65      0.33      0.44       339
     I-MISC       0.62      0.41      0.49       557
      B-ORG       0.79      0.74      0.77      1400
      I-ORG       0.77      0.64      0.70      1104
      B-PER       0.86      0.81      0.84       735
      I-PER       0.90      0.92      0.91       634

avg / total       0.79      0.68      0.72      6178



In [87]:
labels = ['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'I-ORG', 'I-LOC']
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


KeyboardInterrupt: 

In [84]:
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
trainer.set_params({
    'c1': 0.06,
    'c2': 0.3,
    'max_iterations': 100,
    'feature.possible_transitions': True
})
trainer.train('ner-esp2.model')
tagger = pycrfsuite.Tagger()
tagger.open('ner-esp2.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

             precision    recall  f1-score   support

      B-LOC       0.81      0.68      0.74      1084
      I-LOC       0.77      0.54      0.64       325
     B-MISC       0.65      0.33      0.44       339
     I-MISC       0.62      0.41      0.49       557
      B-ORG       0.79      0.74      0.77      1400
      I-ORG       0.77      0.64      0.70      1104
      B-PER       0.86      0.81      0.84       735
      I-PER       0.90      0.92      0.91       634

avg / total       0.79      0.68      0.72      6178



# Task 2.1 Understanding transitions between labels (0.5 mark)

* Write code to get  all transitions from B-PER in top 10 tranistions for your model

In [83]:
from collections import Counter
def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))
print("Top 10 transitions from B-PER:")
print_transitions(list((key,value) for (key, value) in Counter(crf.transition_features_).most_common(10) if key[0] == 'B-PER'))

Top 10 transitions from B-PER:
B-PER  -> I-PER   5.403514


# Task 2.2  Understanding the impact of positive features (0.5 mark)

* Write code to get the top 10 state features for B-PER

In [85]:
print_state_features(Counter(dict((key,value) for key, value in crf.state_features_.items() if key[1] == 'B-LOC')).most_common(10))

5.557285 B-LOC    word.lower():líbano
4.802440 B-LOC    word.lower():asunción
4.359403 B-LOC    word.lower():santander
4.287330 B-LOC    word.lower():bruselas
4.134757 B-LOC    word.lower():marruecos
4.124686 B-LOC    word.lower():londres
4.112152 B-LOC    word.lower():cáceres
4.103841 B-LOC    word.lower():asturias
4.061166 B-LOC    word.lower():murcia
3.921835 B-LOC    word.lower():balcanes


# Task 2.3  Understanding the impact of negative features (0.5 mark)

* Write code to get the bottom 10 state features for B-PER



In [77]:
print_state_features(Counter(dict((key,value) for key, value in crf.state_features_.items() if key[1] == 'B-PER')).most_common()[-10:])

-1.231117 B-PER    postag:DA
-1.231117 B-PER    postag[:2]:DA
-1.240402 B-PER    word[-3:]:dad
-1.305316 B-PER    word[-2:]:ón
-1.437030 B-PER    word[-3:]:ico
-1.454148 B-PER    -1:word.lower():en
-1.456027 B-PER    word[-2:]:os
-1.684242 B-PER    -1:word.lower():al
-1.707408 B-PER    word[-3:]:nes
-2.886880 B-PER    -1:word.lower():del


# Task 3 Using your NER model (2 marks)

* Write a code to read a file (each line has one sentence, for development purpose you can use this [file](./spansih.txt), and tag it with POS-tags (hint use nltk POS-tagger) so that you can run entity extraction on it.
* Use model developed in [Task1](#Task1) to extract entities from [file](./spansih.txt) 
* Print the extracted NER entities in [file](./spansih.txt) with the formating as shown below:


    Total Entities : 7  
    Person: Lucas Digne  
    Location: Barcelona, la Ramba  
    Organisation: RAC1, Real Madrid  
    Misc: Plaza Cataluña, Supercopa de España

Points to note 
* you need to combine B-PER with its sucessive I-PERs to get the entites composed by more than one token. Similarly for B-ORG, B-MISC and B-LOC.
* the list of entities within a cetegory should be alphabetically sorted.
* you do not need to extract all entities in the text, the excercise is to demonstrate how to use the NER model with new data.


In [148]:
lines = []
sents = []
per = []
loc = []
org = []
misc = []
with open('spanish.txt') as f:
    for line in f:
        text = word_tokenize(line)
        lines.append(text)
        pos = nltk.pos_tag(text)
        sents.append(pos)
    new_test = [sent2features(s) for s in sents]
    res = crf.predict(new_test)
    for i in range(len(lines)):
        for j in range(len(lines[i])):
            string = lines[i][j]
            temp = j+1
            if res[i][j] == 'B-LOC':
                while True:
                    if temp > len(lines[i]) - 1:
                        break
                    if res[i][temp] != 'I-LOC':
                        break
                    string += ' ' + lines[i][temp]
                    temp += 1
                loc.append(string)
            if res[i][j] == 'B-PER':
                while True:
                    if temp > len(lines[i]) - 1:
                        break
                    if res[i][temp] != 'I-PER':
                        break
                    string += ' ' + lines[i][temp]
                    temp += 1
                per.append(string)
            if res[i][j] == 'B-ORG':
                while True:
                    if temp > len(lines[i]) - 1:
                        break
                    if res[i][temp] != 'I-ORG':
                        break
                    string += ' ' + lines[i][temp]
                    temp += 1
                org.append(string)
            if res[i][j] == 'B-MISC':
                while True:
                    if temp > len(lines[i]) - 1:
                        break
                    if res[i][temp] != 'I-ORG':
                        break
                    string += ' ' + lines[i][temp]
                    temp += 1
                misc.append(string)
    print('Total Entities: ', len(per)+len(loc)+len(misc)+len(org))
    print('Person:', ', '.join(per))
    print('Location: ', ', '.join(loc))
    print('Organisation: ', ', '.join(org))
    print('Misc: ', ', '.join(misc))

Total Entities:  3
Person: 
Location:  Plaza Cataluña
Organisation:  Real Madrid
Misc:  Supercopa
