In [1]:
import nltk
import numpy as np
import pandas as pd

In [2]:
!pip3 install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting tabulate
  Downloading tabulate-0.8.7-py3-none-any.whl (24 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.7-cp38-cp38-macosx_10_13_x86_64.whl (186 kB)
[K     |████████████████████████████████| 186 kB 3.0 MB/s eta 0:00:01
[?25hInstalling collected packages: tabulate, python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6 tabulate-0.8.7


In [3]:
nltk.download('conll2000')
train_sents = list(nltk.corpus.conll2000.iob_sents('train.txt'))
test_sents = list(nltk.corpus.conll2000.iob_sents('test.txt'))

[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/manojbhadu/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!


In [16]:
def token2features(sent, i):
    token = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'intercept': 1,
        'suffix3': token[-3:],
        'suffix3': token[-2:],
        'prefix2' : token[:2],
        'prefix3' : token[:3],
        'wordlen' : len(token),
        'token.UPPER': token.isupper(),
        'token.TITLE': token.istitle(),
        'token.DIGIT': token.isdigit(),
        'postag': postag,
                
    }
    if i > 0:
        pre_token = sent[i-1][0]
        pre_postag = sent[i-1][1]
        features.update({
            'pre_token.TITLE': pre_token.istitle(),
            'pre_token.UPPER': pre_token.isupper(),
            'pre_postag': pre_postag,
        })
    else:
        features['beginning'] = True
        
    if i < len(sent)-1:
        post_token = sent[i+1][0]
        post_postag = sent[i+1][1]
        features.update({
            'post_token.TITLE': post_token.istitle(),
            'post_token.UPPER': post_token.isupper(),
            'post_postag': post_postag,
        })
        
    else:
        features['end'] = True         
    return features


def sent2features(sent):
    return [token2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label[0] for token, postag, label in sent]

In [17]:
X_train = [sent2features(s) for s in train_sents]
Y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
Y_test = [sent2labels(s) for s in test_sents]

In [18]:
X_train[0]

[{'intercept': 1,
  'suffix3': 'ce',
  'prefix2': 'Co',
  'prefix3': 'Con',
  'wordlen': 10,
  'token.UPPER': False,
  'token.TITLE': True,
  'token.DIGIT': False,
  'postag': 'NN',
  'beginning': True,
  'post_token.TITLE': False,
  'post_token.UPPER': False,
  'post_postag': 'IN'},
 {'intercept': 1,
  'suffix3': 'in',
  'prefix2': 'in',
  'prefix3': 'in',
  'wordlen': 2,
  'token.UPPER': False,
  'token.TITLE': False,
  'token.DIGIT': False,
  'postag': 'IN',
  'pre_token.TITLE': True,
  'pre_token.UPPER': False,
  'pre_postag': 'NN',
  'post_token.TITLE': False,
  'post_token.UPPER': False,
  'post_postag': 'DT'},
 {'intercept': 1,
  'suffix3': 'he',
  'prefix2': 'th',
  'prefix3': 'the',
  'wordlen': 3,
  'token.UPPER': False,
  'token.TITLE': False,
  'token.DIGIT': False,
  'postag': 'DT',
  'pre_token.TITLE': False,
  'pre_token.UPPER': False,
  'pre_postag': 'IN',
  'post_token.TITLE': False,
  'post_token.UPPER': False,
  'post_postag': 'NN'},
 {'intercept': 1,
  'suffix3': 'n

In [7]:
import sklearn_crfsuite
crf = sklearn_crfsuite.CRF()
crf.fit(X_train, Y_train)



CRF(keep_tempfiles=None)

In [8]:
labels = list(crf.classes_)

In [9]:
from sklearn import metrics

In [10]:
X_train[0]

[{'intercept': 1,
  'suffix3': 'ce',
  'prefix2': 'Co',
  'prefix3': 'Con',
  'wordlen': 10,
  'token.UPPER': False,
  'token.TITLE': True,
  'token.DIGIT': False,
  'postag': 'NN',
  'beginning': True,
  'post_token.TITLE': False,
  'post_token.UPPER': False,
  'post_postag': 'IN'},
 {'intercept': 1,
  'suffix3': 'in',
  'prefix2': 'in',
  'prefix3': 'in',
  'wordlen': 2,
  'token.UPPER': False,
  'token.TITLE': False,
  'token.DIGIT': False,
  'postag': 'IN',
  'pre_token.TITLE': True,
  'pre_token.UPPER': False,
  'pre_postag': 'NN',
  'post_token.TITLE': False,
  'post_token.UPPER': False,
  'post_postag': 'DT'},
 {'intercept': 1,
  'suffix3': 'he',
  'prefix2': 'th',
  'prefix3': 'the',
  'wordlen': 3,
  'token.UPPER': False,
  'token.TITLE': False,
  'token.DIGIT': False,
  'postag': 'DT',
  'pre_token.TITLE': False,
  'pre_token.UPPER': False,
  'pre_postag': 'IN',
  'post_token.TITLE': False,
  'post_token.UPPER': False,
  'post_postag': 'NN'},
 {'intercept': 1,
  'suffix3': 'n

In [11]:
labels

['B', 'I', 'O']

In [12]:
from sklearn_crfsuite import metrics
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    Y_test, y_pred, labels=labels, digits=3))

              precision    recall  f1-score   support

           B      0.945     0.943     0.944     23852
           I      0.915     0.915     0.915     17345
           O      0.947     0.953     0.950      6180

    accuracy                          0.934     47377
   macro avg      0.936     0.937     0.936     47377
weighted avg      0.934     0.934     0.934     47377





In [13]:
y_pred_flat = np.hstack(y_pred)

In [14]:
Y_test_flat = np.hstack(Y_test)

In [15]:
Y_test_flat

array(['B', 'I', 'I', ..., 'B', 'I', 'O'], dtype='<U1')

In [None]:
labels

['B', 'I', 'O']

In [None]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(Y_test_flat, y_pred_flat,labels=labels)
print('Confusion Matrix\n')
print(confusion)


Confusion Matrix

[[22505  1277    70]
 [ 1218 15867   260]
 [   99   192  5889]]


In [None]:
pd_confusion = pd.DataFrame((confusion), columns=list(labels),index=list(labels))

In [None]:
pd_confusion

Unnamed: 0,B,I,O
B,22505,1277,70
I,1218,15867,260
O,99,192,5889
