## Importing libraries

In [88]:
import matplotlib.pyplot as plt
import nltk
from nltk.tag import pos_tag
from sklearn_crfsuite import CRF, metrics
from sklearn.metrics import confusion_matrix
import warnings
import pandas as pd
from seqeval.metrics import classification_report, f1_score
from seqeval.metrics import accuracy_score, precision_score, recall_score
import datasets
warnings.filterwarnings('ignore')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\саня\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Loading the data

In [89]:
def extract_sentences_and_labels(data):
    sentences, labels = [], []
    for item in data:
        words = item['tokens']
        tags = item['ner_tags']
        sentences.append(words)
        labels.append(tags)
    return sentences, labels

In [90]:
def convert_labels(labels):
    label_map = {0: 'O', 1: 'B-MISC', 2: 'I-MISC', 3: 'B-PER', 4: 'I-PER', 5: 'B-ORG', 6: 'I-ORG', 7: 'B-LOC', 8: 'I-LOC'}
    return [[label_map[tag] for tag in sent] for sent in labels]

In [91]:
data = datasets.load_dataset('conll2003')
train_path = data['train']
test_path = data['test']
val_path = data['validation']
train_path[0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [92]:
train_sentences, train_labels = extract_sentences_and_labels(train_path)
train_labels = convert_labels(train_labels)

test_sentences, test_labels = extract_sentences_and_labels(test_path)
test_labels = convert_labels(test_labels)

val_sentences, val_labels = extract_sentences_and_labels(val_path)
val_labels = convert_labels(val_labels)

In [93]:
train_labels[0]

['B-PER', 'O', 'B-LOC', 'O', 'O', 'O', 'B-LOC', 'O', 'O']

## Extracting features

In [94]:
def sent2feats(sentence):
    feats = []
    sen_tags = pos_tag(sentence)
    for i in range(0, len(sentence)):
        word = sentence[i]
        word_feats = {'word': word}
        if i == 0:
            word_feats['prevWord'] = word_feats['prevSecondWord'] = "<S>"
        elif i == 1:
            word_feats['prevWord'] = sentence[0]
            word_feats['prevSecondWord'] = '</S>'
        else:
            word_feats['prevWord'] = sentence[i-1]
            word_feats['prevSecondWord'] = sentence[i-2]
        if i == len(sentence)-2:
            word_feats['nextWord'] = sentence[i+1]
            word_feats['nextNextWord'] = '</S>'
        elif i==len(sentence)-1:
            word_feats['nextWord'] = '</S>'
            word_feats['nextNextWord'] = '</S>'
        else:
            word_feats['nextWord'] = sentence[i+1]
            word_feats['nextNextWord'] = sentence[i+2]
        word_feats['tag'] = sen_tags[i][1]
        if i == 0:
            word_feats["prevTag"] = word_feats["prevSecondTag"] = "<S>"
        elif i == 1:
            word_feats["prevTag"] = sen_tags[0][1]
            word_feats["prevSecondTag"] = "</S>"
        else:
            word_feats["prevTag"] = sen_tags[i - 1][1]
            word_feats["prevSecondTag"] = sen_tags[i - 2][1]
        if i == len(sentence) - 2:
            word_feats["nextTag"] = sen_tags[i + 1][1]
            word_feats["nextNextTag"] = "</S>"
        elif i == len(sentence) - 1:
            word_feats["nextTag"] = "</S>"
            word_feats["nextNextTag"] = "</S>"
        else:
            word_feats["nextTag"] = sen_tags[i + 1][1]
            word_feats["nextNextTag"] = sen_tags[i + 2][1]
        feats.append(word_feats)
    return feats

In [95]:
def get_feats_conll(sentences, labels):
    feats = [sent2feats(sentence) for sentence in sentences]
    return feats, labels

In [96]:
def prepare_df(test_data, y_pred):
    data = []
    for item, pred in zip(test_data, y_pred):
        text = item[0]
        gt = item[1]
        data.append({'text': text, 'ground_truth': gt, 'pred': pred})
    df = pd.DataFrame(data)
    return df


### Training a model

In [97]:
def print_cm(cm, labels):
    print("\n")
    column_width = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * column_width
    print("    " + empty_cell, end=" ")
    for label in labels:
        print("%{0}s".format(column_width) % label, end=" ")
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(column_width) % label1, end=" ")
        sum = 0
        for j in range(len(labels)):
            cell = "%{0}.0f".format(column_width) % cm[i, j]
            sum = sum + int(cell)
            print(cell, end=" ")
        print(sum)

In [98]:
def get_confusion_matrix(y_true,y_pred,labels):
    trues, preds = [], []
    for yseq_true, yseq_pred in zip(y_true, y_pred):
        trues.extend(yseq_true)
        preds.extend(yseq_pred)
    print_cm(confusion_matrix(trues,preds,labels=labels),labels)

In [99]:
def train_seq(X_train, Y_train, X_dev, Y_dev, raw_test_data):
    crf = CRF(algorithm='lbfgs', c1=0.1, c2=10, max_iterations=50)
    crf.fit(X_train, Y_train)
    y_pred = crf.predict(X_dev)
    print(f"F1 Score: {f1_score(Y_dev, y_pred)}")
    print(f"Precision: {precision_score(Y_dev, y_pred)}")
    print(f"Recall: {recall_score(Y_dev, y_pred)}")
    print(f"Accuracy: {accuracy_score(Y_dev, y_pred)}")
    print(classification_report(Y_dev, y_pred))
    
    df = prepare_df(raw_test_data, y_pred)
    return crf, df

## Visualize results

In [100]:
X_train, Y_train = get_feats_conll(train_sentences, train_labels)
X_dev, Y_dev = get_feats_conll(test_sentences, test_labels)
crf, df = train_seq(X_train, Y_train, X_dev, Y_dev, list(zip(test_sentences, test_labels)))
print('Done with sequence model')

F1 Score: 0.6655570966250578
Precision: 0.6965357073737178
Recall: 0.6372167138810199
Accuracy: 0.9278992139549909
              precision    recall  f1-score   support

         LOC       0.61      0.29      0.39       702
        MISC       0.77      0.76      0.77      1617
         ORG       0.69      0.76      0.72      1668
         PER       0.65      0.54      0.59      1661

   micro avg       0.70      0.64      0.67      5648
   macro avg       0.68      0.59      0.62      5648
weighted avg       0.69      0.64      0.65      5648

Done with sequence model
