In [228]:
import os
from pathlib import Path
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pandas as pd
import pymorphy2

In [218]:
def calc_metrics(y_test, pred, proba=None, labels=None, print_=True, mode="weighted"):
    output = {}
    if proba is not None:
        roc_auc = metrics.roc_auc_score(y_test, proba)
        output["AUC"] = roc_auc
    output["Recall"] = metrics.recall_score(y_test, pred, average=mode)
    output["Precision"] = metrics.precision_score(y_test, pred, average=mode)
    output["F1"] = metrics.f1_score(y_test, pred, average=mode)
    output["accuracy"] = metrics.accuracy_score(y_test, pred)
    if labels is not None:
        index = labels
        columns = ["pred_" + el for el in index]
    else:
        columns = None
        index = None
    output["conf_matrix"] = pd.DataFrame(metrics.confusion_matrix(y_test, pred, labels=labels), 
                                         columns=columns, index=index)
    report = metrics.classification_report(y_true=y_test, y_pred=pred, labels=labels)
    if print_:
        for key, value in output.items():
            if "matrix" in key:
                print(value)
            else:
                print(f"{key}: {value:0.3f}")
        print(report)
    return output, report

#### Read train / test

In [45]:
PATH ="/home/karimlulu/repos/ner-uk/"

# Read tokens and positions of tokens from a file

def read_tokens(filename):
    tokens = []
    pos = 0
    with open(filename, "r") as f:
        text = f.read().split("\n")
        for line in text:
            if len(line) == 0:
                pos += 1
            else:
                for token in line.split(" "):
                    tokens.append((token, pos, pos + len(token)))
                    pos += len(token) + 1
    return tokens

# Read annotations and positions of annotations from a file

def read_annotations(filename):
    anno = []
    with open(filename, "r") as f:
        for line in f.readlines():
            annotations = line.split()
            anno.append((int(annotations[2]), int(annotations[3]), annotations[1]))
    return anno

def extract_labels(anno, tokens):
    labels = []
    ann_id = 0
    for token in tokens:
        if ann_id < len(anno):
            beg, end, label = anno[ann_id]
            if token[1] < beg:
                labels.append("--")
            # if token[1] == beg or (token[1] > beg and token[1] < end)
            else:
                labels.append(label)
                if token[2] == end:
                    ann_id += 1
        else:
            labels.append("--")    
    return labels

# tokens = read_tokens(PATH + "data/A_alumni.krok.edu.ua_Prokopenko_Vidrodzhennia_velotreku(5).tok.txt")
# anno = read_annotations(PATH + "data/A_alumni.krok.edu.ua_Prokopenko_Vidrodzhennia_velotreku(5).tok.ann")
# labels = extract_labels(anno, tokens)

# for i, j in zip(tokens, labels):
#     print(i[0], j)

# Extract list of files for training and testing

dev_test = {"dev": [], "test": []}
category = ""
with open(PATH + "doc/dev-test-split.txt", "r") as f:
    for line in f.readlines():
        line = line.strip()
        if line in ["DEV", "TEST"]:
            category = line.lower()
        elif len(line) == 0:
            continue
        else:
            dev_test[category].append(line)

print(len(dev_test["dev"]), len(dev_test["test"]))

# Get train and test data and labels

train_tokens, test_tokens, train_labels, test_labels = [], [], [], []

for filename in dev_test["dev"]:
    try:
        tokens = read_tokens(PATH + "data/" + filename + ".txt")
        train_tokens += tokens
        train_labels += extract_labels(read_annotations(PATH + "data/" + filename + ".ann"), tokens)
    except:
        pass

for filename in dev_test["test"]:
    try:
        tokens = read_tokens(PATH + "data/" + filename + ".txt")
        test_tokens += tokens
        test_labels += extract_labels(read_annotations(PATH + "data/" + filename + ".ann"), tokens)
    except:
        pass


156 73


In [64]:
def normalize_counter(counter):
    return  [(key, value*100/sum(counter.values())) for key, value in counter.items()]

In [65]:
c_train = Counter(train_labels)
c_test = Counter(test_labels)
normalize_counter(c_train), normalize_counter(c_test)

([('--', 90.2701838973454),
  ('ОРГ', 0.6002542743801194),
  ('ЛОК', 1.2922140629016459),
  ('РІЗН', 1.4005933068869452),
  ('ПЕРС', 6.436754458485886)],
 [('--', 88.25913820418766),
  ('ЛОК', 2.1195210025262976),
  ('ОРГ', 3.1229036724091177),
  ('ПЕРС', 5.706292907811541),
  ('РІЗН', 0.792144213065384)])

In [72]:
train_tokens[:30]

[('На', 0, 2),
 ('довірливих', 3, 13),
 ('кіровоградців', 14, 27),
 ('полюють', 28, 35),
 ('шахраї', 36, 42),
 ('та', 43, 45),
 ('фірми-посередники', 46, 63),
 (',', 64, 65),
 ('які', 66, 69),
 ('за', 70, 72),
 ('1000', 73, 77),
 ('грн', 78, 81),
 ('.', 82, 83),
 ('готові', 84, 90),
 ('«', 91, 92),
 ('виготовити', 93, 103),
 ('»', 104, 105),
 ('біометричний', 106, 118),
 ('паспорт', 119, 126),
 (',', 127, 128),
 ('який', 129, 133),
 ('коштує', 134, 140),
 ('518', 141, 144),
 ('грн', 145, 148),
 ('.', 149, 150),
 ('Із', 152, 154),
 ('запровадженням', 155, 169),
 ('біометричних', 170, 182),
 ('паспортів', 183, 192),
 ('активізувалися', 193, 207)]

#### Features

In [230]:
QUOTES = ['»', '«', '"', "'"]
morph = pymorphy2.MorphAnalyzer()

In [255]:
def word2features(record, l_context, r_context):
    word = record[0]
    p = morph.parse(word)
    pos = p[0].tag.POS
    normal_form = p[0].normal_form
    l_words = [el[0] for el in l_context]
    r_words = [el[0] for el in r_context]
    features = {
        'word.lower()': normal_form.lower() or word.lower(),
        'wort.pos': pos or "NONE",
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.isquote()': word in QUOTES
    }
    if l_context:
        for k,w in enumerate(l_words):
            prefix = f"word-{k+1}"
            p = morph.parse(w)
            pos = p[0].tag.POS
            normal_form = p[0].normal_form
            features.update({prefix+".lower()": normal_form or w.lower(),
                             prefix+".pos": pos or "NONE",
                             prefix+".isupper()": w.isupper(),
                             prefix+".istitle()": w.istitle(),
                             prefix+".isdigit()": w.isdigit(),
                             prefix+".isquote()": w in QUOTES})
    else:
        features["BOS"] = True
    if r_context:
        for k,w in enumerate(r_words):
            p = morph.parse(w)
            pos = p[0].tag.POS
            normal_form = p[0].normal_form
            prefix = f"word+{k+1}"
            features.update({prefix+".lower()": normal_form or w.lower(),
                             prefix+".pos": pos or "NONE",
                             prefix+".isupper()": w.isupper(),
                             prefix+".istitle()": w.istitle(),
                             prefix+".isdigit()": w.isdigit(),
                             prefix+".isquote()": w in QUOTES})
    else:
        features["EOS"] = True
    return features

In [186]:
def build_features(data, window=2, exclude=["."]):
    output = []
    for i,record in enumerate(data):
        l_context = []
        r_context = []
        k = 1
        l_end = False
        r_end = False
        while k <= window:
            if i+k>len(data)-1 or data[i+k] in exclude:
                r_end = True
            if i-k<0 or data[i-k] in exclude:
                l_end = True
            if not r_end:
                r_context.append(data[i+k])
            if not l_end:
                l_context.append(data[i-k])
            k += 1
        features = word2features(record, l_context, r_context)
        output.append(features)
    return output

In [256]:
window = 3
train = build_features(train_tokens, window=window)
test = build_features(test_tokens, window=window)

In [257]:
v = DictVectorizer(sparse=True)
train_v = v.fit_transform(train)
test_v = v.transform(test)

In [261]:
clf = LogisticRegression(class_weight="balanced", C=1e-1)

In [262]:
clf.fit(train_v, train_labels)

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [263]:
y_pred = clf.predict(test_v)
labels = clf.classes_
#labels.remove("--")
output, report = calc_metrics(pred=y_pred, y_test=test_labels, labels=labels)

Recall: 0.896
Precision: 0.871
F1: 0.876
accuracy: 0.896
      pred_--  pred_ЛОК  pred_ОРГ  pred_ПЕРС  pred_РІЗН
--      60792        72       162        507        304
ЛОК       973       300        18        176         18
ОРГ      1636       130       239         83        100
ПЕРС     2448       212        40       1279         19
РІЗН      291        43        38         46        137
             precision    recall  f1-score   support

         --       0.92      0.98      0.95     61837
        ЛОК       0.40      0.20      0.27      1485
        ОРГ       0.48      0.11      0.18      2188
       ПЕРС       0.61      0.32      0.42      3998
       РІЗН       0.24      0.25      0.24       555

avg / total       0.87      0.90      0.88     70063

