In [1]:
import re

import pandas as pd
import torch

from collections import defaultdict

from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel


In [2]:
ENTITIES = ['PERSON',
 'PROFESSION',
 'ORGANIZATION',
 'EVENT',
 'DATE',
 'COUNTRY',
 'CITY',
 'NUMBER',
 'AGE',
 'ORDINAL',
 'NATIONALITY',
 'FACILITY',
 'STATE_OR_PROVINCE',
 'LAW',
 'AWARD',
 'LOCATION',
 'IDEOLOGY',
 'WORK_OF_ART',
 'PRODUCT',
 'CRIME',
 'DISEASE',
 'TIME',
 'MONEY',
 'DISTRICT',
 'PENALTY',
 'RELIGION',
 'PERCENT',
 'LANGUAGE',
 'FAMILY']

In [3]:
train_json = pd.read_json("../data/public_dat/train.jsonl", lines=True)
train_json

Unnamed: 0,ners,sentences,id
0,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS...",Бостон взорвали Тамерлан и Джохар Царнаевы из ...,0
1,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],...",Умер избитый до комы гитарист и сооснователь г...,1
2,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O...",Путин подписал распоряжение о выходе России из...,2
3,"[[0, 11, PERSON], [36, 47, PROFESSION], [49, 6...",Бенедикт XVI носил кардиостимулятор\nПапа Римс...,3
4,"[[0, 4, PERSON], [17, 29, ORGANIZATION], [48, ...",Обама назначит в Верховный суд латиноамериканк...,4
...,...,...,...
514,"[[42, 46, COUNTRY], [82, 87, COUNTRY], [104, 1...",Глава Малайзии: мы не хотим противостоять Кита...,514
515,"[[1, 4, PRODUCT], [31, 33, FACILITY], [35, 44,...",«Союз» впервые пристыковался к МКС за 6 часов\...,515
516,"[[0, 4, PERSON], [8, 12, PERSON], [45, 52, AGE...",Трамп и Путин сделали совместное заявление к 7...,516
517,"[[0, 9, NATIONALITY], [58, 72, PERSON], [101, ...",Российский магнат устроил самую дорогую свадьб...,517


In [4]:
train, test = train_test_split(train_json, test_size=0.2)
train.shape, test.shape

((415, 3), (104, 3))

In [5]:
def tokenize(sentence: str) -> list[tuple[str, int, int]]:
    matches = [(match.start(), match.end()) for match in re.finditer(r'\w+', sentence)]
    matches = list(map(lambda x: [sentence[x[0]:x[1]], *x], matches))
    return matches

stemer = SnowballStemmer("russian") 

def normalise(tokens: list[tuple[str, int, int]]) -> list[tuple[str, int, int]]:
    for i in range(len(tokens)):
        tokens[i][0] = stemer.stem(tokens[i][0])
    return tokens
print(tokenize("Главы Малайзии: мы не хотим противостоять Ки"))
normalise(tokenize("Главы Малайзии: мы не хотим противостоять Ки"))

[['Главы', 0, 5], ['Малайзии', 6, 14], ['мы', 16, 18], ['не', 19, 21], ['хотим', 22, 27], ['противостоять', 28, 41], ['Ки', 42, 44]]


[['глав', 0, 5],
 ['малайз', 6, 14],
 ['мы', 16, 18],
 ['не', 19, 21],
 ['хот', 22, 27],
 ['противостоя', 28, 41],
 ['ки', 42, 44]]

In [6]:
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

In [7]:

# train_data = defaultdict(lambda: [0 for _ in range(len(ENTITIES))])
X = []
Y = []
for _, row in train.iterrows():
    tokens = tokenize(row.sentences)
    tokens = normalise(tokens)
    labels = []
    for i, token in enumerate(tokens):
        token_cat = "o"
        for (l_idx, r_idx, cat) in row.ners:
            if token[1] >= l_idx and token[2] <= r_idx:
                token_cat = cat
                # print(cat , l_idx, r_idx, "-->", token)
                break
        labels.append(token_cat)
    
    tokens = list(map(lambda x: x[0], tokens))
    tokens = sent2features(tokens)
    X.append(tokens)
    Y.append(labels)
    


In [8]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1734)

# # Train CRF model
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=500,
    all_possible_transitions=True
)
crf.fit(x_train, y_train)

In [9]:
# evaluation
from sklearn.metrics import f1_score

labels = list(crf.classes_)

y_train_pred = crf.predict(x_train)
print("train f1 flat score -->", metrics.flat_f1_score(y_train, y_train_pred, average='macro', labels=labels))

y_pred = crf.predict(x_test)
print("train f1 flat score -->", metrics.flat_f1_score(y_test, y_pred, average='macro', labels=labels))


train f1 flat score --> 0.20799690374059787
train f1 flat score --> 0.19605272169727198


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [10]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=500,
    all_possible_transitions=True
)
crf.fit(X, Y)

In [11]:
submission_json = pd.read_json("../data/public_dat/dev.jsonl", lines=True)
submission_json.head()

Unnamed: 0,senences,id
0,Генерал Д.Петреус назначен на пост главы ЦРУ.\...,519
1,Подозреваемые в нападении на Charlie Hebdo зах...,520
2,Скончалась Джанет Рено — первая женщина-генпро...,521
3,Школьник из Иванова получил «Золотой крест» за...,522
4,Врачи установили причину смерти Сергея Доренко...,523


In [12]:
submission_json.shape

(65, 2)

In [13]:
sub_x = []
submission_tokens = []
for i, (sent, id) in submission_json.iterrows():
    cur_tokens = normalise(tokenize(sent))
    submission_tokens.append(cur_tokens)
    cur_tokens = list(map(lambda x: x[0], cur_tokens))
    cur_tokens = sent2features(cur_tokens)
    sub_x.append(cur_tokens)

In [14]:
result = crf.predict(sub_x)

In [15]:
submission_tokens[0][0]

['генера', 0, 7]

In [16]:
import json

with open("test.jsonl", "w") as f:
    
    for i in range(len(result)):
        obj = {"ners": []}
        for j in range(len(submission_tokens[i])):
            if result[i][j] == "o":
                continue
            obj["ners"].append([submission_tokens[i][j][1], submission_tokens[i][j][2], result[i][j]])
        obj["id"] = int(submission_json.iloc[i].id)
        f.write(json.dumps(obj) + "\n")

In [17]:
len(result), len(sub_x)

(65, 65)