In [None]:
import jsonlines
import pandas as pd
import regex as re
import numpy as np
import matplotlib.pyplot as plt
import csv

import pymorphy2
import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow import keras

In [None]:
# Install the third-party packages this notebook relies on.
# NOTE(review): this cell comes after the import cell, so on a fresh kernel the
# imports may fail until these installs have been run once — confirm cell order.
! pip -q install git+https://www.github.com/keras-team/keras-contrib.git sklearn-crfsuite
! pip install regex
# Pins standalone Keras to 2.2.4 — presumably for keras-contrib compatibility; TODO confirm.
! pip install keras==2.2.4
! pip install pymorphy2
# NOTE(review): no `load` module is imported in the visible cells — confirm this package is needed.
! pip install load

In [None]:
# Hyperparameters shared by the preprocessing, model and training cells.
BATCH_SIZE = 32  # batch_size passed to model.fit
EPOCHS = 12  # epochs passed to model.fit
MAX_LEN = 75  # maxlen for pad_sequences and the length of the model's Input
EMBEDDING = 20  # output_dim of the Embedding layer

In [None]:
# Morphological analyzer; the CSV conversion cell uses it to attach a POS value to every token.
morph = pymorphy2.MorphAnalyzer()

In [None]:
# Open the corpus file as a jsonlines reader.
# NOTE(review): `reader` is not used by any later visible cell (the conversion
# cell opens the file again in its own `with` block) and is never closed here.
reader = jsonlines.open("nerus_lenta.jsonl")

In [None]:
def getTag(word):
    """Return the entity tag stored for ``word`` in the current sentence.

    Reads two module-level names set by the conversion loop: the sentence
    counter ``i`` and the ``sentense_to_tag`` mapping. When the word has no
    (truthy) tag recorded, the outside tag ``'O'`` is returned.
    """
    current_tags = sentense_to_tag.get('Sentence ' + str(i))
    tag = current_tags.get(word)
    return tag if tag else 'O'
# Convert corpus records into a token-per-row CSV: sentence id, token, POS, entity tag.
MAX_RECORDS = 10000  # cut-off is checked after each record, so up to MAX_RECORDS + 1 records are converted
PROGRESS_EVERY = 1000  # print a progress line every this many records

# newline='' is what the csv module expects for files handed to csv.writer;
# utf-8 is set explicitly so the Cyrillic text does not depend on the platform default encoding.
with open('data.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    with jsonlines.open("nerus_lenta.jsonl") as records:
        # `i` (sentence counter) and `sentense_to_tag` stay module-level names
        # because getTag() reads them.
        i = 1
        for item in records:
            # placeholder ("word_<n>") -> entity text sliced out of the raw content
            word_map = {}
            # entity text -> entity type for this record
            word_to_tag = {}
            for j, annotation in enumerate(item['annotations'], start=1):
                span = annotation['span']
                word_map['word_' + str(j)] = item['content'][int(span['start']):int(span['end'])]
                word_to_tag[annotation['text']] = annotation['type']
            sentense_to_tag = {'Sentence ' + str(i): word_to_tag}

            # Strip punctuation, blank out digits, then collapse whitespace runs.
            text = re.sub('[,.—?!)(«»]', '', item['content'])
            text = re.sub('[0-9]', ' ', text)
            text = re.sub(r"\s\s+", ' ', text)

            # Swap each entity for its placeholder so that an entity made of
            # several words survives the split below as a single token.
            for mapped, word in word_map.items():
                text = text.replace(word, mapped, 1)

            # Skip the empty strings that split(" ") yields around leading or
            # trailing spaces, so no blank-token rows are written.
            content_to_list = [token for token in text.split(" ") if token]
            # stop_words=stopwords.words('russian')
            # content_to_list = [word for word in content_to_list if word not in stop_words]

            # Put the entity text back in place of its placeholder.
            words = [word_map.get(token, token) for token in content_to_list]

            for word in words:
                writer.writerow(['Sentence:' + str(i), word, morph.parse(word)[0].tag.POS, getTag(word)])

            if i % PROGRESS_EVERY == 0:
                print(i)
            if i > MAX_RECORDS:
                break
            i += 1

In [None]:
# Load the token-level dataset; the file is read without a header row, so the column names are given here.
# NOTE(review): the conversion cell writes 'data.csv', while this cell reads
# "data_set_200_hundred.csv" — confirm which file is intended.
df = pd.read_csv("data_set_200_hundred.csv",header=None,names=['Sentence','Word','Pos','Tag'])
df

In [None]:
# Distinct tokens and distinct tags found in the dataset (set iteration order is arbitrary).
words = list(set(df['Word'].values))
tags = list(set(df['Tag'].values))

# Vocabulary size and label-set size, used later to size the model layers.
n_words, n_tags = len(words), len(tags)

In [None]:
class SentenceBuilder(object):
    """Group a token-level DataFrame into per-sentence lists of (word, pos, tag) tuples.

    Attributes:
        n_sent: number of the sentence that the next ``get_next`` call looks up (starts at 1).
        data: the DataFrame passed to the constructor.
        empty: initialised to ``False``; not otherwise used inside this class.
        grouped: result of grouping ``data`` by the "Sentence" column; indexed by
            sentence id, each value being that sentence's list of tuples.
        sentences: the values of ``grouped`` collected into a plain list.
    """

    def __init__(self, data):
        """Build the per-sentence grouping.

        Args:
            data: DataFrame with "Sentence", "Word", "Pos" and "Tag" columns.
        """
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, pos, tag) tuple per row of the sentence group.
            return list(zip(s["Word"].values.tolist(),
                            s["Pos"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence and advance the counter.

        Returns:
            The list of (word, pos, tag) tuples stored under
            ``"Sentence:<n_sent>"``, or ``None`` when no such sentence exists
            (the counter is left unchanged in that case).
        """
        try:
            s = self.grouped["Sentence:{}".format(self.n_sent)]
        except KeyError:
            # Only a missing sentence id means "no more sentences"; any other
            # error is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s

In [None]:
# Build the sentence index and print the first sentence as a sanity check.
getter = SentenceBuilder(df)
sent = getter.get_next()
print('Sentence looks like:', sent, sep='\n')

In [None]:
# All sentences, each one a list of (word, pos, tag) tuples
sentences = getter.sentences

# Histogram of sentence lengths, measured in tokens
sentence_lengths = list(map(len, sentences))
plt.hist(sentence_lengths, bins=50)
plt.title('Token persentence')
plt.xlabel('Len (number of token)')
plt.ylabel('# samples')
plt.show()

In [None]:
# word -> index; indices 0 and 1 are reserved for the PAD and UNK entries
word2idx = {word: idx for idx, word in enumerate(words, start=2)}
word2idx.update(UNK=1, PAD=0)  # UNK: unknown words, PAD: padding

# index -> word (inverse of word2idx)
idx2word = {idx: word for word, idx in word2idx.items()}

# tag -> index; index 0 is reserved for the PAD entry
tag2idx = {tag: idx for idx, tag in enumerate(tags, start=1)}
tag2idx["PAD"] = 0

# index -> tag (inverse of tag2idx)
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

RANDOM_STATE = 42  # fixed seed so the train/test split is the same on every run

# Encode each sentence as a list of word indices, then pad it to MAX_LEN with the PAD index.
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=MAX_LEN, sequences=X, padding="post", value=word2idx["PAD"])

# Encode each sentence's tags as tag indices and pad them the same way.
y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=MAX_LEN, sequences=y, padding="post", value=tag2idx["PAD"])

# One-hot encode every tag row; n_tags + 1 classes because of the extra PAD tag.
y = [to_categorical(row, num_classes=n_tags+1) for row in y]

# Hold out 10% of the sentences for testing.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, random_state=RANDOM_STATE)
# Print the shapes explicitly: a bare expression in the middle of a cell is not displayed.
print('Shapes (X_tr, X_te, y_tr, y_te):',
      X_tr.shape, X_te.shape, np.array(y_tr).shape, np.array(y_te).shape)
print()

print('Raw Sample: ', ' '.join([w[0] for w in sentences[0]]))
print()
print('Raw Label: ', ' '.join([w[2] for w in sentences[0]]))
print()
print('After processing, sample:\n\n', X[0])
print()
print('After processing, labels:\n\n', y[0])

In [None]:
# Clear the Keras backend session before the model is (re)built.
# NOTE(review): the name `keras` is bound to `tensorflow.keras` by the import cell, while the
# model cell imports from the standalone `keras` package — confirm this clears the intended backend.
keras.backend.clear_session()

In [None]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

# Model definition: Embedding -> BiLSTM -> TimeDistributed(Dense) -> CRF.
# The input tensor is called `input_layer` so the builtin `input` is not shadowed,
# and intermediate tensors use `hidden` so `model` only ever names the final Model.
input_layer = Input(shape=(MAX_LEN,))
hidden = Embedding(input_dim=n_words+2, output_dim=EMBEDDING,  # n_words + 2 (PAD & UNK)
                   input_length=MAX_LEN, mask_zero=True)(input_layer)  # index 0 (PAD) is masked
hidden = Bidirectional(LSTM(units=50, return_sequences=True,
                            recurrent_dropout=0.1))(hidden)  # variational biLSTM
hidden = TimeDistributed(Dense(50, activation="relu"))(hidden)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
out = crf(hidden)  # output

model = Model(input_layer, out)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()

In [None]:
# Train on the training split; validation_split=0.1 holds out a tenth of it for validation.
# The returned training history is kept in `history`.
history = model.fit(np.array(X_tr), np.array(y_tr), batch_size=BATCH_SIZE, epochs=EPOCHS,validation_split=0.1, verbose=2)

In [None]:
# Predict on the held-out test split and reduce the last axis to class indices with argmax.
pred_cat = model.predict(X_te)
pred = np.argmax(pred_cat, axis=-1)
# The true labels were one-hot encoded, so argmax over the last axis recovers the tag indices.
y_te_true = np.argmax(y_te, -1)

In [None]:
from sklearn_crfsuite.metrics import flat_classification_report

# Map predicted and true tag indices back to tag strings, row by row.
# Padding positions are not filtered out, so the PAD tag is part of the report.
pred_tag = [list(map(idx2tag.__getitem__, row)) for row in pred]
y_te_true_tag = [list(map(idx2tag.__getitem__, row)) for row in y_te_true]

report = flat_classification_report(y_true=y_te_true_tag, y_pred=pred_tag)
print(report)

In [None]:
# Pick one random test sentence and print true vs. predicted tags token by token.
i = np.random.randint(0,X_te.shape[0])  # random index in [0, len(X_te))
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)

true = np.argmax(y_te[i], -1)

print("Sample number {} of {} (Test Set)".format(i, X_te.shape[0]))
# Visualization
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
# The loop variable is `pred_idx`, not `pred`, so the global `pred` array
# from the prediction cell is not overwritten by this cell.
for w, t, pred_idx in zip(X_te[i], true, p[0]):
    if w != word2idx["PAD"]:  # skip padding positions
        print("{:15}: {:5} {}".format(idx2word[w], idx2tag[t], idx2tag[pred_idx]))

In [None]:
from ipywidgets import interact_manual
from ipywidgets import widgets

import re
import string

# Custom tokenizer: every punctuation character becomes its own token.
# string.punctuation is passed through re.escape so that characters which are
# special inside a character class (']', '\', '^', '-', '[') are matched literally.
re_tok = re.compile('([' + re.escape(string.punctuation) + '“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into tokens, treating each punctuation character as a separate token.

    Args:
        s: the text to tokenize.

    Returns:
        A list of token strings; an empty list for empty or whitespace-only input.
    """
    # Pad each punctuation character with spaces, then split on whitespace.
    return re_tok.sub(r' \1 ', s).split()
    
def get_prediction(sentence):
    """Tokenize ``sentence``, tag it with the trained model and print one tag per token.

    Args:
        sentence: raw text to tag.

    Returns:
        None; the token/tag table is printed.
    """
    test_sentence = tokenize(sentence) # Tokenization
    # Preprocessing
    # Out-of-vocabulary tokens map to the UNK index. The PAD index must not be
    # used for real tokens: the Embedding layer is built with mask_zero=True,
    # so index 0 is treated as padding.
    unk_idx = word2idx["UNK"]
    x_test_sent = pad_sequences(sequences=[[word2idx.get(w, unk_idx) for w in test_sentence]],
                                padding="post", value=word2idx["PAD"], maxlen=MAX_LEN)
    # Evaluation
    p = model.predict(np.array([x_test_sent[0]]))
    p = np.argmax(p, axis=-1)
    # Visualization
    print("{:15}||{}".format("Word", "Prediction"))
    print(30 * "=")
    # pad_sequences drops tokens from the front of an over-long input by default
    # (truncating='pre'), so the predictions belong to the last MAX_LEN tokens.
    for w, pred_idx in zip(test_sentence[-MAX_LEN:], p[0]):
        print("{:15}: {:5}".format(w, idx2tag[pred_idx]))

# Interactive demo: a text area whose content is passed to get_prediction as `sentence`.
interact_manual(get_prediction, sentence=widgets.Textarea(placeholder='Type your sentence here'));

In [None]:
import pickle

# Save the word -> index vocabulary. `with` closes the file once the dump is done.
with open("models/word_to_index.pickle", "wb") as vocab_file:
    pickle.dump(word2idx, vocab_file)

# Save the index -> tag mapping.
# NOTE(review): the file is named tag_to_index but the object written is idx2tag
# (index -> tag) — confirm that this is what the loading side expects.
with open("models/tag_to_index.pickle", "wb") as tags_file:
    pickle.dump(idx2tag, tags_file)

# Save the model weights
model.save_weights('models/lstm_crf_weights.h5')

In [None]:
# Serialize the model to a JSON string and write it to models/model.json
# (the weights are written separately by save_weights in the previous cell).
model_json = model.to_json()
with open("models/model.json","w") as json_file:
    json_file.write(model_json)