Подключимся к гугл-диску, чтобы получить доступ к датасету:

In [9]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Установим нужные нам версии библиотек:



In [0]:
# keras-contrib is installed from GitHub (no version pin); keras, tensorflow and
# keras_applications are pinned to fixed versions.
!pip install git+https://www.github.com/keras-team/keras-contrib.git
!pip install keras==2.2.2
!pip install tensorflow==1.15.0
!pip install keras_applications==1.0.7









Зададим пути к данным и к папке с результатами:

In [0]:
# Root folder of the experiment on the mounted Google Drive.
time_dir = '/content/gdrive/My Drive/Mezentseva Zavarzina/experiments_dl_model/'
# Folder for the labelled test files (located inside the experiment root).
result_dir = time_dir + 'test_ml/'
# Name of the sub-folder that receives the evaluation results; it is also part
# of the weights file name.
path_to_exp = '120_5_3_150_lastest'

In [10]:
import csv

import os
import zipfile
import os
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import keras
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Input
from keras.layers.merge import add, concatenate
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras_contrib.layers import CRF

ModuleNotFoundError: ignored

Распакуем архив с данными:

In [0]:
# Unpack train.zip into <time_dir>/train unless an entry named 'train' already
# exists in time_dir, then report how many entries the folder holds.
if 'train' not in os.listdir(time_dir):
    with zipfile.ZipFile(os.path.join(time_dir, 'train.zip'), mode='r') as archive:
        archive.extractall(path=os.path.join(time_dir, 'train'))
else:
    print('Files are already extracted')

print(len(os.listdir(os.path.join(time_dir, 'train'))))

Files are already extracted
256


Проверим версии установленных библиотек:

In [5]:
# Print the versions of TensorFlow and Keras that were actually imported.
print(tf.__version__)
print(keras.__version__)

1.15.0
2.2.2


Определим класс для загрузки наших данных:

In [0]:
class DatasetLoader:
    """Loads (token, tag) sequences from a directory of CSV files.

    Every ``*.csv`` file in the directory becomes one sequence ("sentence").
    Each file must have a header row with at least the ``token`` and ``tag``
    columns.
    """

    def __init__(self, path_to_data_dir):
        """
        :param path_to_data_dir: directory that contains the CSV files
        """
        self._path_to_data_dir = path_to_data_dir
        # Sorted so that the order of the loaded sequences does not depend on
        # the order in which the file system lists the directory.
        self._data_files = sorted(os.listdir(self._path_to_data_dir))

    def load_dataset(self):
        """Read every CSV file of the directory.

        :return: list with one entry per CSV file; each entry is a list of
                 ``(token, tag)`` tuples in file order
        :raises KeyError: if a row lacks the ``token`` or ``tag`` column
        """
        sentences = []
        for data_file in tqdm(self._data_files, desc='Loading data'):
            # Anything that is not a CSV file is ignored.
            if not data_file.endswith('.csv'):
                continue
            file_path = os.path.join(self._path_to_data_dir, data_file)
            # newline='' is what the csv module requires so that line breaks
            # inside quoted fields survive; the explicit encoding keeps reading
            # independent of the machine's locale.
            with open(file_path, 'r', encoding='utf-8', newline='') as data_f:
                reader = csv.DictReader(data_f)
                sentences.append([(row['token'], row['tag']) for row in reader])
        return sentences


Определим класс для векторизации данных:

In [0]:
class Vectorizer:
    """Converts (token, tag) sequences into padded arrays for the tagger.

    Every character of a token is one-hot encoded over
    ``len(self._letters) + 2`` classes: one class per Latin letter
    (case-sensitive), one shared class for all digits and one shared class for
    all punctuation marks. Any other character is encoded as an all-zero row.

    NOTE: earlier revisions wrote the digit flag at offset
    ``len(self._letters) + 2``, which is one past the end of the character's
    row. Digits therefore ended up in slot 0 of the *next* character, and a
    digit in the last position raised IndexError. Digits now use the
    previously unused slot ``len(self._letters)``. Weights trained with the
    old encoding should be retrained or re-validated.
    """

    def __init__(self):
        self._max_sentence_len = 1000  # tokens per sequence after padding
        self._max_wordform_len = 30    # characters kept per token
        self._all_chars = u'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM,.?!:;"«»-—1234567890'
        self._punct = u',.?!:;"«»-—'
        self._letters = u'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM'
        self._numbers = u'1234567890'
        # One slot per letter plus one for digits and one for punctuation.
        self._len_all_chars = len(self._letters) + 2
        self._digit_index = len(self._letters)
        self._punct_index = len(self._letters) + 1

        self._tag2id = {'O': 0, 'B-TIME': 1, 'I-TIME': 2, 'B-DATE': 3, 'I-DATE': 4, 'B-DURATION': 5, 'I-DURATION': 6,
                        'B-SET': 7, 'I-SET': 8}

    def get_len_all_chars(self):
        """Return the width of the one-hot vector of a single character."""
        return self._len_all_chars

    def vectorize_chars(self, sentences):
        """Vectorize the tokens of ``sentences``; see ``vectorize_chars_dataset``."""
        return self.vectorize_chars_dataset(sentences)

    def vectorize_targets(self, sentences):
        """Encode the tags of ``sentences`` as padded integer ids.

        :param sentences: list of sequences of ``(token, tag)`` pairs
        :return: array of shape ``(len(sentences), max_sentence_len, 1)``;
                 missing positions are filled with the id of ``'O'``
        :raises KeyError: if a tag is not present in the tag dictionary
        """
        y = [[self._tag2id[w[1]] for w in s] for s in sentences]
        # truncating="post" keeps the beginning of over-long sequences, so that
        # position i always refers to the i-th token (the default cuts the
        # beginning, although padding is added at the end).
        y = pad_sequences(maxlen=self._max_sentence_len, sequences=y, padding="post", truncating="post",
                          value=self._tag2id["O"])
        y = y.reshape(y.shape[0], y.shape[1], 1)
        return y

    def vectorize_chars_dataset(self, sentences):
        """One-hot encode the characters of every token and pad the sequences.

        :param sentences: list of sequences of ``(token, tag)`` pairs
        :return: padded array produced by ``pad_sequences``; padding positions
                 are all-zero character matrices
        """
        X_chars = [[self._vectorize_chars_wordform(w[0]) for w in s] for s in sentences]
        X_chars = pad_sequences(maxlen=self._max_sentence_len, sequences=X_chars, padding="post",
                                truncating="post",
                                value=np.zeros((self._max_wordform_len, self._len_all_chars), dtype=np.int32))
        return X_chars

    def _vectorize_chars_wordform(self, wordform):
        """One-hot encode the first ``max_wordform_len`` characters of a token.

        :param wordform: token text
        :return: float array of shape ``(max_wordform_len, len_all_chars)``
        """
        vector = np.zeros((self._max_wordform_len, self._len_all_chars))
        for i, char in enumerate(wordform[:self._max_wordform_len]):
            # The three character sets are disjoint, so at most one slot is set.
            if char in self._letters:
                vector[i, self._letters.index(char)] = 1.0
            elif char in self._numbers:
                vector[i, self._digit_index] = 1.0
            elif char in self._punct:
                vector[i, self._punct_index] = 1.0
        return vector

Определим класс для обучения модели:

In [0]:
class Trainer:
    """Builds the character-level BiLSTM + CRF tagger and trains it.

    The training files are read from ``<time_dir>/train``; the best weights
    (by validation loss) are written to ``<time_dir>/weights_nastya``.
    """

    def __init__(self):
        data_dir = os.path.join(time_dir, 'train')
        self._data_loader = DatasetLoader(data_dir)
        self._vectorizer = Vectorizer()

        self._sentences = self._data_loader.load_dataset()
        # Fixed split: the first 200 sequences are used for training, the last
        # 50 for validation. With fewer than 250 sequences the two slices
        # overlap.
        self._val_sentences = self._sentences[-50:]
        self._train_samples = self._sentences[:200]

        self._batch_size = 5
        self._max_len = 1000
        # Integer number of batches needed to see every training sequence once.
        self.steps_per_epoch = self._count_batches(self._train_samples)

        self._model = self._define_model()

    def _count_batches(self, samples):
        """Return how many batches cover ``samples`` once (at least 1)."""
        return max(1, -(-len(samples) // self._batch_size))

    def _define_model(self):
        """Create and compile the model.

        Input: ``(max_len tokens, 30 characters, one-hot width)``. The 30 has
        to agree with the number of characters the Vectorizer keeps per token.
        Each token's characters go through a bidirectional LSTM; a CRF layer
        with 9 output tags labels the token sequence.

        :return: compiled Keras model
        """
        input_chars = Input(shape=(self._max_len, 30, self._vectorizer.get_len_all_chars()))
        chars = TimeDistributed(Bidirectional(LSTM(units=150,
                                                   recurrent_dropout=0.5)))(input_chars)

        crf = CRF(9, sparse_target=True)  # CRF layer
        out = crf(chars)  # output

        model = Model([input_chars], out)
        model.summary()

        model.compile(optimizer="adam", loss=crf.loss_function, metrics=[crf.accuracy])
        return model

    def _generate_batches(self, samples):
        """Endlessly yield ``([X_chars], y)`` batches, cycling through ``samples``.

        The position wraps around as soon as it reaches or passes the end of
        ``samples``, so a sample count that is not a multiple of the batch
        size simply produces a shorter last batch instead of empty batches.

        :raises ValueError: if ``samples`` is empty
        """
        if not samples:
            raise ValueError('Cannot generate batches from an empty sample list')
        start = 0
        while True:
            batch = samples[start:start + self._batch_size]
            X_chars = self._vectorizer.vectorize_chars(batch)
            y = self._vectorizer.vectorize_targets(batch)
            yield [np.array(X_chars)], np.array(y)
            start += self._batch_size
            if start >= len(samples):
                start = 0

    def _generate_train_samples(self):
        """Infinite batch generator over the training sequences."""
        return self._generate_batches(self._train_samples)

    def _generate_val_samples(self):
        """Infinite batch generator over the validation sequences."""
        return self._generate_batches(self._val_sentences)

    def train(self):
        """Fit the model, checkpointing the best weights and stopping early.

        :return: the object returned by ``fit_generator``
        """
        weights_dir = os.path.join(time_dir, 'weights_nastya')
        os.makedirs(weights_dir, exist_ok=True)
        weights_file = 'weights_chars_crf_' + path_to_exp + '_.h5'
        modelPath = os.path.join(weights_dir, weights_file)
        # Keep only the weights with the lowest validation loss seen so far.
        saver = ModelCheckpoint(modelPath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
        # Stop once the validation loss has not improved for 10 epochs.
        stopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='auto')
        history = self._model.fit_generator(self._generate_train_samples(), epochs=120,
                            validation_data=self._generate_val_samples(),
                            steps_per_epoch=self.steps_per_epoch,
                            validation_steps=self._count_batches(self._val_sentences),
                            verbose=1, callbacks=[saver, stopper])
        return history

In [0]:
# Trainer() loads the dataset and builds the model; train() fits it and
# checkpoints the best weights to disk.
trainer = Trainer()
trainer.train()

Теперь давайте попробуем найти временные выражения, используя обученную модель. Для этого напишем класс **Predictor**:

In [0]:
from nltk.tokenize import WordPunctTokenizer


class Predictor:
    """Tags the tokens of a raw text with time-expression labels."""

    # Each input sequence is fed to the model this many times and only the
    # first output is used.
    # NOTE(review): the reason for repeating the input is not visible in this
    # notebook; presumably a batch-size workaround - confirm before reducing.
    _INPUT_COPIES = 20

    def __init__(self):
        self._vectorizer = Vectorizer()
        # The Trainer is created only to obtain a model with the right
        # architecture; constructing it also loads the training data.
        self._trainer = Trainer()
        self._tokenizer = WordPunctTokenizer()
        self._model = self._trainer._model
        weights_path = os.path.join(time_dir, 'weights_nastya', 'weights_chars_crf_' + path_to_exp + '_.h5')
        self._model.load_weights(weights_path)
        self._tag2id = {'O': 0, 'B-TIME': 1, 'I-TIME': 2, 'B-DATE': 3, 'I-DATE': 4, 'B-DURATION': 5, 'I-DURATION': 6,
                        'B-SET': 7, 'I-SET': 8}
        self._id2tag = self._get_id2tag()

    def _get_id2tag(self):
        """Return the inverse of the tag dictionary (id -> tag)."""
        return {tag_id: tag for tag, tag_id in self._tag2id.items()}

    def _vectorize_tokens(self, tokens):
        """Build the model input for one token sequence."""
        tokens_to_process = [(token, 'O') for token in tokens]
        input_tokens = [tokens_to_process] * self._INPUT_COPIES
        return self._vectorizer.vectorize_chars(sentences=input_tokens)

    def _process_input_sentence(self, sentence):
        """Tokenize ``sentence`` and vectorize its tokens.

        :return: ``(X_chars, tokens)``
        """
        tokens = self._tokenizer.tokenize(sentence)
        return self._vectorize_tokens(tokens), tokens

    def predict(self, text):
        """Tag every token of ``text``.

        The text is processed in windows of at most ``max_len`` tokens (the
        sequence length the model was built with), so longer texts no longer
        fail with IndexError. Text without tokens yields an empty list.

        :param text: raw text
        :return: list of ``(token, tag)`` tuples, one per token
        """
        tokens = self._tokenizer.tokenize(text)
        max_len = self._trainer._max_len
        result = []
        for start in range(0, len(tokens), max_len):
            window = tokens[start:start + max_len]
            X_chars = self._vectorize_tokens(window)
            # All copies of the input are identical; the first output is used.
            predicts = self._model.predict([X_chars])[0]
            for i, token in enumerate(window):
                tag = self._id2tag[np.argmax(predicts[i])]
                result.append((token, tag))
        return result


In [10]:
# Predictor() creates its own Trainer (which loads the dataset and builds the
# model) and then loads the saved weights into that model.
predictor = Predictor()

Loading data: 100%|██████████| 256/256 [00:00<00:00, 372.16it/s]










Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000, 30, 54)      0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 1000, 300)         246000    
_________________________________________________________________
crf_1 (CRF)                  (None, 1000, 9)           2808      
Total params: 248,808
Trainable params: 248,808
Non-trainable params: 0
_________________________________________________________________

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where












In [11]:
# Smoke test: tag a short sentence and print one (token, tag) pair per line.
text = 'he was born in 1994'
result = predictor.predict(text)
for r in result:
    print(r)

('he', 'O')
('was', 'O')
('born', 'O')
('in', 'O')
('1994', 'B-DATE')


Разметим тестовый датасет и получим метрики:

In [0]:
class Labeler:
    """Tags every CSV file of a dataset with the global ``predictor``.

    For each input file a file with the same name is written to
    ``<result_dir>/<path_to_exp>``; it repeats the ``token`` and ``tag``
    columns and adds one column with the predicted tag.
    """

    def __init__(self, path_to_data_dir):
        """
        :param path_to_data_dir: directory with the CSV files to label
        """
        self._path_to_data_dir = path_to_data_dir
        self._data_files = sorted(os.listdir(self._path_to_data_dir))

    @staticmethod
    def _align_predictions(tokens, predictions):
        """Return one predicted tag per original token.

        The predictor re-tokenizes the space-joined text, so one original
        token may come back as several pieces. Pieces are consumed until their
        combined length covers the token (ignoring whitespace inside it); the
        tag of the first piece is used. A token without any matching piece
        receives ``'O'``.

        :param tokens: original tokens
        :param predictions: ``(piece, tag)`` pairs returned by the predictor
        :return: list of tags, same length as ``tokens``
        """
        tags = []
        pos = 0
        for token in tokens:
            remaining = len(''.join(token.split()))
            tag = None
            while remaining > 0 and pos < len(predictions):
                piece, piece_tag = predictions[pos]
                if tag is None:
                    tag = piece_tag
                remaining -= len(piece)
                pos += 1
            tags.append('O' if tag is None else tag)
        return tags

    def label_dataset(self, column_name):
        """Write the labelled copy of every CSV file of the dataset.

        :param column_name: name of the column that receives the predicted tag
        """
        test_dir = os.path.join(result_dir, path_to_exp)
        # The output folder is created on demand; opening a file inside a
        # missing folder would fail.
        os.makedirs(test_dir, exist_ok=True)
        for data_file in tqdm(self._data_files, desc='Adding data'):
            if not data_file.endswith('.csv'):
                continue
            path_to_test_file = os.path.join(self._path_to_data_dir, data_file)
            with open(path_to_test_file, 'r', encoding='utf-8', newline='') as data_f:
                data = [(row['token'], row['tag']) for row in csv.DictReader(data_f)]

            tokens = [token for token, _ in data]
            predictions = predictor.predict(' '.join(tokens)) if tokens else []
            predicted_tags = self._align_predictions(tokens, predictions)

            with open(os.path.join(test_dir, data_file), 'w', encoding='utf-8', newline='') as task:
                fieldnames = ['token', 'tag', column_name]
                writer = csv.DictWriter(task, fieldnames=fieldnames)
                writer.writeheader()
                for (token, tag), pred_tag in zip(data, predicted_tags):
                    writer.writerow({'token': token, 'tag': tag, column_name: pred_tag})

In [0]:
# Label every file under <time_dir>/test; the predictions are stored in the
# 'predicted_ml_tag' column of the output files.
ml_label = Labeler(os.path.join(time_dir, 'test'))
tag = 'predicted_ml_tag'
ml_label.label_dataset(tag)

In [11]:
# seqeval supplies classification_report, which the Evaluator class below uses.
!pip install seqeval
from seqeval.metrics import classification_report



In [0]:
class Evaluator:
    """Evaluates the quality of time-expression extraction."""

    def __init__(self, path, column_pred):
        """
        :param path: directory with the CSV files that contain both the gold
                     tags (column ``tag``) and the predicted tags
        :param column_pred: name of the column with the predicted tags
        """
        self._predicted_files_dir = path
        self._predicted_files = sorted(os.listdir(self._predicted_files_dir))
        self.column_pred = column_pred

    def evaluate(self) -> str:
        """Evaluate the quality of time-expression extraction.

        :return: the report produced by ``classification_report`` for the
                 loaded gold and predicted tag sequences
        """
        preds, targets = self._load_predictons()
        return classification_report(y_true=targets, y_pred=preds)

    def _load_predictons(self):
        """Load targets and predictions from the prepared CSV files.

        Only ``*.csv`` entries of the directory are read. The files are opened
        as UTF-8 with ``newline=''``.

        :return: ``(predictions, targets)`` - two lists with one list of tags
                 per file
        :raises KeyError: if a row lacks the prediction column or ``tag``
        """
        preds = []
        targets = []
        for file_name in self._predicted_files:
            if not file_name.endswith('.csv'):
                continue
            file_path = os.path.join(self._predicted_files_dir, file_name)
            with open(file_path, 'r', encoding='utf-8', newline='') as csv_file:
                rows = list(csv.DictReader(csv_file))
            preds.append([row[str(self.column_pred)] for row in rows])
            targets.append([row['tag'] for row in rows])
        return preds, targets


In [20]:
# NOTE(review): this evaluates the 'combined' folder and the
# 'predicted_combined_tag' column, not the folder (result_dir + path_to_exp) and
# column ('predicted_ml_tag') written by the Labeler cell - confirm this is intended.
evaluator = Evaluator('/content/gdrive/My Drive/Mezentseva Zavarzina/experiments_dl_model/combined', 'predicted_combined_tag')
report = evaluator.evaluate()
print(report)

           precision    recall  f1-score   support

 DURATION       0.58      0.20      0.30        35
     DATE       0.75      0.78      0.76       101
     TIME       0.50      0.25      0.33         4
      SET       1.00      0.25      0.40         4

micro avg       0.73      0.61      0.66       144
macro avg       0.71      0.61      0.63       144

