## Configure environment

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [4]:
# Basic
import os
import re
import sys
import glob
import gc
import pickle
import psutil
import psycopg2
import datetime
import numpy as np
import pandas as pd

from pathlib import Path
from pprint import pprint
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict, namedtuple, deque, Counter
from typing import (List, Dict, Any, NoReturn, 
                    Tuple, Optional, Union)
from tqdm import tqdm
from datetime import datetime as dt
from sklearn.model_selection import train_test_split

import logging
import multiprocessing
from multiprocessing_logging import install_mp_handler
# Root logger setup: DEBUG verbosity, every record prefixed with the
# timestamp, the logger name and the level name.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Silence all warnings for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

In [5]:
# W2V
import spacy
import pymorphy2
from gensim.models import KeyedVectors
from gensim.models.phrases import Phrases
from nltk.stem.snowball import SnowballStemmer
from gensim.models.phrases import Phrases, npmi_scorer
from gensim.models import word2vec, keyedvectors

In [6]:
# Keras NN
%load_ext tensorboard
!rm -rf ./logs/

import tensorflow as tf
print(tf.__version__)
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam, SGD, schedules
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers

from tensorflow.python.client import device_lib

# Keras Addons
import tensorflow_addons as tfa
from tensorflow_addons.losses import metric_learning
from tensorflow_addons.utils.types import FloatTensorLike, TensorLike

"rm" не является внутренней или внешней
командой, исполняемой программой или пакетным файлом.
I0721 15:20:22.686534 101160 tpu_cluster_resolver.py:35] Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.


Couldn't import dot_parser, loading of dot files will not be possible.
2.3.0


In [7]:
import logging

# `logging.basicConfig` does nothing once the root logger already has handlers,
# and the import cell at the top of the notebook has already called it (DEBUG
# level, long format). Drop the existing root handlers first so that this
# INFO-level configuration is really applied.
_root_logger = logging.getLogger()
for _handler in list(_root_logger.handlers):
    _root_logger.removeHandler(_handler)

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO)

### System & Devices

In [8]:
# Report CPU and RAM figures for the current machine; byte counts are
# divided by 1048576 to express them in MB.
_BYTES_PER_MB = 1048576

print(f"CPU count: {psutil.cpu_count()}")
print(f"CPU utilization: {psutil.cpu_percent()}%")
mem_stats = psutil.virtual_memory()
for _title, _n_bytes in (("total", mem_stats.total),
                         ("available", mem_stats.available),
                         ("used", mem_stats.used)):
    print(f"Memory {_title}: {_n_bytes / _BYTES_PER_MB} MB.")

CPU count: 12
CPU utilization: 13.7%
Memory total: 16235.328125 MB.
Memory available: 7668.484375 MB.
Memory used: 8566.84375 MB.


In [9]:
def get_available_devices():
    """Return the `name` of every device listed by `device_lib.list_local_devices()`."""
    return [device.name for device in device_lib.list_local_devices()]
# Print the device names, then display the full device descriptions
# as the cell output.
available_devices = get_available_devices()
print(available_devices)
device_lib.list_local_devices()

['/device:CPU:0', '/device:XLA_CPU:0', '/device:XLA_GPU:0']


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 7232031686440726178,
 name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 11055582075212373904
 physical_device_desc: "device: XLA_CPU device",
 name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 589692531033251638
 physical_device_desc: "device: XLA_GPU device"]

### Define paths


In [10]:
# Directory layout; every path is relative to the parent of the working directory.
BASE_DIR = Path("..")
DATA_DIR = BASE_DIR.joinpath("data")          # input data
MODEL_DIR = BASE_DIR.joinpath("models")       # model files
CHKP_DIR = MODEL_DIR.joinpath("chkp_dir")     # sub-directory of MODEL_DIR

## Load data

In [7]:
def load_vectors_data(start_date: str, end_date: str, is_prediction: Tuple[bool], 
                      last_updated: str, maxnum: int=1_000_000):
    """
    Load vectorized client requests from the `ds70.request_messages` table.

    Only rows with a non-null `client_msg_vector` are returned.

    Connection settings are read from the standard libpq environment
    variables (PGDATABASE, PGUSER, PGHOST, PGPORT, PGPASSWORD). The password
    is deliberately NOT stored in the notebook: export PGPASSWORD (or use a
    `~/.pgpass` file) before running this cell.

    Args:
        start_date: lower bound (inclusive) for `req_reg_datetime`.
        end_date: upper bound (inclusive) for `req_reg_datetime`.
        is_prediction: allowed values of the `is_prediction` column,
            e.g. `(True, False)`. Must not be empty.
        last_updated: keep rows with `update_datetime >= last_updated`.
        maxnum: maximum number of rows to fetch.

    Returns:
        pandas DataFrame with one row per request and the columns listed
        in `columns` below.

    Raises:
        ValueError: if `is_prediction` is empty.
        psycopg2.Error: if connecting or querying fails (the transaction
            is rolled back and the connection closed before re-raising).
    """
    is_prediction = tuple(is_prediction)
    if not is_prediction:
        raise ValueError("is_prediction must contain at least one value.")

    columns = ["req_num",
               "req_reg_datetime",
               "msg",
               "product",
               "subject",
               "s_subject",
               "subproduct",
               "client_msg_vector",
               "update_datetime"]

    # Values are passed as query parameters instead of being formatted into
    # the SQL text: psycopg2 quotes them safely and adapts a Python tuple to
    # an SQL list, so a one-element `is_prediction` tuple works as well.
    query = """
        select
        req_num, 
        req_reg_datetime, 
        msg,
        product, 
        subject,
        s_subject, 
        subproduct,
        client_msg_vector,
        update_datetime
        from ds70.request_messages
        where client_msg_vector is not null
        and is_prediction in %s
        and req_reg_datetime between %s and %s
        and update_datetime >= %s
        limit %s;"""
    # int(): a float such as 1e6 must not reach the LIMIT clause as "1000000.0".
    params = (is_prediction, start_date, end_date, last_updated, int(maxnum))

    print(f"Creating connection to DB...")
    connection = psycopg2.connect(
        dbname=os.environ.get("PGDATABASE", "pgarmat"),
        user=os.environ.get("PGUSER", "postgres"),
        host=os.environ.get("PGHOST", "tklis-sm0000002.vm.mos.cloud.sbrf.ru"),
        port=os.environ.get("PGPORT", "5432"),
        # psycopg2 drops None-valued arguments, so libpq can still fall back
        # to ~/.pgpass when PGPASSWORD is not set.
        password=os.environ.get("PGPASSWORD"),
    )
    print(f"Connection created.")

    try:
        with connection.cursor() as cursor:
            cursor.execute(query, params)
            requests = cursor.fetchall()
    except Exception:
        # Do not continue with undefined results: undo the transaction and
        # let the caller see the original error.
        connection.rollback()
        raise
    finally:
        connection.close()

    data = pd.DataFrame(requests, columns=columns)
    print(f"Loaded data shape: {data.shape} and size: {sys.getsizeof(data) / 1048576} MB.")
    return data

In [None]:
# Fetch at most 600 000 requests registered between 1 and 10 February 2021,
# for both values of `is_prediction`.
df = load_vectors_data(
    start_date='2021-02-01 0:00:01',
    end_date='2021-02-10 23:59:59',
    is_prediction=(True, False),
    last_updated='2021-07-19',
    maxnum=600_000,
)

### Configure target values

In [None]:
# The target label is the text "<product> : <s_subject>".
df['label_name'] = df['product'].add(" : ").add(df['s_subject'])
n_unique_labels = df['label_name'].nunique()
print(n_unique_labels)

#### Select most frequent classes

In [None]:
# Count requests per label. In the resulting frame:
#   - `label_name_name` holds the label text,
#   - `label_name` holds the number of requests with that label.
# Both column names are set explicitly (`rename_axis` + `reset_index(name=...)`)
# instead of relying on pandas' default names, which changed in pandas 2.0
# (older: `index` / `label_name`; newer: `label_name` / `count`).
freq_labels = (
    df['label_name']
    .value_counts()
    .rename_axis("label_name_name")
    .reset_index(name="label_name")
)
print(freq_labels.describe())

freq_labels.head(10)

In [None]:
# Keep only the labels that occur in more than `n_freq_labels` requests
# (in `freq_labels` the `label_name` column holds the counts and
# `label_name_name` holds the label text).
n_freq_labels = 50

is_frequent = freq_labels["label_name"] > n_freq_labels
freq_labels = freq_labels[is_frequent]
print("Lables left: ", freq_labels.shape[0])
freq_labels_list = freq_labels["label_name_name"].tolist()

# Drop every request whose label did not pass the frequency filter.
has_frequent_label = df["label_name"].isin(freq_labels_list)
df = df[has_frequent_label]
print(f"Left {len(df)} requests.")

### Train/test split

In [None]:
# Two stratified splits with the same seed: first 30% of the data is held out,
# then 40% of that hold-out becomes the validation set and the rest the test set.
holdout_share, val_share, split_seed = 0.3, 0.4, 11

train_df, holdout_df = train_test_split(
    df,
    test_size=holdout_share,
    random_state=split_seed,
    stratify=df.label_name,
)
test_df, val_df = train_test_split(
    holdout_df,
    test_size=val_share,
    random_state=split_seed,
    stratify=holdout_df.label_name,
)
print(f"Train data: {train_df.shape[0]} test data: {test_df.shape[0]}, val data: {val_df.shape[0]}")

## Vectorizing

In [None]:
class DataVectorizer:
    """
    Converts tokenized documents into sequences of Word2Vec vocabulary ids.

    Each document is first passed through the phrase model (if one is given)
    and then every token known to the Word2Vec vocabulary is replaced by its
    vocabulary index shifted by one, so that id 0 is never produced for a token.
    """

    def __init__(self, phrase_model: Phrases = None,
                 w2v_vectorizer: Optional[KeyedVectors] = None):
        """
        Need to load pre-trained models from files.

        Args:
            phrase_model: pre-trained phrase model; when None, documents are
                used as they are, without phrase detection.
            w2v_vectorizer: pre-trained Word2Vec vectors (required).

        Raises:
            ValueError: if `w2v_vectorizer` is None.
        """
        if w2v_vectorizer is None:
            # Fail early with a clear message instead of an AttributeError
            # on `None.vectors` below.
            raise ValueError("w2v_vectorizer is required; use DataVectorizer.load() "
                             "to build the vectorizer from saved model files.")
        self._w2v = w2v_vectorizer
        self._phrase_model = phrase_model
        self._vocab_size, self._embedding_size = self._w2v.vectors.shape


    @classmethod
    def load(cls, save_path: str, phrases_fn: str, w2v_fn: str) -> "DataVectorizer":
        """
        Load pre-trained models from files and init.

        Args:
            save_path: directory that contains both model files.
            phrases_fn: file name of the saved phrase model.
            w2v_fn: file name of the Word2Vec vectors in text
                (non-binary) word2vec format.
        """
        phrase_model = Phrases.load(os.path.join(save_path, phrases_fn))
        w2v_vectorizer = KeyedVectors.load_word2vec_format(os.path.join(save_path, w2v_fn), binary=False)
        return cls(phrase_model=phrase_model,
                   w2v_vectorizer=w2v_vectorizer)


    def _word2id(self, token_seq: List[List[str]], max_len: int) -> List[List[int]]:
        """
        Convert token to id in W2V vocabulary.

        Tokens missing from the vocabulary are skipped. Ids are the vocabulary
        indexes plus one. The sequences are returned unpadded; `max_len` is
        accepted for interface compatibility and is not used here.
        """
        id_seq = [[self._w2v.get_index(tok) + 1 for tok in doc if self._w2v.has_index_for(tok)]
                  for doc in tqdm(token_seq)]
        return id_seq


    def vectorize(self, data: List[List[str]], max_len: int) -> List[List[int]]:
        """
        Vectorize data:
            - merge tokens into phrases with the phrase model (when available)
            - encode words as ids

        Args:
            data: documents, each one a sequence of tokens.
            max_len: forwarded to `_word2id`, which currently does not use it.

        Returns:
            One list of ids per input document; an empty document yields
            an empty list.
        """
        # An empty document maps straight to an empty token list. Previously
        # it was replaced by `max_len` float zeros, none of which is in the
        # vocabulary, so the result was an empty id list as well.
        if self._phrase_model is None:
            docs = [d if len(d) > 0 else [] for d in data]
        else:
            docs = [self._phrase_model[d] if len(d) > 0 else [] for d in data]
        return self._word2id(docs, max_len)

    def get_vectors(self):
        """Return the Word2Vec embedding matrix (`vectors` of the loaded model)."""
        return self._w2v.vectors
    
    
# Build the vectorizer from the phrase and Word2Vec files stored in MODEL_DIR.
dvect = DataVectorizer.load(
    save_path=str(MODEL_DIR),
    phrases_fn="phrases_gensim_lemmas_12_07_2021.pkl",
    w2v_fn='word2vec_gensim_lemmas_12_07_2021',
)

### Prepare data for training

In [None]:
# Fit the encoder on the training labels only, then add the integer `label`
# column to each of the three splits.
le = LabelEncoder()
le.fit(train_df.label_name)
for split_df in (train_df, test_df, val_df):
    split_df['label'] = le.transform(split_df.label_name)

In [None]:
MAX_SEQ_LEN = 150   # every id sequence is padded / truncated to this length
MIN_TOKENS = 20     # requests with fewer non-zero ids than this are dropped


def _prepare_split(split_df, max_len=MAX_SEQ_LEN, min_tokens=MIN_TOKENS):
    """
    Turn one data split into padded id vectors and matching labels.

    Args:
        split_df: frame with the `client_msg_vector` and `label` columns.
        max_len: length every sequence is post-padded (with 0) or
            post-truncated to.
        min_tokens: minimum number of non-zero ids a padded vector needs
            in order to be kept.

    Returns:
        (vectors, labels, indexes): the kept padded vectors as a 2-D array,
        their labels, and the positions of the kept rows within the split
        as a list of ints.
    """
    labels = split_df['label'].values
    vectors = pad_sequences(split_df['client_msg_vector'].to_list(), value=0,
                            maxlen=max_len, padding='post', truncating='post')
    # Row-wise count of non-zero ids, computed in one vectorized call.
    keep = np.flatnonzero(np.count_nonzero(vectors, axis=1) >= min_tokens)
    # `vectors[keep]` is already a 2-D array, so no extra stacking is needed
    # (and an empty selection no longer raises).
    return vectors[keep], labels[keep], keep.tolist()


train_vectors, train_labels, train_indexes = _prepare_split(train_df)
test_vectors, test_labels, test_indexes = _prepare_split(test_df)
val_vectors, val_labels, val_indexes = _prepare_split(val_df)

## Model architectures

### Transformer Model

In [11]:
class TokenAndPositionEmbedding(layers.Layer):
    """
    Sum of two separate embeddings: one looked up by token id and one looked
    up by the token's position in the sequence.
    """
    def __init__(self, maxlen: int, vocab_size: int, 
                 embedding_size: int, pretrained_weights: np.ndarray):
        super().__init__()
        # The token table is initialised with the pretrained matrix preceded
        # by one all-zero row, hence `vocab_size + 1` rows in total.
        zero_row = np.zeros((1, embedding_size), dtype=np.float32)
        initial_table = np.vstack((zero_row, pretrained_weights))
        self._token_emb = layers.Embedding(
            input_dim=vocab_size + 1,
            output_dim=embedding_size,
            weights=[initial_table],
            input_length=maxlen,
            mask_zero=True,
            trainable=True,
        )
        # One embedding vector per position 0 .. maxlen - 1.
        self._pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embedding_size)

    def call(self, x):
        """Return token embeddings of `x` plus the embeddings of their positions."""
        # Positions are generated from the size of the last axis of the input.
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self._pos_emb(position_ids)
        token_vectors = self._token_emb(x)
        return token_vectors + position_vectors

In [12]:
class TransformerBlock(layers.Layer):
    """
    Self-attention block: multi-head attention and a two-layer feed-forward
    network, each followed by dropout, a residual connection and layer
    normalization.

    NOTE: `layers.MultiHeadAttention` is missing in TensorFlow 2.3 (the
    recorded run of this notebook fails on it with an AttributeError);
    it was added to Keras in TensorFlow 2.4.
    """

    def __init__(self, embedding_size: int, num_heads: int, out_dim: int, dropout=0.5):
        """
        Args:
            embedding_size: size of the input embeddings; also used as
                `key_dim` of the attention heads and as the output size of
                the feed-forward network.
            num_heads: number of attention heads.
            out_dim: size of the hidden layer of the feed-forward network.
            dropout: rate of both dropout layers.
        """
        super(TransformerBlock, self).__init__()
        # If query, key, value are the same, then this is self-attention. 
        # Each timestep in query attends to the corresponding sequence in key, and returns a fixed-width vector.
        self._att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_size)
        self._ffn = Sequential([
             layers.Dense(out_dim, activation="relu"), 
             layers.Dense(embedding_size)
        ])
        self._layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self._layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self._dropout1 = layers.Dropout(dropout)
        self._dropout2 = layers.Dropout(dropout)

    def call(self, inputs, is_training: bool = None):
        """
        Args:
            inputs: embedded sequence; used as both query and value.
            is_training: forwarded to the dropout layers as `training`.
                Defaults to None so that the block can be called without it
                (as the model-building code does); the dropout layers then
                decide on their own whether they run in training mode.

        Returns:
            Tuple `(block_output, attn_output)`, where `attn_output` is the
            attention output after dropout.
        """
        # query, value, [key] optional == value
        # attention_output of shape [B, T, E] (if return_attention_scores=False)
        attn_output = self._att(inputs, inputs)
        attn_output = self._dropout1(attn_output, training=is_training)
        out1 = self._layernorm1(inputs + attn_output)
        ffn_output = self._ffn(out1)
        ffn_output = self._dropout2(ffn_output, training=is_training)
        return self._layernorm2(out1 + ffn_output), attn_output

In [18]:
class TransformerModel():
    """
    Text encoder: token + position embedding, one transformer block, average
    pooling, a dense layer and L2 normalization of the output vector.

    Two Keras models sharing the same layers are built:
        - `_model`: input ids -> normalized text embedding;
        - `_attention_model`: input ids -> [normalized text embedding,
          attention output of the transformer block].
    """

    def __init__(self, num_heads: int,
                 input_length: int, embedding_size: int, 
                 inner_dims: int, out_dims: int, vocab_size: int,
                 pretrained_weights: np.ndarray, dropout: float=0.5, 
                 pretrained=False, model_weights_path='./model.hdf5'):
        """
        Args:
            num_heads: number of attention heads.
            input_length: number of tokens in text (~150).
            embedding_size: embedding size of each token.
            inner_dims: hidden layer size of the feed-forward network inside
                the transformer block.
            out_dims: size of the output text embedding vector.
            vocab_size: embeddings vocabulary size.
            pretrained_weights: embedding matrix of shape
                (vocab_size, embedding_size).
            dropout: dropout rate inside the transformer block.
            pretrained: when True, load weights for `_model` from
                `model_weights_path` right after the network is built.
            model_weights_path: path of the weights file.

        Raises:
            ValueError: if the shape of `pretrained_weights` does not match
                `vocab_size` / `embedding_size`.
        """
        # Explicit checks: the former `assert(condition, message)` asserted a
        # non-empty tuple, which is always true, so nothing was ever verified.
        if pretrained_weights.shape[0] != vocab_size:
            raise ValueError("Pretrained weights size and vocabulary size should be equal!")
        if pretrained_weights.shape[1] != embedding_size:
            raise ValueError("Pretrained weights size and embedding size should be equal!")

        self._num_heads = num_heads  # Number of attention heads
        self._input_length = input_length  # Number of tokens in text (~150)
        self._pretrained_weights = pretrained_weights
        self._vocab_size = vocab_size  # Embeddings vocab size
        self._embedding_size = embedding_size  # Embedding size for each token
        self._out_dims = out_dims  # Output text embedding vector size
        self._inner_dims = inner_dims # Hidden layer size in feed forward network inside transformer
        self._dropout = dropout
        self._model, self._attention_model = self.create_network()
        if pretrained:
            # The loader is called `load_pretrained_model`; the previously
            # referenced `load_pretrained` does not exist.
            self.load_pretrained_model(model_weights_path)

    def create_network(self):
        """
        Build the network.

        Returns:
            Tuple `(model, att_model)`, see the class docstring.
        """
        input_layer = layers.Input((self._input_length,))
        embedding_layer = TokenAndPositionEmbedding(maxlen=self._input_length, 
                                                    vocab_size=self._vocab_size, 
                                                    embedding_size=self._embedding_size,
                                                    pretrained_weights=self._pretrained_weights)(input_layer)
        transformer_block = TransformerBlock(embedding_size=self._embedding_size, 
                                             num_heads=self._num_heads, 
                                             out_dim=self._inner_dims, 
                                             dropout=self._dropout)
        # `is_training` is passed explicitly because the block's `call`
        # takes it as an argument; None leaves the training/inference
        # decision to the dropout layers themselves.
        transformer_out, transformer_att = transformer_block(embedding_layer, is_training=None)
        avgpool = layers.GlobalAveragePooling1D()(transformer_out)
        drop = layers.Dropout(0.1)(avgpool)
        dense = layers.Dense(self._out_dims, activation="relu")(drop)
        # `layers.Lambda`: a bare `Lambda` name is not imported in this notebook.
        output_layer = layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(dense)
        att_model = Model(input_layer, [output_layer, transformer_att])
        model = Model(input_layer, output_layer)
        return model, att_model

    def load_pretrained_model(self, model_weights_path: str):
        """Load weights from `model_weights_path` into the embedding model."""
        self._model.load_weights(model_weights_path)

    def load_pretrained_attention_model(self, model_weights_path: str):
        """Load weights from `model_weights_path` into the attention model."""
        self._attention_model.load_weights(model_weights_path)

In [19]:
# Build the model with placeholder settings: an all-zero 10 000 x 200
# embedding matrix is used in place of the values from `dvect` (noted in the
# trailing comments), and no saved weights are loaded (`pretrained=False`).
placeholder_weights = np.zeros((10_000, 200), dtype=float)  #dvect.get_vectors()
weights_file = str(MODEL_DIR / "attention_lstm_50dim_13-07.h5")

model = TransformerModel(
    num_heads=2,
    input_length=150,
    embedding_size=200,  #dvect._embedding_size
    inner_dims=128,
    out_dims=50,
    vocab_size=10_000,  #dvect._vocab_size
    pretrained_weights=placeholder_weights,
    dropout=0.6,
    pretrained=False,
    model_weights_path=weights_file,
)

model._attention_model.summary()

AttributeError: module 'tensorflow.keras.layers' has no attribute 'MultiHeadAttention'