In [None]:
import os
import sys
# Append the parent of the current working directory to the import path
# (presumably so that project packages located next to the notebook folder resolve).
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)

In [None]:

import analyser.hyperparams
from analyser.hyperparams import work_dir 
from analyser.hyperparams import HyperParameters
 

# Resolve the working directory: the GPN_WORK_DIR environment variable wins,
# otherwise fall back to a `work` directory located relative to hyperparams.py
# (three `..` components up from the module file).
work_dir_default = os.path.realpath(os.path.join(analyser.hyperparams.__file__, '..', '..', '..', 'work'))
work_dir = os.environ.get('GPN_WORK_DIR', work_dir_default)
print('work_dir=', work_dir)

# makedirs(exist_ok=True) creates missing parent directories as well and does not
# fail if the directory appears between a check and the creation (os.mkdir would).
os.makedirs(work_dir, exist_ok=True)

# Publish the resolved path on the package module so code reading
# analyser.hyperparams.work_dir sees the same value.
analyser.hyperparams.work_dir = work_dir

assert os.path.isdir(analyser.hyperparams.work_dir)

# Imports

In [None]:
%matplotlib inline
import tensorflow as tf
print(tf.__version__)

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML

from colab_support.renderer import *
# from tensorflow_docs import plots

import random

import pickle
import numpy as np
import pandas as pd
from analyser.legal_docs import LegalDocument, make_headline_attention_vector
from analyser.headers_detector import make_predicted_headline_attention_vector
import math

import os
from os import path
from trainsets.trainset_tools import TrainsetBalancer, SubjectTrainsetManager
 

from trainsets.retrain_contract_uber_model import DbJsonDoc, UberModelTrainsetManager
from tf_support.super_contract_model import get_base_model


from keras.utils.vis_utils import model_to_dot
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from IPython.display import SVG
from keras.models import load_model
import warnings

from tensorflow import keras
from keras.layers import Conv1D, LSTM, GRU, BatchNormalization, TimeDistributed, Dense, Bidirectional, Input, Dropout, Lambda
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from keras.models import Sequential, Model
from keras.layers import concatenate, SpatialDropout1D, ActivityRegularization
from keras.layers import MaxPooling1D, Activation, ThresholdedReLU, GaussianNoise
from keras.utils import plot_model
from keras.preprocessing.sequence import pad_sequences

from tf_support.tools import KerasTrainingContext


import keras.backend as K



# Prepare trainset


In [None]:
# Create the trainset manager on top of the working directory configured above.
umtm = UberModelTrainsetManager ( analyser.hyperparams.work_dir)

# NOTE(review): the three calls below are project methods not shown in this notebook;
# judging by their names they import contracts, compute per-sample weights and
# validate the resulting trainset — confirm in trainsets.retrain_contract_uber_model.
umtm.import_recent_contracts()
umtm.calculate_samples_weights()
umtm.validate_trainset()

In [None]:
# Remove big docs from the trainset: keep rows whose value_span is missing
# or smaller than 10000.
# `|` is the element-wise boolean OR for pandas Series; `+` on boolean masks only
# works by accident and can trigger a numexpr warning on large frames.
_keep_mask = pd.isna(umtm.stats.value_span) | (umtm.stats.value_span < 10000)
umtm.stats = umtm.stats[_keep_mask]
umtm.stats

In [None]:
%matplotlib inline

# Number of trainset rows per subject.
subj_count = umtm.stats['subject'].value_counts()

#plot distribution---------------------
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError on current releases.
sns.barplot(x=subj_count.values, y=subj_count.index)
plt.title('Frequency Distribution of subjects')
plt.xlabel('Number of Occurrences')
plt.show()


# Size of the rarest / most frequent subject and the total number of rows counted.
print ('\nmin', subj_count.min())
print ('max', subj_count.max())
print ('total', subj_count.sum())

In [None]:
from sklearn.utils import class_weight
from trainsets.trainset_tools import get_feature_log_weights

# Distinct subject labels present in the (filtered) trainset statistics.
_classes = umtm.stats['subject'].unique().tolist()

print(f'classes: {_classes}')

# Disabled alternative: sklearn's 'balanced' class weights.
# class_weights = class_weight.compute_class_weight('balanced',
#                                                 _classes,
#                                                 umtm.stats['subject'])
# class_weights = dict(zip(_classes, class_weights))
# class_weights

# Per-subject weights computed by the project helper from the 'subject' column.
# NOTE(review): the name suggests log-scaled frequency weights — confirm in trainsets.trainset_tools.
class_weights = get_feature_log_weights(umtm.stats, 'subject')
class_weights

In [None]:
from trainsets.trainset_tools import get_feature_log_weights

def calculate_samples_weights(self):
  """Recompute per-document training weights and store them in ``self.stats``.

  ``self.stats`` is reloaded via ``self.load_contract_trainset_meta()``, so any
  filtering previously applied to ``self.stats`` is replaced by the reloaded frame.

  For every row two columns are written:

  * ``sample_weight``  -- tagging weight (10x for rows with a
    ``user_correction_date``) multiplied by the value weight
    (``value_log1p``, or the column median when it is missing);
  * ``subject_weight`` -- ``sample_weight`` (before normalisation) multiplied
    by the weight of the row's subject.

  Both columns are divided by their mean afterwards, and the frame is persisted
  with ``self._save_stats()``.

  :param self: trainset manager providing ``load_contract_trainset_meta``,
    ``stats`` and ``_save_stats``.
  """
  self.stats: pd.DataFrame = self.load_contract_trainset_meta()
  subject_weights = get_feature_log_weights(self.stats, 'subject')

  value_median = self.stats.value_log1p.median()

  for i, row in self.stats.iterrows():
    subj_name = row['subject']

    tagging_weight = 1.0
    if not pd.isna(row['user_correction_date']):  # MORE weight for user-corrected datapoints
      tagging_weight = 10.0  # TODO: must be estimated anyhow smartly

    value_weight = value_median
    if not pd.isna(row['value_log1p']):
      # the weight is proportional to the logarithm of the contract value,
      # so that there are fewer errors on high-value contracts
      value_weight = row['value_log1p']

    tagging_weight *= value_weight
    # Use the weights computed from the frame loaded above. A notebook-level
    # `class_weights` dict may be built from a different (filtered) frame and
    # can miss subjects that are present here.
    subject_weight = tagging_weight * subject_weights[subj_name]
    self.stats.at[i, 'subject_weight'] = subject_weight
    self.stats.at[i, 'sample_weight']  = tagging_weight

  # normalize weights, so the sum == Number of samples
  self.stats.sample_weight /= self.stats.sample_weight.mean()
  self.stats.subject_weight /= self.stats.subject_weight.mean()

  self._save_stats()

# Run the notebook-local version of the weighting routine against the manager;
# it rewrites umtm.stats and persists it.
calculate_samples_weights(umtm)

plt.figure(figsize=(13, 6))

# Both histograms are drawn onto the same axes for comparison.
umtm.stats['subject_weight'].hist(bins=20)
umtm.stats['sample_weight'].hist(bins=20)

plt.xscale('linear') # log?
plt.show()

In [None]:
# Joint distribution of the two weight columns written by calculate_samples_weights.
sns.jointplot(x="subject_weight", y="sample_weight", data=umtm.stats )
plt.show()


### look into trainset

In [None]:
# umtm.calculate_samples_weights()
# Take the first document of the trainset as an example and build its datapoint.
SAMPLE_DOC_ID = umtm.stats.index[0]
print('SAMPLE_DOC_ID', SAMPLE_DOC_ID)
dp = umtm.make_xyw(SAMPLE_DOC_ID)
# A datapoint is a triple of pairs: inputs, targets and weights.
# NOTE(review): emb / tok_f / sm are presumably per-token arrays and subj a subject encoding — confirm in make_xyw.
(emb, tok_f), (sm, subj), (sample_weight, subject_weight) = dp
 

In [None]:
%matplotlib inline

# Plot the first 500 rows of each component of the sample datapoint.
_sample_views = (
  (tok_f, f'Tokens features {SAMPLE_DOC_ID}'),
  (emb, f'Embedding {SAMPLE_DOC_ID}'),
  (sm, f'Semantic map {SAMPLE_DOC_ID}'),
)
for _matrix, _caption in _sample_views:
  plot_embedding(_matrix[:500], title=_caption)

## Batch generator & TODOs 🙏


- [X] TODO: add outliers to the trainset ?
- [ ] TODO: try the `sparse_categorical_crossentropy` loss (integer labels) instead of one-hot encodings
- [ ] TODO: model 5.2, 5.1: bipolar concat layer is wrong because we concatenate things of different magnitudes. Add a Sigmoid activation layer
- [ ] TODO: check which is better: to pad with zeros or to pad with means
- [X] TODO: add weights to samples
- [ ] TODO: sum the semantic map along the vertical axis, and multiply it (as a mask) by the subject detection seq

In [None]:
def make_generator(self, indices: [int], batch_size: int, augment_samples=False):
  """Endless batch generator for training / evaluating the model.

  Every batch is built from ``batch_size`` document ids drawn (with replacement)
  from ``indices`` with ``np.random.choice``; the NumPy global seed is reset to 42
  when the generator starts running.

  :param self: trainset manager providing ``make_xyw(doc_id)``,
    ``trim_maxlen(datapoint, start_from, max_len)`` and a ``stats`` frame
    with a ``value_span`` column.
  :param indices: document ids to sample from.
  :param batch_size: number of documents per batch.
  :param augment_samples: when true, the window length is random per batch
    (300..1400) and about half of the documents get a random window start
    together with a 10x lower subject weight.
  :return: generator yielding ``([emb, token_features], [semantic_map, subject],
    [sample_weights, subject_weights])`` where every item is a ``np.array``.
  """

  def _augmented_window(doc_id, n_tokens, window_len):
    # Returns (window start, subject weight factor) for one augmented document.
    row = self.stats.loc[doc_id]
    if random.randint(1, 2) != 1:  # the other 50% of samples start at 0
      return 0, 1.0

    center = random.randint(0, n_tokens - 1)  # random token as the window center
    if not pd.isna(row['value_span']) and random.random() < 0.7:
      center = int(row['value_span'])

    offset = random.randint(window_len // 4, window_len // 2)
    # lower subject weight because there might be no information about
    # the subject around the doc. value
    return max(center - offset, 0), 0.1

  np.random.seed(42)

  while True:
    # next batch
    chosen_ids = np.random.choice(a=indices, size=batch_size)

    max_len = random.randint(300, 1400) if augment_samples else 128 * 12

    inputs_emb, inputs_token_f = [], []
    outputs_sm, outputs_subj = [], []
    sample_weights, subject_weights = [], []

    # Read in each input, perform preprocessing and get labels
    for doc_id in chosen_ids:
      datapoint = self.make_xyw(doc_id)
      (doc_emb, _tok), (_sm, _subj), (_sw, _subjw) = datapoint

      start_from, subject_weight_k = 0, 1.0
      if augment_samples:
        start_from, subject_weight_k = _augmented_window(doc_id, len(doc_emb), max_len)

      # TODO: find samples maxlen
      trimmed = self.trim_maxlen(datapoint, start_from, max_len)
      (emb, tok_f), (sm, subj), (sample_weight, subject_weight) = trimmed
      subject_weight *= subject_weight_k

      inputs_emb.append(emb)
      inputs_token_f.append(tok_f)
      outputs_sm.append(sm)
      outputs_subj.append(subj)
      sample_weights.append(sample_weight)
      subject_weights.append(subject_weight)

    # Return a tuple of (input, output, weights) to feed the network
    yield ([np.array(inputs_emb), np.array(inputs_token_f)],
           [np.array(outputs_sm), np.array(outputs_subj)],
           [np.array(sample_weights), np.array(subject_weights)])

In [None]:
EPOCHS = 50
BATCH_SIZE = 24
EMB =  1024
 
_SELFTEST = True



_train, _test = train_test_split(umtm.stats, test_size=0.2, stratify=umtm.stats[['subject']])
train_indices = _train.index
test_indices = _test.index

# train_indices, test_indices = split_trainset_evenly(umtm.stats, 'subject', seed=5)
print('train_indices[0]:', train_indices[0])
print('test_indices[0]:', test_indices[0])

%config InlineBackend.figure_format='retina'

def plot_subject_distr(df, title):
  """Draw a horizontal count plot of the ``subject`` column of ``df``.

  :param df: DataFrame with a ``subject`` column.
  :param title: suffix appended to the figure title (e.g. the subset name).
  """
  # Apply the theme before the figure is created, so the first figure drawn by
  # this function is styled the same way as the following ones.
  sns.set(style="whitegrid")
  fig, ax = plt.subplots(figsize=(16, 4))
  sns.countplot(data=df, y='subject', ax=ax)
  ax.set_title(f'Frequency Distribution of subjects :{title}')

 
# Compare the subject distribution of the whole trainset with both partitions.
plot_subject_distr(umtm.stats, 'ALL')
plot_subject_distr(umtm.stats[umtm.stats.index.isin(train_indices)], 'train')
plot_subject_distr(umtm.stats[umtm.stats.index.isin(test_indices)], 'test')


# Smoke test of the batch generator: draw one augmented batch and inspect it.
if _SELFTEST:
  # test_gen = make_generator(umtm, test_indices, BATCH_SIZE)
  train_gen = make_generator(umtm, train_indices, BATCH_SIZE, augment_samples=True)
  
  # x = [embeddings, token features], y = [semantic maps, subjects], w = [sample weights, subject weights]
  x, y, w = next(train_gen)
  
  print('X:', len(x), 'X[0]=', x[0].shape, 'X[1]=', x[1].shape)
  print('Y:', len(y), 'Y[0]=', y[0].shape, 'Y[1]=', y[1].shape)
  

  # First sample of the batch.
  plot_embedding(x[0][0], 'X2: Token Embeddings')
  plot_embedding(x[1][0], 'X1: Token Features')
  plot_embedding(y[0][0], 'Y: Semantic Map')
  
  # Semantic map of the second sample of the batch.
  print(y[0][1])

  # Drop the batch targets and the generator; x and w are intentionally left in scope.
  # del x
  del y
  del train_gen

In [None]:

# Training context bound to the manager's working directory.
# NOTE(review): meaning of session_index is defined in tf_support.tools — confirm.
ctx = KerasTrainingContext(umtm.work_dir, session_index=21)

# Arguments: batch size, number of test documents, 4x the number of train documents.
# NOTE(review): the 4x factor presumably accounts for augmentation — confirm against
# set_batch_size_and_trainset_size.
ctx.set_batch_size_and_trainset_size(BATCH_SIZE, 
                                     len(test_indices), 
                                     4 * len(train_indices))

DEFAULT_TRAIN_CTX = ctx
# Not referenced elsewhere in the visible part of this notebook.
CLASSES = 43
FEATURES = 14

metrics = ['kullback_leibler_divergence', 'mse', 'binary_crossentropy']


def make_all_generators():
  """Create the three batch generators used for evaluation.

  :return: ``(train_generator, test_generator, all_val_generator)`` where the
    train generator augments samples and the "all" generator draws from the
    union of test and train document ids.
  """
  # `+` on two pandas Index objects is element-wise (and fails for different
  # lengths), so the ids are concatenated as plain lists instead.
  all_indices = list(test_indices) + list(train_indices)

  all_val_generator = make_generator(umtm, all_indices, BATCH_SIZE)
  test_generator = make_generator(umtm, test_indices, BATCH_SIZE)
  train_generator = make_generator(umtm, train_indices, BATCH_SIZE, augment_samples=True)

  return train_generator, test_generator, all_val_generator

def train(umodel):
  """Train ``umodel`` on augmented train batches and evaluate it on test batches.

  Both generators are created fresh on every call and handed to the training context.
  """
  ctx.train_and_evaluate_model(
    umodel,
    generator=make_generator(umtm, train_indices, BATCH_SIZE, augment_samples=True),
    test_generator=make_generator(umtm, test_indices, BATCH_SIZE))

def overtrain(umodel):
  """Train and evaluate ``umodel`` on ALL documents (train and test ids together).

  The same set of ids feeds both generators, so the evaluation numbers are not
  a held-out estimate.
  """
  # `+` on two pandas Index objects is element-wise (and fails for different
  # lengths), so the ids are concatenated as plain lists instead.
  all_indices = list(train_indices) + list(test_indices)

  test_gen = make_generator(umtm, all_indices, BATCH_SIZE)
  train_gen = make_generator(umtm, all_indices, BATCH_SIZE, augment_samples=True)
  ctx.train_and_evaluate_model(umodel, generator=train_gen, test_generator=test_gen)



# Models 🦖

### 5.1.1 💕💕 uber_detection_model_005_1_1
```
0.0590: val_O1_tagging_kullback_leibler_divergence
0.0765: val_O1_tagging_kullback_leibler_divergence: 
0.0019: val_O1_tagging_mse
0.0315: val_O2_subject_loss
```

In [None]:
from tf_support.super_contract_model import uber_detection_model_005_1_1
# Build the model through the training context.
# NOTE(review): `trained=True` presumably makes init_model restore previously saved weights — confirm in KerasTrainingContext.
umodel = ctx.init_model( uber_detection_model_005_1_1, verbose=2, trained=True  )
# umodel = ctx.init_model( uber_detection_model_005_1_1, verbose=2, trained=True, weights_file_override='/content/uber_detection_model_005_1_1' )

# plot_model(umodel, show_shapes=True)

In [None]:
# Configure the context for an actual training run and start it.
ctx.EPOCHS = 21
ctx.EVALUATE_ONLY = False
#TODO: freeze bottom layers
train(umodel)

In [None]:
# ctx.EPOCHS = 300  
# ctx.EVALUATE_ONLY=False
# overtrain(umodel)

# Evaluate models

### training history

In [None]:
 
def plot_compare_models(models, metrics, title="metric/epoch"):
  """Plot validation metrics from the training logs of several models.

  For every model the log is obtained with ``ctx.get_log(model)``. For every
  requested metric the ``val_<metric>`` column (if present) is drawn twice in
  the model's colour: the raw curve (faint) and a gaussian rolling mean. The
  first two log rows are skipped.

  :param models: model names (keys understood by ``ctx.get_log``).
  :param metrics: metric names without the ``val_`` prefix.
  :param title: figure title.
  """
  if not models:
    # Nothing to compare; also avoids the division by zero below.
    print('cannot plot: no models given')
    return

  colorstep = 1.0 / len(models)
  plt.figure(figsize=(16, 6))

  has_lines = False
  for i, m in enumerate(models):
    data = ctx.get_log(m)
    if data is None:
      print('cannot plot')
      continue

    c = plt.cm.Dark2(i * colorstep)  # one colour per model
    for metric in metrics:
      key = "val_" + metric
      if key not in data:
        continue

      y = data[key][2:]
      plt.plot(y, label=f'{m} {key}', alpha=0.2, color=c)

      # smoothed curve: gaussian-weighted rolling mean over 4 points
      y_smooth = y.rolling(4, win_type='gaussian').mean(std=4)
      plt.plot(y_smooth, label=f'{m} {key}', color=c)
      has_lines = True

  # Build the legend once, and only when there is something to describe.
  if has_lines:
    plt.legend(loc='upper right')

  plt.title(title)
  plt.grid()
  plt.show()


# Names of all models known to the training context.
models = list(ctx.trained_models.keys())
# print(models)

# One chart per entry: (metrics without the "val_" prefix, chart title).
_history_charts = [
  (['loss'], 'Loss'),
  (['O1_tagging_kullback_leibler_divergence'], 'TAGS: Kullback Leibler divergence'),
  (['O1_tagging_mse'], 'TAGS: MSE'),
  (['O2_subject_kullback_leibler_divergence'], 'Subj: Kullback Leibler divergence'),
  (['O2_subject_mse'],  'Subjects: MSE'),
  (['O1_tagging_loss', 'O2_subject_loss'], 'Loss'),
]

for _chart_metrics, _chart_title in _history_charts:
  plot_compare_models(models, _chart_metrics, _chart_title)

### Evaluate recent model

In [None]:
# Evaluate the current model on 16 batches of every subset and collect the
# metrics into a table: one row per metric, one column per subset.
train_generator, test_generator, all_val_generator = make_all_generators()
print(umodel.name)
subsets=['all', 'test', 'train']
_subset_generators = [all_val_generator, test_generator, train_generator]

ev = pd.DataFrame()#(columns=umodel.metrics_names)
for _subset_name, _gen in zip(subsets, _subset_generators):
  _column = umodel.name+f"--{_subset_name}"
  evaluation = umodel.evaluate_generator(_gen, verbose=2, steps=16)
  for _metric_name, _score in zip(umodel.metrics_names, evaluation):
    ev.at[_metric_name, _column] = _score
ev

## Confusion matrices

In [None]:
from tf_support.tf_subject_model import decode_subj_prediction


def plot_cm(y_true, y_pred, figsize=(12, 12), title=None):
  """Draw an annotated confusion-matrix heatmap (row-normalised, in percent).

  Only the labels that occur in ``y_true`` are shown. Every non-empty cell is
  annotated with the percentage of its row and the absolute count; diagonal
  cells additionally show the row total.

  :param y_true: expected labels.
  :param y_pred: predicted labels, same length as ``y_true``.
  :param figsize: matplotlib figure size.
  :param title: figure title.
  """
  # Local import: the notebook's import cell brings in only classification_report
  # from sklearn.metrics, so confusion_matrix would otherwise have to leak in
  # through a star-import.
  from sklearn.metrics import confusion_matrix

  labels = np.unique(y_true)
  cm = confusion_matrix(y_true, y_pred, labels=labels)
  cm_sum = np.sum(cm, axis=1, keepdims=True)
  cm_perc = cm / cm_sum.astype(float) * 100
  annot = np.empty_like(cm).astype(str)
  nrows, ncols = cm.shape
  for i in range(nrows):
    for j in range(ncols):
      c = cm[i, j]
      p = cm_perc[i, j]
      if i == j:
        # cm_sum has shape (nrows, 1); take the scalar explicitly, because
        # formatting a 1-element array with %d is deprecated in recent NumPy.
        s = cm_sum[i, 0]
        annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
      elif c == 0:
        annot[i, j] = ''
      else:
        annot[i, j] = '%.1f%%\n%d' % (p, c)

  cm = pd.DataFrame(cm_perc, index=labels, columns=labels)
  cm.index.name = 'Actual'
  cm.columns.name = 'Predicted'
  fig, ax = plt.subplots(figsize=figsize)
  sns.heatmap(cm, cmap="YlGnBu", annot=annot, fmt='', ax=ax)
  plt.title(title)


def report_confusion_matrix(umodel, indices):
  """Predict the subject of every document in ``indices`` and report the errors.

  Draws a confusion matrix (``plot_cm``) and prints a classification report for
  the expected vs. predicted subject names.

  :param umodel: model whose second output is the subject prediction.
  :param indices: document ids understood by ``umtm.make_xyw``.
  :return: DataFrame indexed by document id with ``expected`` / ``predicted``
    columns, containing only the misclassified documents.
  """
  errors_report = pd.DataFrame()
  for _column in ('expected', 'predicted'):
    errors_report[_column] = ''

  all_expected, all_predicted = [], []

  for _id in indices:
    (embeddings, token_features), targets, _ = umtm.make_xyw(_id)

    # the model is fed one document at a time, so add a batch axis of size 1
    model_input = [np.expand_dims(embeddings, axis=0), np.expand_dims(token_features, axis=0)]
    prediction = umodel.predict(x=model_input, batch_size=1)

    expected = decode_subj_prediction(targets[1])[0]
    predicted = decode_subj_prediction(prediction[1][0])[0]

    all_expected.append(expected.name)
    all_predicted.append(predicted.name)

    if expected != predicted:
      for _column, _value in (('expected', expected), ('predicted', predicted)):
        errors_report.at[_id, _column] = _value

  plot_cm(all_expected, all_predicted, title=umodel.name)

  print(umodel.name)
  print(classification_report(all_expected, all_predicted, digits=3))

  return errors_report
 

# subset = umtm.stats[~pd.isna(umtm.stats['user_correction_date'])].sort_values('analyze_date')[:50] 
# Evaluate only on documents that have a user correction date, ordered by analysis date.
subset = umtm.stats[~pd.isna(umtm.stats['user_correction_date'])].sort_values('analyze_date')
errors_report = report_confusion_matrix(umodel, subset.index)

In [None]:
# Number of misclassified documents out of the evaluated subset, followed by the error table.
print(len(errors_report), 'wrong subjects of', len(subset))
errors_report

## Single doc eval

In [None]:
# from analyser.finalizer import get_doc_by_id

# a = get_doc_by_id('ObjectId(5dee80604ddc27bcf92dd88e)')
# print(a)

from integration.db import get_mongodb_connection
from bson.objectid import ObjectId

# Use the first trainset document as the example and build its datapoint.
SAMPLE_DOC_ID = umtm.stats.index[0]
print('SAMPLE_DOC_ID', SAMPLE_DOC_ID)
dp = umtm.make_xyw(SAMPLE_DOC_ID)
(emb, tok_f), (sm, subj), (sample_weight, subject_weight) = dp


# Fetch the raw document record for the same id from the `documents` collection.
print(f'fetching {SAMPLE_DOC_ID}')
db = get_mongodb_connection()
documents_collection = db['documents']
# print(documents_collection)
jdata =  documents_collection.find_one({'_id': ObjectId(SAMPLE_DOC_ID)})
if jdata is None:
  # find_one returns None when nothing matches; fail here with a clear message
  # instead of passing None on to DbJsonDoc.
  raise LookupError(f'document {SAMPLE_DOC_ID} not found in the documents collection')
jdoc = DbJsonDoc(jdata)

In [None]:
from integration.word_document_parser import join_paragraphs



def asLegalDoc(self):
  """Return the document produced by ``join_paragraphs`` from ``self.parse`` and ``self._id``.

  NOTE(review): the original annotates the result as ``LegalDocument`` — confirm
  against integration.word_document_parser.join_paragraphs.
  """
  return join_paragraphs(self.parse, self._id)

doc = asLegalDoc(jdoc)  # NOTE(review): notebook-local helper applied to the DB record; original author marked this line with '???'


In [None]:
from tf_support.super_contract_model import seq_labels_contract 
from tf_support.tf_subject_model import decode_subj_prediction

###############
# Run the model on the single sample document; a batch axis of size 1 is added to both inputs.
prediction = umodel.predict(   x=[  np.expand_dims(emb, axis=0), np.expand_dims(tok_f, axis=0)] , batch_size=1)
##############
print(len(prediction), umodel.name)
# First model output, first (only) batch item: the tagging prediction.
tagging = prediction[0][0]

# Second model output, first (only) batch item: the subject prediction.
subj_1hot = prediction[1][0]
print('Subject:', decode_subj_prediction(subj_1hot))


# One column per label in seq_labels_contract.
df = pd.DataFrame(tagging, columns=seq_labels_contract)

plot_embedding(df, title = f'Predictions of {umodel.name}')
# display(HTML(render_doc (doc, df)))


In [None]:
from analyser.text_tools import find_top_spans

# For every predicted label column: find spans with find_top_spans (threshold 0.3),
# print them and render them over the document.
for t in seq_labels_contract:
  spans = list( find_top_spans( df[t].values, threshold=0.3))  
  print(t.upper(), spans)
  display(HTML(render_slices(spans, doc, df[t].values)))


In [None]:
# Despite the name, this is half of the per-row maximum over all label columns, not a mean.
mean_ = df.values.max(-1)*0.5
print (mean_.shape)
# Colour the document tokens by that per-token value.
display(HTML( to_color_text (doc.tokens,  mean_)))