# Imports

In [None]:
# Id of the sample document analysed in this notebook; further down it is
# converted to a bson ObjectId and fetched with get_doc_by_id().
TEST_DOC_ID = '612cdb1dea1085618e02fee3'

In [None]:
import logging
import platform
import sys
import os
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
from datetime import datetime
import tensorflow as tf

# Notebook-local logger.
#
# logging.getLogger() returns the same logger object for the same name, and
# notebook cells are routinely re-executed. The stream handler is therefore
# attached only when the logger has no handler yet; otherwise every re-run
# would add one more handler and each message would be emitted several times.
logger = logging.getLogger('retrain_ipynb')
logger.setLevel(logging.DEBUG)

if not logger.handlers:
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(levelname)s - %(asctime)s - %(name)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

logger.debug('--=logging started=--')

# Report the runtime environment: TensorFlow version first, then the CPU
# name as reported by the platform module.
CPU = platform.processor()
print(tf.__version__)
print(f'Running on CPU:{CPU}')

In [None]:
# Put the parent of the current working directory on the import path
# (once), so that packages living next to the notebook folder can be imported.
nb_dir = os.path.dirname(os.getcwd())

already_importable = nb_dir in sys.path
if not already_importable:
    sys.path.append(nb_dir)

import analyser.hyperparams 
# Bare expression: the notebook displays the file path of the imported module,
# which shows which copy of the package was picked up from sys.path.
analyser.hyperparams.__file__

### Imports...

In [None]:
%matplotlib inline

import pandas as pd
from pandas import DataFrame

from analyser.finalizer import get_doc_by_id
from analyser.persistence import DbJsonDoc
from integration.db import get_mongodb_connection

from analyser.ml_tools import SemanticTag, relu
from analyser.runner import save_analysis 

from analyser.hyperparams import models_path

from pymongo import ASCENDING

In [None]:
# Bare expression: displays the GPN_DB_HOST environment variable, or
# 'localhost' when it is not set.
# NOTE(review): presumably the host used by the DB connection helper — confirm.
os.environ.get('GPN_DB_HOST', 'localhost') 

## Embeddings

In [None]:
from tf_support.embedder_elmo import ElmoEmbedder

# Obtain the embedder through its get_instance() accessor; it is used below
# to embed the document's sentence-map entries.
# NOTE(review): get_instance() presumably returns a shared instance — confirm.
embedder = ElmoEmbedder.get_instance()

In [None]:
# Forwarded as the second argument of tokenize_doc_into_sentences_map() below.
# NOTE(review): presumably a target mean sentence length; unit not visible here — confirm.
mean_len = 400 #TODO: move to config

In [None]:
# Bare expression: displays models_path (imported from analyser.hyperparams),
# the location the pattern centroids are loaded from below.
models_path

# Analysing sample doc

In [None]:
import numpy as np
from pathlib import Path
from bson import ObjectId
from sklearn.metrics import pairwise_distances


# Load the stored pattern centroids from the models directory. The first
# axis indexes clusters, so its length is the number of clusters.
centroids = np.load(Path(models_path, "insides_patterns.npy"))
n_clusters = centroids.shape[0]

print(centroids.shape)

In [None]:
# Echo the configured id, convert it to a bson ObjectId and fetch the
# matching record with get_doc_by_id().
print(TEST_DOC_ID)
sample_id = ObjectId(TEST_DOC_ID)
sample_db_doc = get_doc_by_id(sample_id)

In [None]:
from analyser.legal_docs import tokenize_doc_into_sentences_map

# Wrap the fetched record and convert it into the analyser's document object.
sample_j_doc = DbJsonDoc(sample_db_doc)
sample_doc = sample_j_doc.asLegalDoc()

# Rebuild the sentence map from the document's full text; mean_len is
# forwarded unchanged as the second argument.
sample_doc.sentence_map = tokenize_doc_into_sentences_map(
    sample_doc.tokens_map.get_full_text(),
    mean_len,
)

# Embed the entries of the sentence map with the embedder created earlier.
doc_embeddings = embedder.embedd_strings(sample_doc.sentence_map.tokens)

In [None]:
# Dump the instance attributes currently stored on the document's attribute tree.
print(vars(sample_doc.attributes_tree))

In [None]:
# Show the raw embedding matrix as an image on a 20x10 figure.
plt.figure(figsize=(20, 10))
plt.imshow(doc_embeddings)

In [None]:
# Cosine distance between every document embedding (rows of X) and every
# pattern centroid.
X = doc_embeddings
distance_matrix = pairwise_distances(X, centroids, metric='cosine', n_jobs=1)

# Turn distance into similarity (1 - distance) and transpose, so that each
# row holds one cluster's scores over the document. Despite its name, the
# variable holds similarities from here on.
distance_matrix = (1.0 - distance_matrix).T

plt.figure(figsize=(30, 4))
plt.imshow(distance_matrix)

# Number of columns, i.e. scored document entries per cluster.
print(len(distance_matrix[0]))

In [None]:

#TODO: move to analyser code
# sample_doc.attributes_tree.__dict__.update(sample_j_doc.analysis['attributes_tree']['contract'])
# Spread and average over all entries of the matrix (every cluster/sentence
# pair), for reference when choosing the similarity threshold used below.
print(distance_matrix.std())
print(distance_matrix.mean()) 

In [None]:


# Select the pattern cluster that best matches the document and tag it.
#
# Every row of distance_matrix is one cluster's similarity curve over the
# sentence map. Clusters are visited in order and a cluster wins only when
# its peak is strictly greater than the best peak seen so far (which starts
# at `threshold`). The tag left on the document therefore belongs to the
# strongest cluster; nothing is tagged when no cluster beats the threshold.
eps = 0.01  # not used in this cell
threshold = 0.82  # alternative considered: 0.9 * distance_matrix.max()
print('threshold', threshold)
print()

sim_max = threshold  # best peak similarity so far
i_max = 0  # index of the winning cluster; stays 0 when nothing beats the threshold

plt.figure(figsize=(30, 6))
plt.plot([threshold] * len(sample_doc.sentence_map), alpha=0.4)

for k in range(n_clusters):
    print('-' * 20)

    av = distance_matrix[k]  # similarity curve of cluster k
    ii = av.argmax()  # position of the best-matching sentence

    if av[ii] > sim_max:
        plt.plot(av, alpha=0.5)

        print( f"{k}=cluster \t {av[ii]}=similarity, \n {sample_doc.sentence_map.tokens[ii]} ")
        char_span = sample_doc.sentence_map.map[ii]

        # NOTE(review): remap_span presumably translates the one-sentence span
        # into coordinates of sample_doc.tokens_map — confirm.
        _span = sample_doc.sentence_map.remap_span((ii, ii + 1), sample_doc.tokens_map)
        print("span (chars):", char_span, _span)

        # Builtin float instead of np.float: the alias was deprecated in
        # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        tag = SemanticTag('insideInformation', 'Unknown', span=_span, confidence=float(av[ii]))
        print(tag)

        i_max = k
        sim_max = av[ii]

        sample_doc.attributes_tree.insideInformation = tag

# Overlay the mean curve across clusters (black) and the winning cluster (red).
plt.plot(distance_matrix.mean(axis=0), alpha=0.5, color='black')
plt.plot(distance_matrix[i_max], alpha=0.9, color='red')

# Horizontal reference line at the overall mean of the matrix.
mean_ = distance_matrix.mean()
plt.plot([mean_] * len(sample_doc.sentence_map), alpha=0.4)

print(sim_max, i_max)



In [None]:


# save_analysis(sample_j_doc, sample_doc, state = sample_j_doc.state )

In [None]:
# Plot the matrix mean over axis 0 (across clusters, given the
# cluster-per-row layout) together with a reference line at 99% of its peak.
plt.figure(figsize=(30, 6))

mean_over_clusters = distance_matrix.mean(axis=0)
relu_threshold = mean_over_clusters.max() * 0.99

plt.plot([relu_threshold] * len(sample_doc.sentence_map), alpha=0.4)
plt.plot(mean_over_clusters)

In [None]:
from colab_support.renderer import HtmlRenderer
import matplotlib as matplotlib
# Public import path: importing `display` from IPython.core.display has been
# deprecated since IPython 7.14.
from IPython.display import display, HTML


class DemoRenderer(HtmlRenderer):
    """HtmlRenderer that shows colour-coded text inline in the notebook."""

    def render_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
        """Build the coloured HTML for `tokens` and display it in the notebook.

        All arguments are forwarded to `to_color_text`.
        """
        html = self.to_color_text(tokens, weights, colormap, print_debug, _range, separator=separator)
        display(HTML(html))

    def to_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
        """Return the HTML produced by the parent class's `_to_color_text`.

        The `matplotlib` module is passed as the third positional argument.
        `print_debug` is accepted for signature compatibility but is not
        forwarded to the parent implementation.
        """
        return super()._to_color_text(tokens, weights, matplotlib, colormap=colormap, _range=_range, separator=separator)

renderer_ = DemoRenderer()

# Colour each sentence-map entry by how far the winning cluster's similarity
# exceeds the mean over all clusters, after passing the difference through
# relu with 0.1 as its second argument.
# NOTE(review): relu's second argument is presumably a cut-off — confirm.
renderer_.render_color_text(
    sample_doc.sentence_map.tokens,
    relu(distance_matrix[i_max] - distance_matrix.mean(axis=0), 0.1),
    _range=(0, 1),
)