# Imports

In [None]:
# Notebook-wide switches.
# NOTE(review): IMPORT_FRESH_ONLY is not read by any of the visible cells — confirm it is still needed.
IMPORT_FRESH_ONLY = True # re-import all if False

# When True, the demo / sanity-check code guarded by `if SELF_TEST:` is executed.
SELF_TEST = True


In [None]:
import logging
import platform
import sys
import os
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
from datetime import datetime
import tensorflow as tf

# Notebook-scoped logger that writes DEBUG-and-above records to the console.
logger = logging.getLogger('retrain_ipynb')
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(levelname)s - %(asctime)s - %(name)s - %(message)s')
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

# `logging.getLogger` returns the same logger object on every call, so re-running
# this cell would otherwise attach one more handler each time and every message
# would be printed repeatedly.
if not logger.handlers:
    logger.addHandler(ch)
logger.debug('--=logging started=--')

# Print the TensorFlow version and the processor name reported by `platform.processor()`.
print(tf.__version__)
CPU = platform.processor()
print (f'Running on CPU:{CPU}')

In [None]:
# Put the parent of the current working directory on the import path so that the
# project package `analyser` can be imported from the notebook.
nb_dir = os.path.dirname(os.getcwd())

if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import analyser.hyperparams
# Echo where the module was actually loaded from.
analyser.hyperparams.__file__

### Prepare workdir

In [None]:
from pathlib import Path

# Working directory: the GPN_WORK_DIR environment variable when it is set,
# otherwise `work` under the third ancestor directory of the hyperparams module file.
_work_dir_default = Path(analyser.hyperparams.__file__).parent.parent.parent / 'work'
work_dir = os.environ.get('GPN_WORK_DIR', _work_dir_default)

# makedirs(exist_ok=True) also creates missing parent directories (os.mkdir raised
# FileNotFoundError for a nested GPN_WORK_DIR) and avoids the check-then-create race.
os.makedirs(work_dir, exist_ok=True)

# Publish the location to the project configuration module.
analyser.hyperparams.work_dir = work_dir

print('work_dir=', analyser.hyperparams.work_dir)
assert os.path.isdir(analyser.hyperparams.work_dir)

### Imports...

In [None]:
%matplotlib inline

import pandas as pd
from pandas import DataFrame

from analyser.finalizer import get_doc_by_id
from analyser.persistence import DbJsonDoc
from integration.db import get_mongodb_connection
from pymongo import ASCENDING

### Import docs having insideInformation

In [None]:

# Fetch the ids (at most 1000) of documents in which a user annotated
# `insideInformation`, either directly under `contract` or under `contract.subject`.
db = get_mongodb_connection()
documents_collection = db['documents']
sorting = [('analysis.analyze_timestamp', ASCENDING), ('user.updateDate', ASCENDING)]

query = {
  '$and': [
    # Optional extra filters, currently disabled:
    #     {"parse.documentType": "CONTRACT"},
    #     {"state": 15},
    {'$or': [
        {"user.attributes_tree.contract.subject.insideInformation": {"$ne": None}},
        {"user.attributes_tree.contract.insideInformation": {"$ne": None}}
    ]}
  ]
}

res = documents_collection.find(filter=query,
                                sort=sorting,
                                projection={'_id': True, 'user.updateDate': True}
                                ).limit(1000)

# Materialise the cursor once so the result can be reused by the following cells.
res_inside = list(res)

_s = f"#### Всего документов с инсайдом  {len(res_inside)}"
display(Markdown(_s))

if SELF_TEST:
    # A bare expression nested inside `if` is never echoed by Jupyter; display it explicitly.
    display(res_inside[:5])

### Вынимаем размеченные людьми инсайды из базы

In [None]:
# Build one row per annotated document:
# (document id, span start, span end, quoted text, annotated value).
lines = []
for k, oid in enumerate(i["_id"] for i in res_inside):
    d = get_doc_by_id(oid)
    jd = DbJsonDoc(d)
    tree = jd.user['attributes_tree']
    # `or {}` (instead of a .get default) also covers keys that exist but hold None.
    c = tree.get('contract') or {}
    # The annotation lives either directly under `contract` or under `contract.subject`.
    ins = c.get('insideInformation') or (c.get('subject') or {}).get('insideInformation') or {}
    print('-'*100)
    print(k, ins)

    s = ins.get('span')
    if not s or 'value' not in ins:
        # Nothing to quote without a span/value: skip this document instead of
        # aborting the whole export with a KeyError.
        logger.warning('document %s: insideInformation has no span/value, skipped', oid)
        continue

    doc = jd.asLegalDoc()
    quote = doc[s[0]: s[1]].get_text()
    print(quote)
    lines.append([oid, s[0], s[1], quote, ins['value']])

insides = DataFrame(lines, columns=['uid', 'from', 'to', 'text', 'value'])
insides

### Очистка, сортировка, удаление дубликатов

In [None]:
# Keep a single row per distinct quoted text (the last occurrence wins), then write
# the table, ordered by `value`, to insides.csv in the current working directory.
insides = insides.drop_duplicates(subset=['text'], keep='last')
insides.sort_values(['value']).to_csv('insides.csv', index=True)

## Embedding patterns

In [None]:
from tf_support.embedder_elmo import ElmoEmbedder

# The embedder is obtained through the class-level accessor instead of being constructed here.
# NOTE(review): presumably a shared ELMo instance — confirm in tf_support.embedder_elmo.
embedder = ElmoEmbedder.get_instance()


In [None]:
# Plain Python list of the annotated quotes, in DataFrame row order.
strings = insides['text'].tolist()
strings[2:8]

In [None]:
# Manual smoke test of the embedder on the first two quotes; disabled by default.
if False:
    t1 = insides.iloc[0].text
    t2 = insides.iloc[1].text
    print(t1, t2)  # previously printed t1 twice, so the second sample was never shown
    embeddings = embedder.embedd_strings([t1, t2])
    print(embeddings)

In [None]:
# Embed every annotated quote.
# NOTE(review): the result is used below as a 2-D array with one row per string — confirm against embedd_strings.
embeddings = embedder.embedd_strings(strings)

In [None]:
print(embeddings.shape)

# Show the raw embedding matrix as an image.
plt.figure(figsize=(30,10))
plt.imshow(embeddings)


## Pairwise distances of embeddings, removing duplicates

In [None]:
from sklearn.metrics import pairwise_distances

# Cosine distance between every pair of quote embeddings, shown as a heat map.
X = embeddings
distance_matrix = pairwise_distances(X, X, metric='cosine', n_jobs=1)

plt.figure(figsize=(10, 10))
plt.imshow(distance_matrix)

In [None]:
# Near-duplicate scan: for every pair (i, j) with i < j whose cosine distance is
# below 0.1, remember the earlier index as kept and flag the later one for removal.
_excluded = set()
_kept = set()

display(Markdown('### Одинаковые:'))
_n = distance_matrix.shape[0]
for i in range(_n):
    for j in range(i + 1, _n):
        d = distance_matrix[i, j]
        if not d < 0.1:
            continue
        print(i, 'vs', j, ', cosine distance =', d)
        _kept.add(i)
        _excluded.add(j)
        print(i, strings[i])
        print('==')
        print(j, strings[j])
        print('-' * 100)

# Indices that were kept and never flagged themselves, then everything flagged.
print(_kept - _excluded)
print(_excluded)

In [None]:
import numpy as np

# Remove the rows flagged as near-duplicates from both the embeddings and the texts.
_drop_idx = sorted(_excluded)
embeddings_filtered = np.delete(embeddings, _drop_idx, axis=0)
strings_filtered = np.delete(strings, _drop_idx, axis=0)

display(Markdown(f'### {embeddings_filtered.shape[0]} -- Количество паттернов после удаления одинаковых'))

In [None]:

# Recompute the pairwise cosine distances on the de-duplicated embeddings;
# this overwrites `X` and `distance_matrix` from the previous section.
X = embeddings_filtered
distance_matrix = pairwise_distances(X, X, metric='cosine', n_jobs=1)
 
            
plt.figure(figsize=(5,5))
plt.imshow(distance_matrix)

## Clustering, t-SNE try

In [None]:
# One cluster is requested per distinct annotated `value`.
_distinct_values = insides.value.unique()
print(_distinct_values)
n_clusters = len(_distinct_values)
print('n_clusters', n_clusters)

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE projection driven by the precomputed cosine-distance matrix.
# * init="random": the PCA initialisation (the default in recent scikit-learn
#   releases) is rejected together with metric="precomputed", because a distance
#   matrix offers no feature space to run PCA on.
# * perplexity has to stay below the number of samples, so it is capped for small
#   pattern sets (it stays 40 whenever there are more than 40 patterns).
# * random_state pins the otherwise stochastic layout so re-runs are comparable.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` — switch when upgrading.
_n_samples = distance_matrix.shape[0]
tsne = TSNE(n_components=2, verbose=1, perplexity=min(40, _n_samples - 1), n_iter=1300,
            metric="precomputed", init="random", random_state=0)
tsne_results = tsne.fit_transform(distance_matrix)

In [None]:
# Scatter plot of the two t-SNE components.
df_subset = {
    'tsne-2d-one': tsne_results[:, 0],
    'tsne-2d-two': tsne_results[:, 1],
}

plt.figure(figsize=(8, 8))
sns.scatterplot(data=df_subset, x="tsne-2d-one", y="tsne-2d-two", alpha=0.8)

## Clustering, PCA, Birch 

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import Birch
import numpy as np

# Project the de-duplicated embeddings onto their first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(embeddings_filtered)

# Cluster in the 2-D PCA space. The variable keeps the name `kmeans` although the
# estimator is Birch, because later cells read `kmeans.labels_`.
# KMeans alternative: KMeans(n_clusters=n_clusters, random_state=0).fit(np.array(pca_result))
kmeans = Birch(n_clusters=n_clusters).fit(np.array(pca_result))

df_subset = {
    'pca2d-one': pca_result[:, 0],
    'pca2d-two': pca_result[:, 1],
    'label': kmeans.labels_,
}

# Every marker gets the same size value; colour encodes the cluster label.
_n_points = len(kmeans.labels_)
plt.figure(figsize=(8, 8))
sns.scatterplot(
    data=df_subset,
    x="pca2d-one", y="pca2d-two",
    hue="label", palette="tab10",
    size=[200] * _n_points, sizes=(400, 500),
    alpha=0.6,
)
# print(pca_result[:3])

In [None]:
# Write the de-duplicated table again, this time in its current row order.
# NOTE(review): this overwrites the `value`-sorted insides.csv written right after de-duplication — confirm which ordering is wanted.
insides.to_csv('insides.csv', index=True)

In [None]:
# Number of points that went through PCA and clustering.
print(len(pca_result))

## Finding cluster centers (in embedding space)

In [None]:
from colab_support.renderer import HtmlRenderer
import matplotlib
# `display` and `HTML` belong to the public `IPython.display` API; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML


class DemoRenderer(HtmlRenderer):
  """Notebook renderer that shows weighted tokens as colour-highlighted HTML."""

  def render_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Build the coloured HTML for `tokens`/`weights` and display it in the cell output."""
    html = self.to_color_text(tokens, weights, colormap, print_debug, _range, separator=separator)
    display(HTML(html))

  def to_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Return the HTML produced by `HtmlRenderer._to_color_text`.

    The `matplotlib` module is handed to the base implementation; `print_debug`
    is accepted for signature compatibility but is not forwarded.
    """
    return super()._to_color_text(tokens, weights, matplotlib, colormap=colormap, _range=_range, separator=separator)


renderer_ = DemoRenderer()

if SELF_TEST:
    # Two-token smoke test: the first token carries the full weight, the second none.
    renderer_.render_color_text(["слово 1", "слово 2"], np.array( [1, 0]), _range=(0,1))

In [None]:
# Character length of every annotated quote.
lens = list(map(len, strings))
print(lens)

# 1.75 x the mean quote length, truncated to int; passed later to
# tokenize_doc_into_sentences_map as its second argument.
mean_len = int(1.75 * np.mean(lens))
print(mean_len)

In [None]:
# Global mean of all de-duplicated embeddings (printed for reference only).
centroid = np.mean(embeddings_filtered, axis=0)
print(centroid)

# One centroid per cluster: the mean embedding of the members of that cluster.
_labels = np.asarray(kmeans.labels_)
centroids = []
for k in range(n_clusters):
    # Boolean mask instead of a Python loop over all rows for every cluster.
    group = embeddings_filtered[_labels == k]
    print(len(group))
    if len(group) == 0:
        # The clusterer may assign no points to a requested label. np.mean of an
        # empty group is a NaN scalar, which would make the centroid list ragged
        # and corrupt the saved patterns, so such clusters are left out.
        print(f'cluster {k} is empty, skipped')
        continue
    c = np.mean(group, axis=0)
    centroids.append(c)
    print(c)

In [None]:
from analyser.hyperparams import models_path
# Echo the directory into which the pattern file is written in the next section.
models_path

# Save patterns  (embeddings binary array)

In [None]:
# Persist the per-cluster centroids as a NumPy .npy file inside `models_path`.
np.save(Path(models_path) / "insides_patterns.npy",  centroids)

# Analysing sample doc

In [None]:
# Round-trip check: reload the patterns that were just written and print their shape.
if SELF_TEST:    
    centroids = np.load(Path(models_path) / "insides_patterns.npy")
    print(centroids.shape)

In [None]:
# Load the first annotated document and embed its sentences for the similarity demo.
# NOTE(review): later cells read `sample_doc` and `doc_embeddings` unconditionally, so they raise NameError when SELF_TEST is False.
if SELF_TEST:
    

    sample_id     = res_inside[0]["_id"]
    sample_db_doc = get_doc_by_id(sample_id)
    sample_j_doc  = DbJsonDoc(sample_db_doc)
    sample_doc    = sample_j_doc.asLegalDoc()


    from analyser.legal_docs import tokenize_doc_into_sentences_map
    # Rebuild the sentence map from the full text, passing `mean_len` as the second argument.
    sample_doc.sentence_map = tokenize_doc_into_sentences_map(sample_doc.tokens_map.get_full_text(), mean_len)

    print(sample_doc)

    # One embedding per entry of the sentence map.
    doc_embeddings = embedder.embedd_strings(sample_doc.sentence_map.tokens)

In [None]:
# Show the document embeddings as an image, transposed relative to `doc_embeddings`.
plt.figure(figsize=(10,10))
plt.imshow(doc_embeddings.T)

In [None]:
def relu(x: np.ndarray, relu_th: float = 0.0) -> np.ndarray:
  """Zero every element of `x` that is not strictly greater than `relu_th`.

  Elements above the threshold are returned unchanged (they are not shifted by it).
  """
  above_threshold = x > relu_th
  return x * above_threshold


# Cosine similarity (1 - cosine distance) of every sentence embedding to every
# centroid, transposed so that row k holds the similarities to centroid k.
# The name `distance_matrix` is kept because the following cells read it.
X = doc_embeddings
distance_matrix = 1.0 - pairwise_distances(X, centroids, metric='cosine', n_jobs=1)
distance_matrix = distance_matrix.T

plt.figure(figsize=(30, 4))
plt.imshow(distance_matrix)
# Length of one row, i.e. the number of embedded sentences.
print(len(distance_matrix[0]))

In [None]:
# Per-centroid similarity profile of the sample document. Profiles whose peak exceeds
# `threshold` are plotted and their best-matching sentence is printed; `i_max` and
# `sim_max` remember the centroid with the overall highest peak.
eps = 0.01  # NOTE(review): not used in this cell
threshold = 0.8  # alternative: 0.9 * distance_matrix.max()
print('threshold', threshold)
print()
sim_max = 0
i_max = 0
plt.figure(figsize=(30, 6))
plt.plot([threshold] * len(sample_doc.sentence_map), alpha=0.4)
# Iterate over the rows that actually exist: the matrix has one row per stored
# centroid, which is not guaranteed to equal `n_clusters` (the patterns are
# reloaded from disk), and indexing past the last row raises IndexError.
for k in range(len(distance_matrix)):
    print('-' * 20)
    v = distance_matrix[k]
    av = v  # attention vector; relu(v, threshold) would be the thresholded variant

    ii = av.argmax()
    sim = av[ii]
    if sim > threshold:
        plt.plot(av)
        print( f"{k}=cluster \t {av[ii]}=similarity, \n {sample_doc.sentence_map.tokens[ii]} ")

    if sim > sim_max:
        i_max = k
        sim_max = sim
print(sim_max, i_max)

In [None]:
# Render the sentences coloured by their similarity to the best-matching centroid;
# `relu` zeroes every value that is not above 99% of the matrix-wide maximum.
relu_threshold =  0.99 *  distance_matrix.max()
renderer_.render_color_text(sample_doc.sentence_map.tokens, relu(distance_matrix[i_max], relu_threshold), _range=(0,1))

## Test no-insides DOC

In [None]:
from bson import ObjectId
from analyser.legal_docs import tokenize_doc_into_sentences_map
# doc =

# Same preparation as for the previous sample, but for one hard-coded document id
# (per the section title, a document without inside information).
if SELF_TEST:
    sample_db_doc    =  get_doc_by_id(ObjectId('60dec1f556214d9842813fcb'))    
    sample_j_doc  = DbJsonDoc(sample_db_doc)
    sample_doc    = sample_j_doc.asLegalDoc()
    
    
    print(sample_doc)
    # Rebuild the sentence map from the full text, passing `mean_len` as the second argument.
    sample_doc.sentence_map = tokenize_doc_into_sentences_map(sample_doc.tokens_map.get_full_text(), mean_len)

#     print(sample_doc)

    # One embedding per entry of the sentence map; overwrites the previous `doc_embeddings`.
    doc_embeddings = embedder.embedd_strings(sample_doc.sentence_map.tokens)

In [None]:
# Cosine similarity (1 - cosine distance) of every sentence embedding of the second
# sample to every centroid, transposed so that row k belongs to centroid k.
# The name `distance_matrix` is kept because the following cells read it.
X = doc_embeddings
distance_matrix = 1.0 - pairwise_distances(X, centroids, metric='cosine', n_jobs=1)
distance_matrix = distance_matrix.T

plt.figure(figsize=(30, 4))
plt.imshow(distance_matrix)
# Length of one row, i.e. the number of embedded sentences.
print(len(distance_matrix[0]))

In [None]:
# Per-centroid similarity profile of the second sample document. Profiles whose peak
# exceeds `threshold` are plotted and their best-matching sentence is printed;
# `i_max` and `sim_max` remember the centroid with the overall highest peak.
eps = 0.01  # NOTE(review): not used in this cell
threshold = 0.85  # alternative: 0.9 * distance_matrix.max()
print('threshold', threshold)
print()
sim_max = 0
i_max = 0
plt.figure(figsize=(30, 6))
plt.plot([threshold] * len(sample_doc.sentence_map), alpha=0.4)
# Iterate over the rows that actually exist: the matrix has one row per stored
# centroid, which is not guaranteed to equal `n_clusters` (the patterns are
# reloaded from disk), and indexing past the last row raises IndexError.
for k in range(len(distance_matrix)):
    print('-' * 20)
    v = distance_matrix[k]
    av = v  # attention vector; relu(v, threshold) would be the thresholded variant

    ii = av.argmax()
    sim = av[ii]
    if sim > threshold:
        plt.plot(av)
        print( f"{k}=cluster \t {av[ii]}=similarity, \n {sample_doc.sentence_map.tokens[ii]} ")

    if sim > sim_max:
        i_max = k
        sim_max = sim
print(sim_max, i_max)

In [None]:
# Render the sentences of the second sample coloured by their raw similarity to the best-matching centroid.
# NOTE(review): `relu_threshold` is computed but not used here — unlike the first sample, no thresholding and no explicit `_range` are applied; confirm this is intentional.
relu_threshold =  0.99 *  distance_matrix.max()
renderer_.render_color_text(sample_doc.sentence_map.tokens, distance_matrix[i_max])