# Imports

In [None]:
# NOTE(review): IMPORT_FRESH_ONLY is not read anywhere in the visible part of this notebook — confirm it is still needed.
IMPORT_FRESH_ONLY = True # re-import all if False

# Enables the cells guarded by `if SELF_TEST:` (preview of the fetched ids,
# reloading the saved patterns, fetching and embedding a sample document).
SELF_TEST = True

In [None]:
# !conda install sentence-transformers -y

In [None]:
# jupyter nbextension enable --py widgetsnbextension
# !conda install ipywidgets widgetsnbextension -y


In [None]:
import logging
import platform
import sys
import os
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
from datetime import datetime
import tensorflow as tf

# Notebook-wide logger that writes DEBUG and above to the cell output.
logger = logging.getLogger('retrain_ipynb')
logger.setLevel(logging.DEBUG)

# `logging.getLogger` returns the same object for the same name for the whole
# life of the kernel. Without this guard every re-run of the cell attached one
# more StreamHandler, and each message was then printed once per handler.
if not logger.handlers:
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(levelname)s - %(asctime)s - %(name)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
logger.debug('--=logging started=--')

# Record the runtime environment in the cell output: the TensorFlow version
# first, then the processor name reported by the `platform` module.
CPU = platform.processor()
print(tf.__version__)
print(f'Running on CPU:{CPU}')

In [None]:
# Put the parent of the current working directory on the import path so that
# the project packages imported below (e.g. `analyser`) can be resolved.
nb_dir = os.path.dirname(os.getcwd())

if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)

import analyser.hyperparams 
analyser.hyperparams.__file__

In [None]:
from sentence_transformers import SentenceTransformer
 
# Pretrained multilingual sentence encoder; its `encode` method produces the embeddings used below.
sentence_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
# Input length limit applied by the encoder.
# NOTE(review): presumably raised above the model default so long clauses are not truncated — confirm.
sentence_model.max_seq_length = 512
 

### Prepare workdir

In [None]:
from pathlib import Path

from analyser.hyperparams import work_dir, models_path  

 
  

### Imports...

In [None]:
%matplotlib inline

import pandas as pd
from pandas import DataFrame

from integration.db import get_doc_by_id
from analyser.persistence import DbJsonDoc
from integration.db import get_mongodb_connection
from pymongo import ASCENDING, DESCENDING

 

# Init MLflow logging

In [None]:
import mlflow
import gpn_config

# The tracking server URL is taken from the project configuration.
# (The original line had an unterminated string literal, so the cell could not even be parsed.)
ml_flow_url = gpn_config.configured('MLFLOW_URL')
mlflow.set_tracking_uri(ml_flow_url)
print(f'{ml_flow_url=}', 'set MLFLOW_URL env var to re-define')

mlflow.set_experiment("Обучение детектора инсайдерской информации")

# A run left open by a previous (possibly failed) execution in the same kernel
# makes `start_run` raise; close it first so the cell can be re-run.
if mlflow.active_run() is not None:
    mlflow.end_run()

active_mlflow_run = mlflow.start_run(nested=False)

# Read patterns CSV

In [None]:
# models_path

In [None]:
csv_path = Path(models_path)/'insides_keyphrases.csv'
csv_path.is_file()

In [None]:
insides_keyphrases_df = pd.read_csv(csv_path,  sep=';')
insides_keyphrases_df

In [None]:
# Only the first two columns of the CSV are used as key phrases.
insides_keyphrases_df = insides_keyphrases_df.iloc[:, 0:2]

# Flatten both columns into one list of stripped strings, second column first
# (same order as before), skipping empty cells.
# The columns are read by position with `.iloc`: the previous `r[0]` / `r[1]`
# on a label-indexed row relied on integer keys being treated as positions,
# which pandas has deprecated.
keyphrases = [str(v).strip() for v in insides_keyphrases_df.iloc[:, 1] if pd.notnull(v)]
keyphrases += [str(v).strip() for v in insides_keyphrases_df.iloc[:, 0] if pd.notnull(v)]


In [None]:
keyphrases[:20]

mlflow.log_param('n_samples_file', len(keyphrases))

### Import docs having insideInformation from DB

In [None]:

# Select the documents whose user-edited attribute tree carries an
# insideInformation entry, either on the contract or on the contract subject.
db = get_mongodb_connection()
documents_collection = db['documents']
# Most recently updated / analysed documents first.
sorting = [('user.updateDate', DESCENDING), ('analysis.analyze_timestamp', DESCENDING)]

query = {
  '$and': [
#     {"parse.documentType": "CONTRACT"},
#     {"state": 15},
    {'$or': [
        {"user.attributes_tree.contract.subject.insideInformation": {"$ne": None}},
        {"user.attributes_tree.contract.insideInformation": {"$ne": None}}
    ]}
  ]
}

# Only the ids (plus the update date) are fetched here; the full documents
# are loaded one by one in the next cell. At most 5000 documents are taken.
res = documents_collection.find(filter=query,
                                sort=sorting,
                                projection={'_id': True, 'user.updateDate': True}
                                ).limit(5000)

# Materialise the cursor so the result can be counted and reused.
res_inside = list(res)

_s = f"#### Всего документов с инсайдом  {len(res_inside)}"
display(Markdown(_s))

if SELF_TEST:
    # A bare expression inside an `if` block is never echoed by Jupyter,
    # so the preview has to be displayed explicitly.
    display(res_inside[:5])

### Вынимаем размеченные людьми инсайды из базы

In [None]:
# Build one row per selected document: the labelled insideInformation span,
# the normalised text under that span and the label value.
lines = []
for item in res_inside:
    oid = item["_id"]
    d = get_doc_by_id(oid)
    jd = DbJsonDoc(d)
    tree = jd.user['attributes_tree']
    # `or {}` also covers keys that are present but explicitly null,
    # which `.get(key, {})` does not.
    c = tree.get('contract') or {}
    ins = c.get('insideInformation') or (c.get('subject') or {}).get('insideInformation')
    span = ins.get('span') if ins else None
    if not span:
        # One document without a usable span must not abort the extraction
        # of all the others: report it and move on.
        logger.warning('document %s has no usable insideInformation span; skipped', oid)
        continue

    doc = jd.asLegalDoc()
    s = span
    quote = doc[s[0]: s[1]].get_normal_text()
    lines.append([oid, s[0], s[1], quote, ins['value']])

insides = DataFrame(lines, columns=['uid', 'from', 'to', 'text', 'value'])
insides

In [None]:
mlflow.log_param('n_samples_db', len(insides))

## List samples that are too long

In [None]:
# Print an editor link and the full text of every labelled sample longer
# than 500 characters, so it can be reviewed by hand.
print()
for idx, row in insides.iterrows():
    if len(row.text) > 500:
        edit_url = f'http://gpn-audit.nemosoft.ru/#/pre-audit/edit/{row.uid}'
        print(edit_url, '\n\t', row.text)
        print()

In [None]:
# !git checkout {models_path}/
# !which python

# !pip install spacy==3.1.2


## Split samples into pieces

In [None]:
from spacy.lang.ru import Russian # updated

# Russian pipeline with only the `sentencizer` component added, used below
# to split labelled fragments into sentences.
# Removed: the `if True:` wrapper (dead guard) and the Python 2 era
# `from __future__ import unicode_literals, print_function`, which does nothing
# on the Python 3 this notebook requires and is a SyntaxError mid-file once
# the notebook is exported to a script.
nlp = Russian()
nlp.add_pipe('sentencizer')


### Очистка, сортировка, удаление дупликатов

In [None]:
from analyser.hyperparams import HyperParameters

# Target sentence length taken from the project hyper-parameters
# (used when tokenizing documents into sentences further down).
mean_len = HyperParameters.mean_sentense_pattern_len
print(mean_len)

# Drop samples with identical text (the last occurrence wins) and export
# the remaining ones, ordered by label value, next to the models.
insides = insides.drop_duplicates(subset=['text'], keep='last')
insides_csv_path = Path(models_path) / 'insides.csv'
insides.sort_values(by=['value']).to_csv(insides_csv_path, index=True)
print(f'{len(insides)=}')


### Удаление слишком коротких

In [None]:
from analyser.legal_docs import tokenize_doc_into_sentences_map

# Split every labelled fragment into sentences.
#  * sentences longer than 12 characters become positive patterns (`strings`);
#  * anything shorter is collected as a negative example (`nonsense`);
#  * sentences longer than 400 characters are additionally reported together
#    with the id of the document they came from.
strings = []
nonsense = []
for uid, text in zip(insides['uid'].tolist(), insides['text'].tolist()):
    for sentence in nlp(text).sents:
        sent = str(sentence).strip()
        if len(sent) > 400:
            print('*'*100)
            print(f'{len(sent)}\t {uid}\t {sent}')
        target = strings if len(sent) > 12 else nonsense
        target.append(sent)



# Hand-written positive examples added on top of the ones mined from the DB.
# The long ones are written as adjacent string literals: the previous
# backslash continuations pulled the indentation of the following source line
# into the sample, leaving runs of spaces in the middle of the sentences.
strings += [
    'Заказчик передает Исполнителю данные аккаунтов (логин и пароль) в социальных сетях',
    ('если особенностями процесса оказания услуг обусловлена необходимость доступа персонала Исполнителя к '
     'электронным информационным системам, программам и базам данных, Заказчик гарантирует предоставление Исполнителю '
     'такого доступа в соответствии с локальными нормативными актами Заказчика и при прохождении процедур инструктажа, '
     'установленных Заказчиком, не противоречащих действующему законодательству, в течение первых двух дней выездной аудиторской проверки'),
    ('Заказчик обязуется предоставить Исполнителю окончательный вариант годового отчета и/или отчета эмитента (или иной информации), включая совокупность документов, '
     'совместно составляющих годовой отчет, чтобы Исполнитель мог завершить процедуры, необходимые в соответствии с требованиями MCA'),
]


# Hand-picked negative examples (address fragments, filler, truncated
# sentences) appended to the ones collected automatically above.
nonsense.extend([
    'Губкина, дом 1',
    'без всякого смысла',
    ' ',
    'non abra kadabra plus',
    'gazprom-neft.',
    'за заданный исторический период.',
    'Губкина, дом 1 (далее - «Объект»)',
    'Строительство производства катализаторов 2 этап», находящихся по адресу: 644040, г.',
    'поставку Покупателю экземпляров программного обеспечения jFrog Artifactory Pro X Edition',
    'Строительство производства катализаторов',
])

# The key phrases read from the CSV file are positive patterns as well.
strings.extend(keyphrases)

In [None]:
print(nonsense)

## Vectorising the patterns

In [None]:
 

# Short alias: every embedding in this notebook is produced by the sentence model's `encode` method.
embedd_strings = sentence_model.encode



### Finding the centre of nonsense (Trouver le centre d'absurdité / поиск центра абсурда)

In [None]:
%%time
# Embed the negative examples and average them (over the sample axis)
# into a single reference vector.
nonsense_embeddings = embedd_strings(nonsense)
center_of_nonsense = nonsense_embeddings.mean(axis=0)
center_of_nonsense.shape

In [None]:
%%time
# embeddings = embedder.embedd_strings(strings) #WARNing! these are context-scpecific

# Encode each pattern in a call of its own (a batch of one) and keep the
# single resulting vector.
# NOTE(review): the comment above says batch embeddings are context-specific — presumably that is why batching is avoided; confirm.
embeddings_l = [embedd_strings([text])[0] for text in strings]
    

In [None]:
%%time
import numpy as np
# Stack the per-pattern vectors into one 2-D array (one row per pattern)
# and report its shape and value range.
embeddings = np.array(embeddings_l)
print(embeddings.shape, embeddings.max(), embeddings.min())

# Visual sanity check of the raw embedding matrix.
plt.figure(figsize=(20,10))
plt.imshow(embeddings)


## Pairwise distances of embeddings, removing duplicates

In [None]:
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize

# Cosine distance between every pair of pattern embeddings.
distance_matrix = pairwise_distances(embeddings, embeddings, metric='cosine', n_jobs=1)
print(distance_matrix.shape)

plt.figure(figsize=(10,10))
plt.imshow(distance_matrix)

# Summary statistics over the whole matrix (the diagonal is included).
for stat_name in ('mean', 'max', 'std'):
    print(f'distance_matrix {stat_name}', getattr(distance_matrix, stat_name)())

In [None]:
# Cosine distance from every pattern to the centre of the negative examples
# (one column, one row per pattern).
nonsense_distance_matrix = pairwise_distances(embeddings, [center_of_nonsense], metric='cosine', n_jobs=1)
plt.figure(figsize=(20,20))
plt.imshow(nonsense_distance_matrix.T)

In [None]:
# len(X)

In [None]:
# Row indices of the patterns to drop (`_excluded`) and of the patterns kept
# as the representative of a duplicate pair (`_kept`); both are filled by
# this cell and the duplicate search that follows.
_excluded = set()
_kept = set()

# Threshold = blend of the mean (weight _b) and the minimum (weight 1 - _b)
# distance to the nonsense centre.
_b = 0.7
_nd_min = nonsense_distance_matrix.min()
_nd_mean = nonsense_distance_matrix.mean()
nonsense_thr = _nd_mean * _b + _nd_min * (1-_b)
print('nonsense_thr', nonsense_thr)

display(Markdown(f'### Бессмысленные паттерны или почти:: \
        distance < {nonsense_thr:.2} (min={_nd_min:.2}; mean={_nd_mean:.2})'))
for i, d in enumerate(nonsense_distance_matrix[:, 0]):
    if d < nonsense_thr:
        # too close to the nonsense centre: exclude this pattern
        print('---')
        _excluded.add(i)
        print(i, d, '\t',strings[i])

In [None]:
np.percentile(distance_matrix, 0.5)

In [None]:
# Pairs closer than the 0.5th percentile of all pairwise distances are treated as duplicates.
# NOTE(review): the percentile is taken over the full matrix, i.e. including the diagonal
# and both symmetric halves — confirm the threshold does not collapse towards zero on small sets.
simlarity_threshold = np.percentile(distance_matrix, 0.5)
display(Markdown(f'### Одинаковые: расстояние < {simlarity_threshold:.2}'))
# Walk the upper triangle only, so each pair is visited once.
for i in range(distance_matrix.shape[0]):
    for j in range(i+1, distance_matrix.shape[0]):
        
        d = distance_matrix[i,j]
        
        if d < simlarity_threshold:
            print(i,'vs', j, ', cosine distance =', d)
            # Of a duplicate pair the lower index is kept and the higher one is excluded.
            _kept.add(i)
            _excluded.add(j)
            

            print(i, strings[i])
            print('='*10)
            print(j, strings[j])
            print('-'*100)
       
# _excluded
# NOTE(review): `_excluded` also holds the indices excluded as nonsense in the previous cell,
# and it counts patterns rather than pairs — the heading text may overstate the number of duplicate pairs.
display(Markdown(f'#### {len(_excluded)} одинаковых пар'))
print(_kept - _excluded)
print(_excluded)

In [None]:
# for i in range(distance_matrix.shape[0]):
#     if len(strings[i]) < 15:
#         print(strings[i])
#     if len(strings[i]) > 400 :
#         print('-'*100)
#         print(strings[i])
        
#         _excluded.add(i)

In [None]:
import numpy as np
# `_excluded` holds row indices into the *unfiltered* patterns. This cell
# overwrites `strings` / `embeddings` with their filtered versions, so running
# it a second time would apply the same indices to already-filtered data and
# silently remove the wrong rows. `nonsense_distance_matrix` still has one row
# per unfiltered pattern, which makes a stale re-run detectable.
if len(strings) != nonsense_distance_matrix.shape[0]:
    raise RuntimeError(
        'strings/embeddings have already been filtered; '
        're-run the notebook from the embedding cells before filtering again'
    )

_excluded_idx = sorted(_excluded)
embeddings_filtered = np.delete(embeddings, _excluded_idx, axis=0)
strings_filtered = np.delete(strings, _excluded_idx, axis=0)
print(embeddings_filtered.shape)

display(Markdown(f'### {embeddings_filtered.shape[0]} -- Количество паттернов после удаления одинаковых'))

# From here on `strings` / `embeddings` refer to the filtered patterns.
strings = strings_filtered
embeddings = embeddings_filtered

mlflow.log_param('n_samples', embeddings_filtered.shape[0])

In [None]:
# import pandas as pd
 
# for
# df_describe = pd.DataFrame(distance_matrix)
# print("distance_matrix mean=", distance_matrix.mean())
# print("distance_matrix std=", distance_matrix.std())
# df_describe.describe()


def estimate_distance_threshold(patterns_embeddings):
    """Estimate a distance threshold from the spread of pairwise pattern distances.

    Computes the cosine distance between every pair of *distinct* patterns
    (each pair counted once, the zero diagonal left out), prints the mean,
    max and standard deviation of those distances and returns
    ``mean - std``.

    :param patterns_embeddings: 2-D array-like, one embedding per row;
        at least two rows are required.
    :return: ``mean - std`` of the pairwise cosine distances.
    :raises ValueError: if fewer than two patterns are given (there is no
        pair to measure; previously this surfaced as an obscure error from
        reducing an empty array).
    """
    distance_matrix = pairwise_distances(patterns_embeddings, patterns_embeddings, metric='cosine', n_jobs=1)

    n_patterns = distance_matrix.shape[0]
    if n_patterns < 2:
        raise ValueError(
            f'at least 2 patterns are needed to estimate a distance threshold, got {n_patterns}'
        )

    # Strictly-lower triangle: every unordered pair once, no diagonal zeros.
    rows, cols = np.tril_indices(n_patterns, k=-1)
    distance_matrix_meaningful = distance_matrix[rows, cols]

    print("distance_matrix mean=", distance_matrix_meaningful.mean())
    print("distance_matrix max=", distance_matrix_meaningful.max())
    print("distance_matrix std=", distance_matrix_meaningful.std())

    # One standard deviation below the mean pairwise distance.
    threshold = distance_matrix_meaningful.mean() - distance_matrix_meaningful.std()
    return threshold
    
estimate_distance_threshold(embeddings_filtered)

## Clustering, t-SNE  3D

In [None]:

# Recompute the pairwise cosine distances, this time on the filtered patterns only.
# Note that this rebinds `distance_matrix`, which earlier held the unfiltered matrix.
distance_matrix = pairwise_distances(embeddings_filtered, embeddings_filtered, metric='cosine', n_jobs=1)

            
plt.figure(figsize=(7,7))
plt.imshow(distance_matrix)

In [None]:
# print(insides.value.unique())????
# Initial guess: one cluster per ten patterns. The DBSCAN calls below do not take
# a cluster count, and `n_clusters` is re-assigned to `len(centroids)` once the centroids are collected.
n_clusters = embeddings_filtered.shape[0] // 10 #len(insides.value.unique())
print('n_clusters', n_clusters)

## Clustering, t-SNE  2D

In [None]:
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN
# Project the precomputed cosine-distance matrix to 2-D.
#  * random_state: t-SNE is stochastic and the clusters (and therefore the
#    saved patterns) are derived from this projection, so it is seeded with
#    the same seed the UMAP projection below uses.
#  * init='random': PCA initialisation cannot be used with a precomputed
#    metric, and recent scikit-learn releases default to PCA.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=6000, metric="precomputed",
            init='random', random_state=42)
tsne_results = tsne.fit_transform(distance_matrix)

In [None]:
# from sklearn.cluster import SpectralClustering

# kmeans = KMeans(n_clusters = n_clusters, random_state=0).fit(np.array(tsne_results))
# Despite its name, `kmeans` holds a DBSCAN model fitted on the 2-D t-SNE coordinates.
kmeans = DBSCAN(eps=1.1, min_samples=3 ).fit(tsne_results)
# kmeans = SpectralClustering(n_clusters=n_clusters, assign_labels='discretize', random_state=0).fit(tsne_results)

# Plain dict used as the plotting table: the two t-SNE coordinates plus the cluster label of each pattern.
df_subset={}
df_subset['tsne-2d-one'] = tsne_results[:,0]
df_subset['tsne-2d-two'] = tsne_results[:,1]
df_subset['label'] = kmeans.labels_


fig=plt.figure(figsize=(12,8) )
# ax = fig.add_subplot(111, projection = '3d')


# ax.scatter(df_subset['tsne-2d-one'], df_subset['tsne-2d-two'], df_subset['tsne-2d-3'])

# plt.show()

# 2-D scatter of the patterns, coloured by cluster label.
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="label", palette="tab10",
    data=df_subset,
    alpha=0.8
)



In [None]:
kmeans.__dict__

In [None]:
# set(kmeans.labels_)

In [None]:
# !conda list

In [None]:
import umap
# Seeded 3-D UMAP projection of the filtered embeddings (cosine metric).
umap_embeddings = (umap.UMAP(n_neighbors=6, 
                                n_components=3, 
                                metric='cosine', 
                                random_state=42)
                            .fit_transform(embeddings_filtered))


fig=plt.figure(figsize=(10,10) )
ax = fig.add_subplot(111, projection = '3d')

# Points are coloured with the labels currently stored in `df_subset` (set by the 2-D clustering cell).
ax.scatter(umap_embeddings[:,0], umap_embeddings[:,1], umap_embeddings[:,2], s=40, c=df_subset['label'] )
 
plt.show()

In [None]:
from matplotlib.colors import ListedColormap

cmap = ListedColormap(sns.color_palette("husl", 256).as_hex())


# from sklearn.cluster import Birch

from sklearn.manifold import TSNE

# The 3-D coordinates used for plotting are the UMAP projection computed in
# the previous cell. A 3-D t-SNE fit (5000 iterations) used to run here, but
# its result was overwritten by this assignment before ever being read, so
# the fit was pure wasted time and has been dropped.
tsne_results3 = umap_embeddings #XXXXXX

# Final clustering; its labels are used to build the pattern centroids.
# NOTE(review): DBSCAN is fitted on the 2-D t-SNE coordinates (`tsne_results`),
# not on the 3-D points plotted below — confirm this is intended.
# clusters3d = KMeans(n_clusters = n_clusters, random_state=0).fit(np.array(tsne_results))
clusters3d = DBSCAN(eps=1.2, min_samples=3).fit(tsne_results)

print('clusters3d.labels_',set(clusters3d.labels_))
# Plotting table: 3-D coordinates plus the cluster label of each pattern.
df_subset={}
df_subset['tsne-2d-one'] = tsne_results3[:,0]
df_subset['tsne-2d-two'] = tsne_results3[:,1]
df_subset['tsne-2d-3'] = tsne_results3[:,2]
df_subset['label'] = clusters3d.labels_


fig = plt.figure(figsize=(10,10) )
ax = fig.add_subplot(111, projection = '3d')

ax.scatter(df_subset['tsne-2d-one'], df_subset['tsne-2d-two'], df_subset['tsne-2d-3'], s=40, c=df_subset['label'] )

plt.show()

# Keep the figure with the MLflow run.
mlflow.log_figure(fig, 'clusters_of_samples_(TSNE_DBSCAN).png')

In [None]:
insides.to_csv('insides.csv', index=True)

In [None]:
# print(len(pca_result))

In [None]:
from colab_support.renderer import HtmlRenderer
import matplotlib as matplotlib
from IPython.core.display import display, HTML

class DemoRenderer(HtmlRenderer):
  """Notebook flavour of `HtmlRenderer` that shows coloured text inline in the cell output."""

  def render_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Build the HTML with `to_color_text` and display it in the notebook."""
    html = self.to_color_text(tokens, weights, colormap, print_debug, _range, separator=separator)
    display(HTML(html))

  def to_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Delegate to `HtmlRenderer._to_color_text`, passing the `matplotlib` module along.

    `print_debug` is accepted but not forwarded to the parent implementation.
    """
    return super()._to_color_text(tokens, weights, matplotlib, colormap=colormap, _range=_range, separator=separator)

renderer_ = DemoRenderer()

# if SELF_TEST:
#     renderer_.render_color_text(["слово 1", "слово 2"], np.array( [1, 0]), _range=(0,1))

## Finding cluster centers (in embedding space)

In [None]:
# Character length of every remaining pattern.
lens = [ len (s) for s in strings]
print(lens)

# 1.75 x the mean pattern length, truncated to an integer.
# NOTE(review): `__mean_len` is not read anywhere else in the visible notebook — confirm it is still needed.
__mean_len = int(np.mean(lens) * 1.75)
print(__mean_len)

In [None]:
# kmeans.__dict__

In [None]:
# centroid = np.mean(embeddings_filtered, axis=0)
# del centroid
centroids = []
# print(centroid)


# From here on `kmeans` refers to the DBSCAN model stored in `clusters3d`,
# replacing the model fitted in the 2-D clustering cell.
kmeans = clusters3d
# for g in set(kmeans.labels_):
#     group=[]
#     for i in range(len(embeddings_filtered)):
#         if g == kmeans.labels_[i]:
#             if g==-1:
#                 print(k, strings[i],'\n')
#             group.append(embeddings_filtered[i])
# #             print(k, strings[i])
# #             print('-'*100)
#     print(len(group), g)
#     print('='*100)
#     c = np.mean(group, axis=0)
#     centroids.append(c)
#     print( c )

In [None]:
centroids = []
# Take the cluster ids from the fitted model itself. DBSCAN chooses the number
# of clusters on its own and labels noise points -1, so iterating over
# `range(n_clusters)` (a guess made before clustering) could silently drop
# real clusters or visit ids that do not exist. Noise (-1) is left out, as before.
cluster_ids = sorted(set(kmeans.labels_) - {-1})
for k in cluster_ids:
    member_idx = [i for i in range(len(embeddings_filtered)) if kmeans.labels_[i] == k]
    for i in member_idx:
        print(f'label={k}({i}) \t [{strings[i]}]')
        print('-'*20)

    print(f'\n {k} len={len(member_idx)}' )
    print('='*100)
    print(f'\n\n ' )

    # The pattern of a cluster is the mean embedding of its members.
    group = [embeddings_filtered[i] for i in member_idx]
    centroids.append(np.mean(group, axis=0))


In [None]:
# centroids
# Replace the initial guess with the number of centroids actually collected.
n_clusters = len(centroids)
n_clusters
mlflow.log_param('n_clusters', n_clusters)

# Save patterns  (embeddings binary array)

In [None]:
# # define a custom model
# class MyModel(mlflow.pyfunc.PythonModel):
#     def predict(self, context, model_input):
#         return self.my_custom_function(model_input)

#     def my_custom_function(self, model_input):
#         # do something with the model input
#         return 0
    
    


# Persist the cluster centroids as a NumPy array next to the other models...
_fn = Path(models_path) / "insides_patterns.npy"
np.save(_fn, centroids)



# model_info = mlflow.pyfunc.log_model(artifact_path="model", python_model=centroids)

# ...and attach the same file to the MLflow run.
mlflow.log_artifact(_fn)
# mlflow.register_model()



# Analysing sample doc

In [None]:
# In self-test mode the centroids are read back from disk, so the saved file itself is what gets exercised.
if SELF_TEST:    
    centroids = np.load(Path(models_path) / "insides_patterns.npy")
    print(centroids.shape)
    
# mean - std of the pairwise cosine distances between the centroids.
distance_threshold = estimate_distance_threshold(centroids)
print('distance_threshold', distance_threshold)

In [None]:
%%time
from bson import ObjectId
# Fetch one document (hard-coded id), split it into sentences and embed each sentence.
# NOTE(review): the following cells read `sample_doc` / `doc_embeddings` unconditionally,
# so they presumably fail with a NameError when SELF_TEST is False — confirm.
if SELF_TEST:
    from analyser.legal_docs import tokenize_doc_into_sentences_map

    sample_id     = ObjectId('617146dd8fd5658689eb2bb4')  #res_inside[0]["_id"] #ObjectId('6166989787f1d9065bb8b914') #
    sample_db_doc = get_doc_by_id(sample_id)
    print('doc has been fetched from mongo')
    sample_j_doc  = DbJsonDoc(sample_db_doc)
    sample_doc    = sample_j_doc.asLegalDoc()
    
    print(sample_id)



    # Sentence map built from the full document text with the configured sentence length.
    sample_doc.sentence_map = tokenize_doc_into_sentences_map(sample_doc.tokens_map.get_full_text(), 
                                                              HyperParameters.mean_sentense_pattern_len)
    #TODO: might be employ 3rd party lib
    print(sample_doc)

    # One embedding per entry of the sentence map.
    doc_embeddings = embedd_strings(sample_doc.sentence_map.tokens)

In [None]:
len(sample_doc)

In [None]:
mean_len

In [None]:
plt.figure(figsize=(15,15))
plt.imshow(doc_embeddings)

In [None]:
# Cosine *similarity* (1 - cosine distance) between every centroid (rows)
# and every sentence of the sample document (columns). The variable keeps
# the name `distance_matrix` because the following cells read it under that name.
distance_matrix = 1.0 - pairwise_distances(centroids, doc_embeddings, metric='cosine', n_jobs=1)

plt.figure(figsize=(30, 4))
plt.imshow(distance_matrix)

# Number of sentences, then the smallest similarity, the matrix shape and
# the flat index of the best match.
print(len(distance_matrix[0]))

for value in (distance_matrix.min(), distance_matrix.shape, distance_matrix.argmax()):
    print(value)

In [None]:
def relu(x: np.ndarray, relu_th: float = 0.0) -> np.ndarray:
  """Keep the elements of `x` that are strictly greater than `relu_th`; zero the rest.

  :param x: input array.
  :param relu_th: cut-off value; elements equal to or below it become zero.
  :return: array of the same shape as `x`.
  """
  above_threshold = x > relu_th
  return x * above_threshold


In [None]:
eps = 0.01
# del threshold # = 0.7 #0.9 *  distance_matrix.max()
# NOTE(review): `distance_threshold` was estimated from cosine *distances* between centroids,
# while the values compared against it below are similarities (1 - distance) — confirm this is intended.
print('distance_threshold', distance_threshold)
print()
sim_max = 0
# i_max = 0

plt.figure(figsize=(30,6))
# Horizontal reference line at the threshold, one point per sentence.
plt.plot([distance_threshold]*len(sample_doc.sentence_map), alpha=0.4 )
i_max = 0
# One row of the matrix per centroid: find the best-matching sentence for each.
for k in range(n_clusters):    
    print('-'*20)
    av = distance_matrix[k] 
    
    ii = av.argmax()
    sim = av[ii]
    if (sim > distance_threshold):
        # Plot the whole similarity profile of this centroid and report the matching sentence
        # together with its span remapped through `sample_doc.tokens_map`.
        plt.plot(av)
        print( f"{k}=cluster \t {av[ii]}=similarity, \n {sample_doc.sentence_map.tokens[ii]} ")
        _span = sample_doc.sentence_map.remap_span((ii, ii + 1), sample_doc.tokens_map)
        print(_span)
    
    # Track the centroid with the highest best-match similarity.
    if sim > sim_max:
        i_max = k
        sim_max = sim
        
print(sim_max, i_max)

In [None]:
relu_threshold =  0.85 *  distance_matrix.max()
renderer_.render_color_text(sample_doc.sentence_map.tokens, relu(distance_matrix[i_max], relu_threshold), _range=(0,1), separator='<br>')

## Test no-insides DOC

In [None]:
%%time
from bson import ObjectId

# doc =

# Disabled (`and False`): same preparation as for the sample document above, for another hard-coded document id.
if SELF_TEST and False:
    sample_db_doc    =  get_doc_by_id(ObjectId('60dec1f556214d9842813fcb'))    
    sample_j_doc  = DbJsonDoc(sample_db_doc)
    sample_doc    = sample_j_doc.asLegalDoc()
    
    
    print(sample_doc)
    sample_doc.sentence_map = tokenize_doc_into_sentences_map(sample_doc.tokens_map.get_full_text(), mean_len)

#     print(sample_doc)

    doc_embeddings = embedd_strings(sample_doc.sentence_map.tokens)

In [None]:
# Disabled (`and False`): similarity (1 - cosine distance) of every sentence to every centroid,
# transposed so that rows correspond to centroids.
if SELF_TEST and False:
    X = doc_embeddings
    distance_matrix = pairwise_distances(X, centroids, metric='cosine', n_jobs=1)
    # distance_matrix = relu ( ((distance_matrix * -1)+1) , _mx-0.01)

    distance_matrix = (distance_matrix * -1)+1.0
    distance_matrix = distance_matrix.T
    plt.figure(figsize=(30,4))
    plt.imshow( distance_matrix )
    # plt.plot(np.array(distance_matrix.T[0]))
    print(len(distance_matrix[0]))

In [None]:
# Disabled (`and False`): same per-centroid best-match search as for the sample document above,
# but with a fixed similarity threshold of 0.85.
if SELF_TEST and False:
    eps = 0.01
    threshold = 0.85 #0.9 *  distance_matrix.max()
    print('threshold', threshold)
    print()
    sim_max=0
    i_max=0
    plt.figure(figsize=(30,6))
    plt.plot([threshold]*len(sample_doc.sentence_map), alpha=0.4 )
    for k in range(n_clusters):    
        print('-'*20)
        v = distance_matrix[k] 
        av = v #relu(v, threshold) ## attention vector

        ii = av.argmax()
        sim = av[ii]
        if (sim > threshold):
            plt.plot(av)
            print( f"{k}=cluster \t {av[ii]}=similarity, \n {sample_doc.sentence_map.tokens[ii]} ")

        # Track the centroid with the highest best-match similarity.
        if sim>sim_max:
            i_max = k
            sim_max = sim
    print(sim_max, i_max)

In [None]:
# Disabled (`and False`): render the sentences coloured by their similarity to the best-matching centroid.
# `relu_threshold` is computed here but not passed to the renderer.
if SELF_TEST and False:
    relu_threshold =  0.99 *  distance_matrix.max()
    renderer_.render_color_text(sample_doc.sentence_map.tokens, distance_matrix[i_max])

# End mlflow run

In [None]:
# Summarise the run and print a link to it before closing.
print(active_mlflow_run.info)

print('see results at')
# NOTE(review): the link is built from the registry URI rather than from `ml_flow_url`;
# presumably both point to the same server — confirm.
print(f'{mlflow.get_registry_uri()}/#/experiments/{active_mlflow_run.info.experiment_id}/runs/{active_mlflow_run.info.run_id}')


# Close the run opened in the MLflow initialisation cell.
mlflow.end_run()