In [None]:
import logging
import sys
import os


# Notebook-wide logger. The logger itself accepts DEBUG records; the console
# handler attached below only emits INFO and above.
logger = logging.getLogger('eva;_ipynb')
logger.setLevel(logging.DEBUG)

# logging.getLogger() returns the same object for the same name, so re-running
# this cell would otherwise attach one more StreamHandler per run and print
# every message several times. Attach the console handler only once.
if not logger.handlers:
  ch = logging.StreamHandler()
  ch.setLevel(logging.INFO)
  formatter = logging.Formatter('%(levelname)s - %(asctime)s - %(name)s - %(message)s')
  ch.setFormatter(formatter)
  logger.addHandler(ch)

# DEBUG is below the console handler's INFO threshold, so that handler drops this record.
logger.debug('--=logging started=--')

In [None]:
# True when the `google.colab` module has already been loaded into this interpreter.
IN_COLAB = 'google.colab' in sys.modules
print('Running in colab:', IN_COLAB)

In [None]:
# Outside Colab, make the parent of the current working directory importable
# (presumably that is where the project packages live — confirm).
if not IN_COLAB:
  nb_dir = os.path.dirname(os.getcwd())
  already_on_path = nb_dir in sys.path
  if not already_on_path:
    sys.path.append(nb_dir)
 

In [None]:
import analyser.hyperparams
analyser.hyperparams.work_dir


# Imports

In [None]:
%matplotlib inline

 
import numpy as np
import pandas as pd

 
from trainsets.retrain_contract_uber_model import UberModelTrainsetManager
from tf_support.super_contract_model import semantic_map_keys_contract



In [None]:
from colab_support.renderer import HtmlRenderer
import matplotlib
# `IPython.core.display` is a deprecated import location for these helpers;
# `IPython.display` is the public one.
from IPython.display import display, HTML


class DemoRenderer(HtmlRenderer):
  """HtmlRenderer that shows colour-coded token text inline in the notebook."""

  def render_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Build the coloured HTML for `tokens`/`weights` and display it in the cell output."""
    html = self.to_color_text(tokens, weights, colormap, print_debug, _range, separator=separator)
    display(HTML(html))

  def to_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Return the HTML string produced by the base class `_to_color_text`.

    `print_debug` is accepted for signature compatibility but is not forwarded
    to the base implementation.
    """
    return super()._to_color_text(tokens, weights, matplotlib, colormap=colormap, _range=_range, separator=separator)


renderer_ = DemoRenderer()


# renderer_.render_color_text(["слово 1", "слово 2"], np.array( [1, 0]), _range=(0,1))

# Prepare model


In [None]:
from pathlib import Path
work_dir = Path(analyser.hyperparams.work_dir)
print(work_dir)

In [None]:

# Load the trainset metadata. `stats` is whatever `umtm.stats` returns (no copy
# is taken here), and both weight columns are set to -1.0 for every row.
umtm = UberModelTrainsetManager(work_dir)
umtm.load_contract_trainset_meta()
stats = umtm.stats
for _weight_col in ('sample_weight', 'subject_weight'):
  stats[_weight_col] = -1.0
stats

# Look into the trainset (take a sample)

In [None]:
from tf_support.super_contract_model import  validate_datapoint

# Validate every row of the trainset metadata. A row that fails validation is
# logged, flagged as invalid, and keeps the error text for later inspection.
stats['valid'] = True
stats['error'] = ''

for doc_id in stats.index:
  try:
    validate_datapoint(str(doc_id), stats)
  except Exception as err:
    logger.error(err)
    stats.at[doc_id, 'valid'] = False
    stats.at[doc_id, 'error'] = str(err)

stats

In [None]:
# Keep only the rows that passed validation.
_is_valid = stats['valid']
stats_valid = stats[_is_valid]
len(stats_valid)

In [None]:
import json
import re
from bson import json_util

fn = work_dir / 'documents.json'
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which differs between operating systems.
with open(fn, encoding='utf-8') as file:
    # json_util.object_hook is applied to every decoded JSON object.
    file_data = json.load(file, object_hook=json_util.object_hook)

print(f'total docs in {fn} is {len(file_data)}')
    

In [None]:
from analyser.persistence import DbJsonDoc
# Wrap every document that passes validation; failures are logged and skipped.
docs = {}
for fd in file_data:
    try:
      doc_key = fd['_id']
      validate_datapoint(str(doc_key), stats)
      docs[doc_key] = DbJsonDoc(fd)
      print(doc_key)
    except Exception as err:
      logger.error(err)

In [None]:
# Number of documents that were kept in `docs` by the previous cell.
print(f'total docs in {fn} is {len(docs)}')

## Get sample doc

In [None]:
_DEBUG = True

# Position (in insertion order of `docs`) of the document used as the running
# example in the cells below.
SAMPLE_DOC_POSITION = 9

_loaded_docs = list(docs.values())
if not _loaded_docs:
    raise ValueError('no valid documents were loaded; cannot pick a sample document')

# The sample is always defined (it used to exist only when _DEBUG was true), and
# the position is clamped so a small trainset does not raise IndexError.
a_doc_from_json = _loaded_docs[min(SAMPLE_DOC_POSITION, len(_loaded_docs) - 1)]

print(a_doc_from_json.get_tokens_map_unchaged().text[:2300])

In [None]:
a_doc_from_json.get_attributes_tree()

In [None]:
%matplotlib inline
from colab_support.renderer import plot_embedding, plot_cm
from tf_support.super_contract_model import make_xyw

SAMPLE_DOC_ID = str(a_doc_from_json.get_id())
print('SAMPLE_DOC_ID', SAMPLE_DOC_ID)

# make_xyw returns three pairs: model inputs, targets and weights.
_inputs, _targets, _weights = make_xyw(SAMPLE_DOC_ID, stats)
emb, tok_f = _inputs
sm, subj = _targets
sample_weight, subject_weight = _weights

print('semantic map shape is:', sm.shape)

# Only the first `_crop` rows of each matrix are plotted.
_crop = 700
for _matrix, _caption in ((tok_f, 'Tokens features'), (emb, 'Embedding'), (sm, 'Semantic map')):
    plot_embedding(_matrix[:_crop], title=f'{_caption} {SAMPLE_DOC_ID}')


# Models 🦖

## uber_detection_model_005


In [None]:
from tf_support.super_contract_model import uber_detection_model_005_1_1
from tf_support.super_contract_model import uber_detection_model_003

model_factory_fn = uber_detection_model_005_1_1

In [None]:


from tf_support.tools import KerasTrainingContext
from sklearn.model_selection import train_test_split

BATCH_SIZE = 2
# Fixed seed so that Restart & Run All reproduces the same train/test split.
SPLIT_SEED = 42

# 80/20 split, stratified by the `subject` column.
_train, _test = train_test_split(stats_valid, test_size=0.2, stratify=stats_valid[['subject']],
                                 random_state=SPLIT_SEED)

train_indices = list(_train.index)
test_indices = list(_test.index)

ctx = KerasTrainingContext(umtm.work_dir, session_index=21)
ctx.EVALUATE_ONLY = True
ctx.set_batch_size_and_trainset_size(BATCH_SIZE,
                                     len(test_indices),
                                     4 * len(train_indices))

# The checkpoint file is named after the model factory function.
weights = ctx.model_checkpoint_path / f'{model_factory_fn.__name__}.h5'
if weights.is_file():
    print(weights)
else:
    # Make a missing checkpoint visible instead of continuing silently.
    logger.warning(f'weights file not found: {weights}')

umodel = ctx.init_model(model_factory_fn, trained=True, trainable=True, weights=weights)
# The model is marked non-trainable right after loading.
umodel.trainable = False
umodel.summary()

# Evaluate models

In [None]:
# sample_index = umtm.stats [umtm.stats['value']>0].index[2]
print(SAMPLE_DOC_ID)

# Re-use the arrays computed for the sample document in an earlier cell.
x, y = (emb, tok_f), (sm, subj)
_ = (sample_weight, subject_weight)

# Add a leading axis of size 1 to each of the two inputs before predicting.
_batch = [np.expand_dims(_part, axis=0) for _part in x]
prediction = umodel.predict(x=_batch, batch_size=1)

# Predicted and expected semantic maps share the same column labels.
tagsmap = pd.DataFrame(prediction[0][0], columns=semantic_map_keys_contract)
tagsmap_e = pd.DataFrame(sm, columns=semantic_map_keys_contract)

_predicted_crop = tagsmap[:_crop]
_expected_crop = tagsmap_e[:_crop]
plot_embedding(_predicted_crop, f'Predicted Semantic Map {tagsmap.shape}')
plot_embedding(_predicted_crop - _expected_crop, title=f'DELTA Semantic map {tagsmap_e.shape}')
plot_embedding(_expected_crop, title=f'EXPECTED Semantic map {tagsmap_e.shape}')

In [None]:
# tagsmap.sum(axis=1)

In [None]:
# Row-wise sum over all predicted tag columns, used as the highlight weight of
# each token; only the first 1600 tokens are rendered.
_RENDER_LIMIT = 1600
av = tagsmap.sum(axis=1)
_sample_tokens = a_doc_from_json.get_tokens_map_unchaged().tokens
renderer_.render_color_text(_sample_tokens[:_RENDER_LIMIT], av[:_RENDER_LIMIT])

In [None]:
from analyser.documents import TextMap
from analyser.ml_tools import SemanticTag

 

## Getting tag values from inferred semantic map

In [None]:
from analyser.contract_parser import nn_find_org_names, nn_get_subject, nn_get_contract_number, nn_get_contract_date, nn_get_tag_values
from analyser.parsing import AuditContext

ac = AuditContext()

#### Orgs

In [None]:
cas = nn_find_org_names(a_doc_from_json.get_tokens_map_unchaged(), tagsmap, ac)

# Print at most the first two agents, separated by a blank line. Slicing avoids
# the IndexError that direct cas[0] / cas[1] access raises when fewer than two
# agents were found.
for _position, _agent in enumerate(cas[:2]):
  if _position:
    print()
  print(_agent.name)
  print(_agent.type)
  print(_agent.alias)

In [None]:
# Up to two 'org-type' tag values, each printed under a separator line.
tag = nn_get_tag_values(
  'org-type',
  a_doc_from_json.get_tokens_map_unchaged(),
  tagsmap,
  max_tokens=12, threshold=0.5, limit=2,
)
for t in tag:
  print("-" * 100, t, sep='\n')

In [None]:
# Up to two 'org-name' tag values, each printed under a separator line.
tag = nn_get_tag_values(
  'org-name',
  a_doc_from_json.get_tokens_map_unchaged(),
  tagsmap,
  max_tokens=12, threshold=0.5, limit=2,
)
for t in tag:
  print("-" * 100, t, sep='\n')

In [None]:
# Up to two 'org-alias' tag values (shorter span, stricter threshold), each
# printed under a separator line.
tag = nn_get_tag_values(
  'org-alias',
  a_doc_from_json.get_tokens_map_unchaged(),
  tagsmap,
  max_tokens=4, threshold=0.9, limit=2,
)
for t in tag:
  print("-" * 100, t, sep='\n')

#### Date/number

In [None]:
# Single 'date' tag value from the predicted semantic map.
tag = nn_get_tag_values(
  'date',
  a_doc_from_json.get_tokens_map_unchaged(),
  tagsmap,
  max_tokens=6, threshold=0.3, limit=1, return_single=True,
)
print(tag)

In [None]:
# Single 'number' tag value from the predicted semantic map.
tag = nn_get_tag_values(
  'number',
  a_doc_from_json.get_tokens_map_unchaged(),
  tagsmap,
  max_tokens=5, threshold=0.3, limit=1, return_single=True,
)
print(tag)

In [None]:
# Each variable now holds what its name says: previously `date_tag` received the
# contract number and `number_tag` the contract date.
number_tag = nn_get_contract_number(a_doc_from_json.get_tokens_map_unchaged(), tagsmap)
date_tag = nn_get_contract_date(a_doc_from_json.get_tokens_map_unchaged(), tagsmap)
print(date_tag)
print(number_tag)

#### Amount

In [None]:
textmap = a_doc_from_json.get_tokens_map_unchaged()

In [None]:
from pandas import DataFrame
from analyser.schemas import ContractPrice, merge_spans
from analyser.legal_docs import find_value_sign
from analyser.transaction_values import ValueSpansFinder
from analyser.text_tools import to_float


#---
# `nn_find_contract_value` is not imported by any other cell, so this cell
# failed with NameError on a fresh kernel.
# NOTE(review): module path assumed from the sibling nn_* helpers — confirm.
from analyser.contract_parser import nn_find_contract_value

cps = nn_find_contract_value(textmap, tagsmap)
if cps:
  _price = cps[0]
  print(str(_price.get_span()))
  for k in _price.list_children():
    print(str(k))

  print()
  print()

  # Print each component of the first detected contract value.
  for _label, _part in (('brutto', _price.amount_brutto),
                        ('netto', _price.amount_netto),
                        ('amount', _price.amount),
                        ('vat', _price.vat)):
    print(_label, str(_part))
else:
  print('nothing')

In [None]:
# tags = []

# # tags.append()
# tags.append(nn_get_tag_value('sign', textmap,          tagsmap, max_tokens=10, threshold=0.4, limit=1))
# tags.append(nn_get_tag_value('currency',textmap,       tagsmap, max_tokens=4, threshold=0.4, limit=1))
# tags.append(nn_get_tag_value('amount_brutto', textmap, tagsmap, max_tokens=4, threshold=0.4, limit=1))
# tags.append(nn_get_tag_value('amount_netto', textmap,  tagsmap, max_tokens=4, threshold=0.4, limit=1))
# tags.append(nn_get_tag_value('value', textmap,         tagsmap, max_tokens=40, threshold=0.02, limit=1))
# for tag in tags:
#   print("-"*100)
#   for t in tag:
#     print(t)

#### Subject

In [None]:
import matplotlib.pyplot as plt
tag_name = 'subject-end'
attention =  tagsmap[tag_name].values


plt.figure(figsize=(20,5))
plt.plot(attention)
# plot_embedding(att[:400], title=f'{tag_name}')

In [None]:
# Single 'subject' tag value: long span allowed (200 tokens), low threshold.
subject_tag = nn_get_tag_values(
  'subject',
  a_doc_from_json.get_tokens_map_unchaged(),
  tagsmap,
  max_tokens=200, threshold=0.02, limit=1, return_single=True,
)

print(subject_tag)

### Draw tags

In [None]:
tags_hl = np.zeros(len(textmap))

def hl(tag):
  """Add 1 to `tags_hl` over the token range `tag.span[0]:tag.span[1]`.

  Best-effort by design: a missing tag (None) is skipped silently, and any
  other failure (for example an object without a usable `span`) is logged
  rather than raised, so a partially parsed document can still be rendered.
  """
  if tag is None:
    return
  try:
    start, stop = tag.span[0], tag.span[1]
    tags_hl[start:stop] += 1
  except Exception as err:
    # Unlike a bare `except:`, this does not intercept KeyboardInterrupt or
    # SystemExit, and the reason for the skip is reported.
    logger.warning(f'cannot highlight {tag!r}: {err}')
  

# Highlight every part of the first detected contract value, then the value itself.
if cps:
  _price = cps[0]
  for _part in (_price.amount_brutto, _price.amount_netto, _price.amount,
                _price.vat, _price.sign, _price.currency, _price):
    hl(_part)

# Highlight at most two agents; slicing avoids the IndexError that cas[1]
# raises when fewer than two agents were found.
for _agent in cas[:2]:
  hl(_agent.name)
  hl(_agent.type)
  hl(_agent.alias)

hl(number_tag)
hl(date_tag)

hl(subject_tag)

renderer_.render_color_text(a_doc_from_json.get_tokens_map_unchaged().tokens[:1600], tags_hl[:1600])

In [None]:

# agent_tags = ['org-1-name',
#               'org-1-type',
#               'org-1-alias',
#               'org-2-name',
#               'org-2-type',
#               'org-2-alias']
# solo_tags = [
#   'date',
#   'number',
#   'sign_value_currency/value',
#   'sign_value_currency/currency',
#   'sign_value_currency/sign'
# ]

# # seq_labels_contract[-3:]

# tagnames = solo_tags + agent_tags


# from pandas import DataFrame

# from analyser.contract_agents import ContractAgent, normalize_contract_agent

# from analyser.persistence import DbJsonDoc
# from analyser.text_tools import find_top_spans
# # from tf_support.super_contract_model import seq_labels_contract
# from tf_support.tf_subject_model import decode_subj_prediction

# from analyser.contract_parser import nn_find_org_names, nn_get_tag_value




# def fetch_tags_from_predicted_semantic_map(_id: str, tagsmap: DataFrame):
#   jdoc = get_doc(_id)
#   _map = jdoc.get_tokens_map_unchaged()

#   results = {}
#   for key in tagnames:
#     t = nn_get_tag_value(key, _map, tagsmap )
#     results[key] = t
#     # print(t)

# #   ca = ContractAgent()
# #   ca.name =  results['org-1-name'] #TODO: check for NONE
# #   ca.type =  results['org-1-type']
# #   ca.alias = results['org-1-alias']
   

# #   ca2 = ContractAgent()
# #   ca2.name =  results['org-2-name'] #TODO: check for NONE
# #   ca2.type =  results['org-2-type']
# #   ca2.alias = results['org-2-alias']
# #   try:
# #     normalize_contract_agent(ca)
# #     normalize_contract_agent(ca2)
# #   except Exception as e:
# #         # TODO:
# #     logger.error(f'{_id} {e}')

#   if results['number'] is not None:
#     results['number'].value = results['number'].value.strip().lstrip('№').lstrip('N ').lstrip()

#   return results, jdoc


# def put_results_into_df(id_, results, df, jdoc: DbJsonDoc):
#   org_atribs = ['name', 'alias', 'type']

#   def v(x):
#     if results[x] is not None:
#       return results[x].value

#   def swap(a, b):
#     ab = [a, b]
#     try:
#       ab = sorted(ab)
#     except:
#       pass
#     return ab

#   def s(a, b):
#     ab = swap(v(a), v(b))
#     df.at[id_, f'p-{a}'] = ab[0]
#     df.at[id_, f'p-{b}'] = ab[1]
#     return ab

#   for key in org_atribs:
#     arr = s(f'org-1-{key}', f'org-2-{key}')

#   def p(key):
#     df.at[id_, f'p-{key}'] = v(key)

#   p('sign_value_currency/value')
#   p('sign_value_currency/currency')
#   p('sign_value_currency/sign')

#   p('date')
#   p('number')

#   # get_expected values
#   for key in solo_tags:
#     t = jdoc.get_attribute_value(key)
#     df.at[id_, f'{key}'] = t

#   for key in org_atribs:
#     orgs = swap(jdoc.get_attribute_value(f'org-1-{key}'), jdoc.get_attribute_value(f'org-2-{key}'))
#     df.at[id_, f'org-1-{key}'] = orgs[0]
#     df.at[id_, f'org-2-{key}'] = orgs[1]



In [None]:
# Evaluate on at most the first 500 trainset rows whose `score` is below 1000.
sample_index = umtm.stats[umtm.stats['score'] < 1000].index.values[0:500]
from tf_support.tf_subject_model import decode_subj_prediction


def make_subj_predictions(umodel, indices):
  """Run `umodel` on every document id in `indices` and record the subject predictions.

  Args:
    umodel: model exposing `predict(x=[...], batch_size=...)`.
    indices: document ids accepted by `make_xyw`.

  Returns:
    (ev, tags): `ev` is a copy of `umtm.stats` with `expected_subj`,
    `predicted_subj` and `wrong` filled in for the evaluated ids. `tags` is an
    empty DataFrame: tag extraction is currently disabled, and it is returned
    only so callers can keep unpacking two values.
  """
  ev = umtm.stats.copy()
  tags = pd.DataFrame()
  total = len(indices)

  for i, _id in enumerate(indices):
    logger.debug(f'validating {_id} {i} of {total}')

    x, y, _ = make_xyw(_id, ev)

    prediction = umodel.predict(x=[np.expand_dims(x[0], axis=0), np.expand_dims(x[1], axis=0)], batch_size=1)

    # The subject output is read from prediction[1]; its first element is decoded.
    expected = decode_subj_prediction(y[1])[0]
    predicted = decode_subj_prediction(prediction[1][0])[0]
    is_wrong = bool(expected != predicted)

    ev.at[_id, 'expected_subj'] = expected.name
    ev.at[_id, 'predicted_subj'] = predicted.name
    ev.at[_id, 'wrong'] = is_wrong

    if is_wrong:
      print(f'{i} \t {_id} \t {expected} \t{predicted}')

  return ev, tags


ev, tags = make_subj_predictions(umodel, sample_index)
ev[pd.notna(ev.predicted_subj)]

In [None]:
# _cols = [  'wrong' ]
# _tmp = ev[cols]
# errors_report = _tmp[ _tmp.wrong == True] #.sort_values('subject')
# print(len(errors_report), 'wrong subjects of', len(tags))
# errors_report 

# Keep rows where both the predicted and the expected subject are present.
# The two conditions are combined into one mask: chaining two boolean filters
# applied a mask built on the full frame to an already filtered frame, which
# makes pandas reindex the mask and emit a warning.
_has_both = pd.notna(ev.predicted_subj) & pd.notna(ev.expected_subj)
subj_pred = ev[_has_both]
subj_df = subj_pred[['predicted_subj', 'expected_subj']].copy()
subj_df

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# print(subj_df['predicted_subj'].values)
labels = sorted(np.unique(subj_df['expected_subj'].values))
print (labels)

# cm = confusion_matrix(subj_df['expected_subj'].values, subj_df['predicted_subj'].values, labels=labels)
# cm

In [None]:
def make_report(umodel, subj_df):
  """Plot, print and save the subject classification results for `umodel`.

  Saves the confusion-matrix figure as PNG and the classification report as
  text, both inside `umtm.work_dir`, with `umodel.name` in the file names.
  `subj_df` must have `expected_subj` and `predicted_subj` columns.
  """
  expected = subj_df['expected_subj']
  predicted = subj_df['predicted_subj']
  model_name = umodel.name

  plot_cm(expected.values, predicted.values)
  img_path = os.path.join(umtm.work_dir, f'subjects-confusion-matrix-{model_name}.png')
  plt.savefig(img_path, bbox_inches='tight')

  report = classification_report(expected, predicted, digits=3)
  print(model_name)
  print(report)

  report_path = os.path.join(umtm.work_dir, f'subjects-classification_report-{model_name}.txt')
  with open(report_path, "w") as text_file:
    text_file.write(report)


# subj_df = subj_df[['predicted_subj', 'expected_subj']].copy() #ev[~pd.isna(ev['predicted_subj'])]
make_report(umodel, subj_df)

### Subject classification, weighted avg (precision / recall / f1-score)
- 005: weighted avg      0.837     0.814     0.817       
- 003: weighted avg      0.734     0.718     0.704       

# Evaluate tags detection

In [None]:
# NOTE(review): make_subj_predictions currently returns an empty `tags` frame
# (its tag-extraction calls are commented out), so the tag-validation cells
# below have no columns to read until that is restored.
# Rebinding instead of `inplace=True` keeps the cell idempotent on re-run.
tags = tags.fillna('-')
tags

In [None]:
def save_csv(d, f):
    """Write DataFrame `d` as CSV to the file named `f` inside `umtm.work_dir`."""
    target_path = os.path.join(umtm.work_dir, f)
    d.to_csv(target_path)

### Contract number validation

In [None]:
# Rows where the predicted contract number differs from the expected one.
wrong_numbers = tags[tags['number'] != tags['p-number']].sort_values('number')
# Guard the percentage: an empty `tags` frame would raise ZeroDivisionError.
_share = 100. * len(wrong_numbers) / len(tags) if len(tags) else 0.0
print(f'Contract numbers: {len(wrong_numbers)} of {len(tags)}  ({_share:0.1f}%) were detected incorrectly')

save_csv(wrong_numbers[['p-number', 'number']], 'wrong_numbers.csv')

# wrong_numbers[['p-number', 'number']].tail(10)

In [None]:
tags ['sign_value_currency/currency'].head(100)

In [None]:
def conv(x):
    """Convert `x` to float, returning NaN when it cannot be converted.

    Strings may use a decimal comma and spaces as group separators
    ('1 234,5' -> 1234.5). Non-string values are passed to float() as they are.
    """
    # isinstance also covers str subclasses (e.g. numpy string scalars).
    if isinstance(x, str):
        x = x.replace(',', '.').replace(' ', '')
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures become NaN; a bare `except:` would also
        # swallow KeyboardInterrupt and genuine programming errors.
        return np.nan

# Add a numeric 'n-' twin for the predicted and the expected value columns.
for _value_col in ('p-sign_value_currency/value', 'sign_value_currency/value'):
    tags['n-' + _value_col] = pd.to_numeric(tags[_value_col].apply(conv))

In [None]:
cols = ['n-p-sign_value_currency/value', 'n-sign_value_currency/value']

# NaN != NaN is True, so rows where both values are unparsable count as mismatches too.
_mismatch = tags[cols[0]] != tags[cols[1]]
# .copy(): a column is added below, and writing into a filtered slice of `tags`
# triggers pandas' SettingWithCopyWarning.
wrong_values = tags.loc[_mismatch, cols].copy()

# log(1 + |predicted - expected|)
wrong_values['val_err'] = np.log1p(np.abs(wrong_values[cols[0]] - wrong_values[cols[1]]))
wrong_values = wrong_values.sort_values('val_err', ascending=False)

print(len(wrong_values))
wrong_values.tail(24)

### Contract Org-1 validation

In [None]:


# Rows where either organisation name differs from the expected one.
wrong_orgs1 = tags[(tags['org-1-name'] != tags['p-org-1-name']) | (tags['org-2-name'] != tags['p-org-2-name'])]
# Guard the percentage: an empty `tags` frame would raise ZeroDivisionError.
_share = 100. * len(wrong_orgs1) / len(tags) if len(tags) else 0.0
print(f'Org-1 name: {len(wrong_orgs1)} of {len(tags)}  ({_share:0.1f}%) were detected incorrectly')

cols = ['p-org-1-name', 'org-1-name', 'p-org-2-name', 'org-2-name']
save_csv(wrong_orgs1[cols], 'wrong_orgs1.csv')

wrong_orgs1[cols].head(20)

In [None]:
# Rows where either organisation alias differs from the expected one.
wrong_aliases = tags[(tags['org-1-alias'] != tags['p-org-1-alias']) | (tags['org-2-alias'] != tags['p-org-2-alias'])]
# Guard the percentage: an empty `tags` frame would raise ZeroDivisionError.
_share = 100. * len(wrong_aliases) / len(tags) if len(tags) else 0.0
print(f'Aliases: {len(wrong_aliases)} of {len(tags)}  ({_share: 0.1f}%) were detected incorrectly')

cols = ['p-org-1-alias', 'org-1-alias', 'p-org-2-alias', 'org-2-alias']
save_csv(wrong_aliases[cols], 'wrong_aliases.csv')
# wrong_aliases[cols].head(10)

In [None]:
# Rows where either organisation type differs from the expected one.
wrong_types = tags[(tags['org-1-type'] != tags['p-org-1-type']) | (tags['org-2-type'] != tags['p-org-2-type'])]
# Guard the percentage: an empty `tags` frame would raise ZeroDivisionError.
_share = 100. * len(wrong_types) / len(tags) if len(tags) else 0.0
print(f'Types: {len(wrong_types)} of {len(tags)}  ({_share: 0.1f}%) were detected incorrectly')
cols = ['p-org-1-type', 'p-org-2-type', 'org-1-type', 'org-2-type']
save_csv(wrong_types[cols], 'wrong_types.csv')
wrong_types[cols].head(10)

In [None]:
from collections import Counter

# Count, per document id, in how many of the "wrong_*" frames it appears.
arrays = [wrong_orgs1, wrong_types, wrong_numbers, wrong_aliases]
counter = Counter()
for a in arrays:
  counter.update(a.index)

# most_common() is empty when nothing was detected incorrectly; indexing it
# unconditionally raised IndexError in that case.
_hardest_doc = counter.most_common()[0][0] if counter else None
print('Самый сложный документ: ', _hardest_doc)
# len(counter) is the number of distinct documents that have at least one error.
print("Всего недочетов:", len(counter))

In [None]:
# Store the per-document error count in the trainset stats: every row starts at
# 0 and the documents present in `counter` receive their count.
umtm.stats['errors'] = 0
for c in counter:
  umtm.stats.at[c, 'errors'] = counter[c]


# NOTE(review): `calculate_samples_weights` is neither imported nor defined in
# any visible cell — confirm where it comes from; unless it is provided
# elsewhere this line raises NameError on a fresh kernel.
calculate_samples_weights(umtm)
# Side effect: calls the manager's private `_save_stats()` helper.
umtm._save_stats()
umtm.stats

## Single doc eval

In [None]:
if IN_COLAB:
  !wget https://raw.githubusercontent.com/nemoware/analyser/uber-models/tests/contract_db_1.json

  with open('contract_db_1.json', 'rb') as handle:    
    jdata = json.load(handle, object_hook=json_util.object_hook)

  jdoc = DbJsonDoc(jdata)

else:
  from integration.db import get_mongodb_connection
  from bson.objectid import ObjectId

  def get_doc(objid):
    logger.debug(f'fetching {objid}')
    db = get_mongodb_connection()
    documents_collection = db['documents']
    jdata =  documents_collection.find_one({'_id': ObjectId(objid)})
    return DbJsonDoc(jdata)

  SAMPLE_DOC_ID = counter.most_common()[0][0] #umtm.stats.index[10]
    
    
    
  SAMPLE_DOC_ID = '5eea27adc28b75807f3dae66'
  print('SAMPLE_DOC_ID:', SAMPLE_DOC_ID)
  dp = umtm.make_xyw(SAMPLE_DOC_ID)
  (emb, tok_f), (sm, subj), (sample_weight, subject_weight) = dp

  jdoc = get_doc(SAMPLE_DOC_ID)

In [None]:
from analyser.legal_docs import embedd_tokens

# In Colab the sample document is embedded here, and `emb` / `tok_f` are
# (re)assigned for the prediction cell that follows.
if IN_COLAB:
  # NOTE(review): `ElmoEmbedder` is not imported in any visible cell — confirm
  # its module and import it before use.
  embedder = ElmoEmbedder.get_instance('elmo')  # lazy init
  emb = embedd_tokens(jdoc.get_tokens_for_embedding(),
                             embedder,
                             verbosity=2,
                             log_key='tmp')

  # NOTE(review): `get_tokens_features` is likewise not imported in the visible
  # cells — confirm its origin.
  tok_f = get_tokens_features(jdoc.get_tokens_map_unchaged().tokens)

In [None]:
###############
# Predict for the single document: each input gets a leading axis of size 1.
prediction = umodel.predict(x=[np.expand_dims(emb, axis=0), np.expand_dims(tok_f, axis=0)], batch_size=1)

print(len(prediction), umodel.name)
subj_1hot = prediction[1][0]
print('Subject:', decode_subj_prediction(subj_1hot))

# `seq_labels_contract` is never defined in this notebook (its import exists
# only in a commented-out block). The other prediction cells label
# prediction[0][0] with `semantic_map_keys_contract`, so the same labels are used here.
tagging = pd.DataFrame(prediction[0][0], columns=semantic_map_keys_contract)
plot_embedding(tagging, title=f'Predictions of {umodel.name}')

In [None]:
def render_slices(slices, tokens, attention_v, ht='') -> str:
  """Render each slice of `tokens` as a coloured item of an HTML ordered list.

  Args:
    slices: iterable of index objects applied to both `tokens` and `attention_v`.
    tokens: token sequence of the document.
    attention_v: per-token weights, indexed the same way as `tokens`.
    ht: HTML placed before the list.

  Returns:
    `ht` followed by an `<ol>` with one `<li>` per slice.
  """
  items = []
  for _s in slices:
    # `to_color_text` is a method of the notebook's renderer instance; the bare
    # name used before is not defined anywhere in the notebook.
    colored = renderer_.to_color_text(tokens[_s], attention_v[_s], _range=(0, 1.2))
    items.append('<li>' + colored + '<br><hr>' + '</li>')
  return ht + '<ol>' + ''.join(items) + '</ol>'

# `find_top_spans` is imported only inside a commented-out block elsewhere in
# the notebook, so it is imported here to let the cell run on a fresh kernel.
from analyser.text_tools import find_top_spans

# Iterate over the columns `tagging` actually has instead of the undefined
# name `seq_labels_contract`.
for t in tagging.columns:
  spans = list(find_top_spans(tagging[t].values, threshold=0.3))
  display(HTML(render_slices(spans, jdoc.get_tokens_map_unchaged().tokens, tagging[t].values)))


In [None]:

# mean_ = tagging.values.max(-1)*0.5
# print (mean_.shape)
# display(HTML( to_color_text (jdoc.get_tokens_map_unchaged().tokens[:24000],  mean_[:24000])))

In [None]:
# One document id per line. A single-quoted literal cannot span several lines
# (SyntaxError), so a triple-quoted string is used; the content is unchanged.
ids = '''5edbadd7da3678279fbcaabf
5edbc660da3678279fbcaeac
5edbc668da3678279fbcaf6e
5edbc65dda3678279fbcae56
5edbc66bda3678279fbcafe6
5edbc615da3678279fbcadc9'''