In [None]:
DEBUG = False

In [None]:
import logging
import sys
import os


# Notebook-wide logger. The logger itself accepts DEBUG records, but the
# console handler below only emits INFO and above.
logger = logging.getLogger('eva;_ipynb')
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(levelname)s - %(asctime)s - [%(filename)s:%(lineno)d] - %(name)s - %(message)s')

# logging.getLogger() returns the same object for the same name, so re-running
# this cell used to stack one more StreamHandler each time and every message
# was printed repeatedly. Reuse the existing handler when there is one.
ch = next((h for h in logger.handlers if isinstance(h, logging.StreamHandler)), None)
if ch is None:
    ch = logging.StreamHandler()
    logger.addHandler(ch)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.debug('--=logging started=--')

In [None]:
IN_COLAB = 'google.colab' in sys.modules
print ('Running in colab:', IN_COLAB)

In [None]:
if not IN_COLAB:
  nb_dir = os.path.split(os.getcwd())[0]
  if nb_dir not in sys.path:
      sys.path.append(nb_dir)
 

In [None]:
import analyser.hyperparams
import mlflow

print(mlflow.active_run())

In [None]:
# Handle of the MLflow run started by this notebook; stays None when a run
# was already active before this cell executed.
sub_mlflow_run = None
if mlflow.active_run() is None:
    # mlflow.start_run(run_name='fetch trainset from db')
    # Tracking server address: taken from the MLFLOW_URL environment variable,
    # falling back to the hard-coded default below.
    ml_flow_url = os.environ.get('MLFLOW_URL', "http://192.168.10.38:5000")
    mlflow.set_tracking_uri(ml_flow_url)
    print(f'{ml_flow_url=}', 'set MLFLOW_URL env var to re-define')

    mlflow.set_experiment("Обучение анализатора")
    # NOTE(review): nested=True is passed although this branch runs only when
    # no run is active — confirm this is intended.
    sub_mlflow_run = mlflow.start_run(nested=True)

# Imports

In [None]:
%matplotlib inline

 
import numpy as np
import pandas as pd

from pandas import DataFrame

from analyser.finalizer import get_doc_by_id
from analyser.documents import TextMap
from analyser.ml_tools import SemanticTag

from analyser.contract_parser import nn_get_tag_values
from analyser.contract_parser import nn_find_org_names, nn_get_subject, nn_get_contract_number, nn_get_contract_date
from analyser.parsing import AuditContext


from tf_support.tf_subject_model import decode_subj_prediction
from integration.db import get_doc_by_id
from bson import ObjectId
 
from trainsets.retrain_contract_uber_model import UberModelTrainsetManager
from tf_support.super_contract_model import semantic_map_keys_contract



In [None]:
from colab_support.renderer import HtmlRenderer
import matplotlib as matplotlib

from IPython.core.display import display, HTML

class DemoRenderer(HtmlRenderer):
  """HtmlRenderer that shows weighted-token HTML inline in the notebook."""

  def render_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Build the HTML via `to_color_text` and display it with IPython."""
    html = self.to_color_text(tokens, weights, colormap, print_debug, _range, separator=separator)
    display(HTML(html))

  def to_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Delegate to `HtmlRenderer._to_color_text`, passing the matplotlib module.

    `print_debug` is accepted for signature compatibility but is not forwarded.
    """
    return super()._to_color_text(tokens, weights, matplotlib, colormap=colormap, _range=_range, separator=separator)

# Shared renderer instance used by the DEBUG cells of this notebook.
renderer_ = DemoRenderer()


# renderer_.render_color_text(["слово 1", "слово 2"], np.array( [1, 0]), _range=(0,1))

# Prepare paths


In [None]:
from pathlib import Path
work_dir = Path(analyser.hyperparams.work_dir)
# del work_dir
# # print(work_dir)

training_reports_path = Path(analyser.hyperparams.__file__).parent.parent / 'training_reports/'
print(f'{training_reports_path=}')
print(f'{analyser.hyperparams.work_dir=}')
print(f'{work_dir=}')

# Loading data set meta

In [None]:
# Trainset manager bound to the work dir and the training-reports dir.
umtm = UberModelTrainsetManager (work_dir, training_reports_path)
umtm.load_contract_trainset_meta()
# `stats` is the same object as umtm.stats (no copy), so the column
# assignments below are visible through umtm.stats as well.
stats = umtm.stats
stats['sample_weight']=-1.0 #TODO: WHY?
stats['subject_weight']=-1.0
stats

## Validate trainset (take a sample)

In [None]:
from tf_support.super_contract_model import  validate_datapoint

# stats['valid'] = True
# Reset the per-document error text before validating.
stats['error'] = ''

# Validate every datapoint; failures are logged and recorded on the row
# instead of aborting the loop.
for i in stats.index:
  
  try:
    validate_datapoint(str(i), stats)

  except Exception as e:
    logger.error(e)

    # NOTE(review): the 'valid' column is not initialised in this cell; it
    # presumably comes from load_contract_trainset_meta() — confirm.
    stats.at[i, 'valid'] = False
    stats.at[i, 'error'] = str(e)
    
stats

In [None]:
stats_valid = stats[stats['valid']]
# stats_valid = stats_valid[stats_valid.source=='file']
len(stats_valid)

In [None]:
import json
import re
from bson import json_util

fn = work_dir / 'documents.json'
# Open with an explicit encoding: the default depends on the platform locale,
# while JSON exports are UTF-8 and contain non-ASCII text.
with open(fn, encoding='utf-8') as file:
    # json_util.object_hook restores BSON types from their extended-JSON form.
    file_data = json.load(file, object_hook=json_util.object_hook)
print(f'total docs in {fn} is {len(file_data)}')
    

## Validating the data set

In [None]:

def validate_datapoint(id: str, meta: DataFrame):
  """Check that the datapoint `id` can be built and has the expected semantic map width.

  Note: this definition rebinds the `validate_datapoint` name imported earlier
  from `tf_support.super_contract_model`.

  :param id: document id, used as the index label in `meta`
  :param meta: trainset metadata frame; its "source" column is read for the error message
  :raises ValueError: when the second dimension of the semantic map differs from
    `len(semantic_map_keys_contract)`
  :raises Exception: whatever `make_xyw` raises is propagated unchanged
  """
  # The former `try: ... except Exception as e: raise e` wrapper did nothing
  # but re-raise, so it has been removed.
  (emb, tok_f), (sm, subj), (sample_weight, subject_weight) = make_xyw(id, meta)
  if sm.shape[1] != len(semantic_map_keys_contract):
    mxs = f'semantic map shape is {sm.shape[1]}, expected is {len(semantic_map_keys_contract)} source={meta.at[id, "source"]}'
    raise ValueError(mxs)
    
# validate_datapoint('5deba9034ddc27bcf92dd383', stats)

In [None]:
if DEBUG:
    # Imported locally: the notebook-level imports of these two names live in
    # a later cell, so on a fresh kernel they are not yet bound here.
    from analyser.persistence import DbJsonDoc
    from tf_support.super_contract_model import make_xyw

    def test_date_tags_detector(doc_id):
        """Compare the user-entered contract date of `doc_id` with the date
        extracted from the expected semantic map; print details on mismatch.

        Best-effort check: any failure (e.g. a document without a date) is
        logged at DEBUG level and otherwise ignored.
        """
        try:
            jd = DbJsonDoc(get_doc_by_id(ObjectId(doc_id)))

            (emb, tok_f), (sm, subj), (sample_weight, subject_weight) = make_xyw(doc_id, stats)
            sm = pd.DataFrame(sm, columns=semantic_map_keys_contract)

            user_date_val = jd.user['attributes_tree']['contract']['date']
            date_tag_____ = nn_get_contract_date(jd.get_tokens_map_unchaged(), sm)

            if date_tag_____.value != user_date_val['value']:
                print(f"{date_tag_____.span=}\t\t{date_tag_____.value=} ")

                print(f"{user_date_val['span']=}\t\t{user_date_val['value']=}")
                print(f"{date_tag_____.value == user_date_val['value']}")
                print(doc_id, '_'*50)
        except Exception as e:
            # Was a bare `except: pass`, which also swallowed KeyboardInterrupt
            # and hid every failure; keep the best-effort behaviour but leave a trace.
            logger.debug(f'{doc_id}: date check skipped: {type(e).__name__}: {e}')

    # Filter the 'db' documents once instead of on every iteration.
    for doc_id in stats[stats.source == 'db'].index:
        test_date_tags_detector(doc_id)

In [None]:
from analyser.persistence import DbJsonDoc
from tf_support.super_contract_model import make_xyw

import traceback

# Wrap every raw document that passes validation into DbJsonDoc, keyed by its
# '_id'; count and log the ones that fail.
docs = {}
errors = 0
for fd in file_data:
    try:
        validate_datapoint(str(fd['_id']), stats)
        docs[fd['_id']] = DbJsonDoc(fd)
    except Exception as e:
        errors += 1
        if isinstance(e, KeyError):
            # a key lookup failed somewhere in the try body
            logger.error(f'No key in stats: {str(e)}')
        else:
            logger.error(f'{type(e)}, {str(e)}')
            traceback.print_exc()

print('errors count:', errors)
print(f'total docs in {fn} is {len(docs)}')

## Get sample doc (DEBUG)

In [None]:
if DEBUG:
    a_doc_from_json = list(docs.values())[1]
    print(a_doc_from_json.get_tokens_map_unchaged().text[:230])
    
    a_doc_from_json.get_attributes_tree()

In [None]:
%matplotlib inline


from colab_support.renderer import plot_embedding, plot_cm
from tf_support.super_contract_model import make_xyw

if DEBUG:
    SAMPLE_DOC_ID = str(a_doc_from_json.get_id())# stats_valid.index[0]


    # SAMPLE_DOC_ID = '5fdb2145542ce403c92b460c'
    # del a_doc_from_json

    print(f'{SAMPLE_DOC_ID=}')

    (emb, tok_f), (sm, subj), (sample_weight, subject_weight) = make_xyw(SAMPLE_DOC_ID, stats)


    print(f'semantic map shape {sm.shape=}')
    _crop = 700
    plot_embedding(tok_f[:_crop], title=f'Tokens features {SAMPLE_DOC_ID}') 
    plot_embedding(emb[:_crop], title=f'Embedding {SAMPLE_DOC_ID}') 
    plot_embedding(sm[:_crop], title=f'Semantic map {SAMPLE_DOC_ID}')

# Init Model 🦖

In [None]:
# Keep a TEST_FLOW value injected by an external runner; default to False.
TEST_FLOW = globals().get('TEST_FLOW', False)
print(f'{TEST_FLOW=}')

In [None]:

# An external process may pre-define `model_factory_fn` in the notebook
# globals; otherwise fall back to `make_att_model`.
if 'model_factory_fn' in globals():
    print ('*'*80)
    print('model_factory_fn defined by external process as', model_factory_fn.__name__)
else:
    from tf_support.super_contract_model import make_att_model
    model_factory_fn = make_att_model

print(f'{model_factory_fn=}')

see https://mlflow.org/docs/latest/models.html#keras-keras

In [None]:
import mlflow
from tf_support.tools import KerasTrainingContext

 
# Training context rooted at the trainset manager's work dir.
ctx = KerasTrainingContext(umtm.work_dir, session_index=21)
ctx.EVALUATE_ONLY = True
 
    
# Reuse a model that is already present in the notebook globals; otherwise
# build one from the factory and the weights file named after it.
if 'umodel' in globals() and umodel is not None:
    print ('*'*80)
    print('umodel defined as', umodel.name)
else:
    # weights = Path(analyser.hyperparams.models_path) / f"{model_factory_fn.__name__}.h5"
    weights = training_reports_path / f'{model_factory_fn.__name__}.h5'
    mlflow.log_param('weights', str(weights))
    # NOTE(review): a missing weights file is not reported here; init_model is
    # called with the same path either way — confirm how it handles that.
    if weights.is_file():
        print (f'LOADING: {model_factory_fn} -- {weights}')
    
    
    umodel = ctx.init_model(model_factory_fn, trained=True, trainable=True, weights=weights)
    mlflow.log_param('model_name', str(umodel.name))
    mlflow.log_param('model_params', umodel.count_params())
    
    
# The notebook only evaluates, so the model is frozen regardless of how it was obtained.
umodel.trainable = False
umodel.summary()

# Evaluate models

## Evaluate single doc (self-test)

In [None]:
# sample_index = umtm.stats [umtm.stats['value']>0].index[2]
if DEBUG:
    print(f'{SAMPLE_DOC_ID=}')
    prediction = umodel.predict(x=[np.expand_dims(emb, axis=0), np.expand_dims(tok_f, axis=0)], batch_size=1)


    tagsmap = pd.DataFrame(prediction[0][0], columns=semantic_map_keys_contract)
    tagsmap_e = pd.DataFrame(sm, columns=semantic_map_keys_contract)
    delta = tagsmap - tagsmap_e 
    # .T
    plot_embedding(tagsmap[:_crop], f'Predicted Semantic Map {tagsmap.shape}')
    plot_embedding(delta[:_crop], title=f'DELTA Semantic map {tagsmap_e.shape}')
    plot_embedding(tagsmap_e[:_crop], title=f'EXPECTED Semantic map {tagsmap_e.shape}')

In [None]:
if DEBUG:
    print("mean delta", delta.abs().sum().sum() / tagsmap_e.sum().sum())
    print("sum of delatas", delta.abs().sum().sum())

In [None]:
if DEBUG:
    av = tagsmap.max(axis=1) #tagsmap['amount-begin'] + tagsmap['vat-begin'] + tagsmap['number-begin'] + tagsmap['org-name-begin']

    # av = tagsmap.sum(axis=1)
    renderer_.render_color_text(a_doc_from_json.get_tokens_map_unchaged().tokens[:600], av[:600])

### Getting tag values from inferred semantic map

In [None]:
ac = AuditContext()

#### Orgs

In [None]:
if DEBUG:
    # Print name/type/alias of at most the first two detected organisations,
    # separated by a blank line.
    cas = nn_find_org_names(a_doc_from_json.get_tokens_map_unchaged(), tagsmap, ac)
    if cas:
        for org_idx in range(min(len(cas), 2)):
            if org_idx > 0:
                print()
            print(cas[org_idx].name)
            print(cas[org_idx].type)
            print(cas[org_idx].alias)

#### Date/number

In [None]:
if DEBUG:
    date_tag = nn_get_contract_date     (a_doc_from_json.get_tokens_map_unchaged(), tagsmap) 
    number_tag = nn_get_contract_number (a_doc_from_json.get_tokens_map_unchaged(), tagsmap)
    if date_tag:
        print( f'{date_tag.value=}')
    if number_tag:
        print( f'{number_tag.value=}' )
    
if DEBUG:
    attention = tagsmap['date' + '-begin'].values.copy()

    threshold = max(attention.max() * 0.8, 0.1)
    print(f'{attention.max()=}')

#### Amount

In [None]:
if DEBUG:
    textmap = a_doc_from_json.get_tokens_map_unchaged()

In [None]:
from pandas import DataFrame
from analyser.schemas import ContractPrice, merge_spans
from analyser.legal_docs import find_value_sign
from analyser.transaction_values import ValueSpansFinder
from analyser.text_tools import to_float
from analyser.contract_parser import nn_find_contract_value
if DEBUG:
    #---
    cps = nn_find_contract_value(textmap, tagsmap)
    if cps:
      print(str(cps[0].get_span()))
      for k in cps[0].list_children():
        print(str(k))

      print()
      print()

      print('brutto', str(cps[0].amount_brutto))
      print('netto', str(cps[0].amount_netto))
      print('amount', str(cps[0].amount))
      print('vat', str(cps[0].vat))
    else:
      print('nothing found')

#### Misc.

In [None]:
if DEBUG:
    thresholds = dict(tagsmap.max()*.8)
    thresholds

#### Subj

In [None]:
if DEBUG:
    subject_tag = nn_get_tag_values('subject',  a_doc_from_json.get_tokens_map_unchaged(), tagsmap, max_tokens=200, threshold=0.02, limit=1, return_single=True)

    print(subject_tag)

### Draw tags TODO:

In [None]:
import matplotlib.pyplot as plt

if DEBUG:
    tags_hl = np.zeros(len(textmap))


# def hl(tag):
#   try:
#     tags_hl [ tag.span[0]:tag.span[1]] +=1
#   except:
#     pass
  

# if cps:
#   hl(cps[0].amount_brutto)
#   hl(cps[0].amount_netto)
#   hl(cps[0].amount)
#   hl(cps[0].vat)
#   hl(cps[0].sign)
#   hl(cps[0].currency)
#   hl(cps[0])


# if cas and len(cas)>0:
#     hl(cas[0].name)
#     hl(cas[0].type)
#     hl(cas[0].alias)

    
# if cas and len(cas)>1:
#     hl(cas[1].name)
#     hl(cas[1].type)
#     hl(cas[1].alias)

# hl(number_tag)
# hl(date_tag)


# hl(subject_tag)

# renderer_.render_color_text(a_doc_from_json.get_tokens_map_unchaged().tokens[:160], tags_hl[:160])

# Reporting

In [None]:
ev =   umtm.stats.copy()
tags =          pd.DataFrame()
errors_report = pd.DataFrame()


In [None]:
# Documents used for the accuracy report: already seen, coming from the DB,
# with score below 50000 and marked valid.
all_docs = umtm.stats
userdocs = all_docs[
    (all_docs.unseen == False)
    & (all_docs.source == 'db')
    & (all_docs.score < 50000)
    & (all_docs['valid'] == True)
].copy()
# .copy(): later cells add and overwrite columns on `userdocs`; without an
# explicit copy those writes target a slice of umtm.stats and pandas raises
# SettingWithCopyWarning.

userdocs

In [None]:
from tf_support.super_contract_model import t_semantic_map_keys_price, semantic_map_keys, t_semantic_map_keys_common, t_semantic_map_keys_org
semantic_keys_numeric = ['amount', 'amount_brutto', 'amount_netto']

In [None]:

# Pre-create the expected/predicted column pair for every semantic key as
# empty strings ...
for v in semantic_map_keys:
    for s in ['_expected', '_predicted']:
        userdocs[f'{v}{s}'] = ''
        userdocs[f'{v}{s}'] = userdocs[f'{v}{s}'].astype(str)

# ... then re-create the numeric ones as float columns filled with NaN.
# np.nan is used instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
for v in semantic_keys_numeric:
    for s in ['_expected', '_predicted']:
        userdocs[f'{v}{s}'] = np.nan
        userdocs[f'{v}{s}'] = userdocs[f'{v}{s}'].astype(float)
        


In [None]:
# Attributes reported for each organisation, and the matching report keys
# for the first and second organisation.
org_keys = ['name', 'alias', 'type']
org_numbered_keys = (
    [f'org-1-{v}' for v in org_keys]
    + [f'org-2-{v}' for v in org_keys]
)
org_numbered_keys

In [None]:
from analyser.text_tools import to_float, span_len

def fix_contract_number(tag: SemanticTag, textmap: TextMap) -> SemanticTag or None:
  """Move the start of a contract-number tag past tokens that carry no number.

  A token counts as empty when nothing is left after stripping whitespace,
  a leading '№', ':', 'N'/space characters and a trailing '.'.

  :param tag: the number tag to adjust in place; may be None
  :param textmap: token map indexed by the positions in `tag.span`
  :return: the same tag with an updated span, or None when `tag` is None or
    the resulting span is empty
  :raises ValueError: when the span reaches outside `textmap`
  """
  if tag is None:
    # Previously a None tag skipped the guard below and then crashed with
    # AttributeError on `tag.span`.
    return None

  if tag:
    span = [tag.span[0], tag.span[1]]
    for i in range(tag.span[0], tag.span[1]):
      if i < 0 or i >= len(textmap):
        msg = f'{i=} {textmap=} {len(textmap)=} {tag=} {tag.span=}'
        logger.error(msg)
        raise ValueError(msg)

      t = textmap[i]
      # lstrip('N ') strips any run of 'N' and space characters, not the literal prefix "N "
      t = t.strip().lstrip('№').lstrip().lstrip(':').lstrip('N ').lstrip().rstrip('.')
      if t == '':
        # NOTE(review): this also fires for an empty token in the middle of
        # the span, not only for leading ones — confirm that is intended.
        span[0] = i + 1
    tag.span = span
  if span_len(tag.span) == 0:
    return None

  return tag

In [None]:
128*12

In [None]:
# Number of documents sent to the model per predict() call.
batch_size = 96
# Length passed to umtm.trim_maxlen for every datapoint.
maxlen = 128*12

# Monkeypatch: replace the library's fix_contract_number with the notebook version.
analyser.contract_parser.fix_contract_number=fix_contract_number
 
def load_doc_as_table_row(_id, df, col_suffix='_expected'):
    """Fetch document `_id` from the DB and write its attributes into row `_id` of `df`.

    Written columns (each with `col_suffix` appended): number, date, subject,
    org-1-*/org-2-* for every key in `org_keys`, every key of
    `t_semantic_map_keys_price`, and analysis.version.

    :return: the DbJsonDoc wrapper, or None when the document is not in the DB
    """
    raw_doc = get_doc_by_id(ObjectId(_id))
    if raw_doc is None:
        print (f'{_id} not found in db')
        return None

    jd = DbJsonDoc(raw_doc)
    attributes = jd.get_attributes_tree()

    for attr in ('number', 'date', 'subject'):
        df.at[_id, f'{attr}{col_suffix}'] = jd.get_attribute_value(attr)

    # Only the first two organisations are reported; missing ones act as empty dicts.
    orgs = attributes.get('orgs', [])
    first_org = orgs[0] if len(orgs) > 0 else {}
    second_org = orgs[1] if len(orgs) > 1 else {}

    for part in org_keys:
        # Sort the two lower-cased values so the column assignment does not
        # depend on the order of the organisations in the document.
        smaller, larger = sorted(
            org.get(part, {}).get('value', "").lower() for org in (first_org, second_org)
        )
        df.at[_id, f'org-1-{part}{col_suffix}'] = larger
        df.at[_id, f'org-2-{part}{col_suffix}'] = smaller

    price = attributes.get('price', {})
    for v in t_semantic_map_keys_price:
        df.at[_id, f'{v}{col_suffix}'] = price.get(v, {}).get('value')

    df.at[_id, f'analysis.version{col_suffix}'] = jd.analysis['version']
    return jd
    
    
    
    
def interpret_prediction(_id, tagsmap, df):
    """Extract attribute values from a predicted semantic map into row `_id` of `df`.

    First loads the expected values via `load_doc_as_table_row` (which also
    returns the document), then writes organisations, price parts, contract
    number and date into the `*_predicted` columns. Does nothing when the
    document is not found.
    """
    col_suffix = "_predicted"

    doc = load_doc_as_table_row(_id, df)
    if doc is None:
        return

    tokens = doc.get_tokens_map_unchaged()

    # --- organisations: first two only, missing ones act as empty placeholders
    found_orgs = nn_find_org_names(tokens, tagsmap, ac)
    org_a = found_orgs[0] if len(found_orgs) > 0 else {}
    org_b = found_orgs[1] if len(found_orgs) > 1 else {}

    for part in ['name', 'alias', 'type']:
        # same sorted assignment as for the expected values
        smaller, larger = sorted(
            getattr(getattr(org, part, {}), 'value', '').lower() for org in (org_a, org_b)
        )
        df.at[_id, f'org-1-{part}{col_suffix}'] = larger
        df.at[_id, f'org-2-{part}{col_suffix}'] = smaller

    # --- price: only the first detected contract value is used
    contract_values = nn_find_contract_value(tokens, tagsmap)
    if contract_values:
        first_value = contract_values[0]
        for v in t_semantic_map_keys_price:
            part = getattr(first_value, v)
            if part:
                df.at[_id, f'{v}{col_suffix}'] = part.value

    # --- number and date: empty string when nothing was detected
    number_tag = nn_get_contract_number(tokens, tagsmap)
    date_tag = nn_get_contract_date(tokens, tagsmap)

    df.at[_id, f'number{col_suffix}'] = str(number_tag.value) if number_tag else ''
    df.at[_id, f'date{col_suffix}'] = str(date_tag.value) if date_tag else ''
 

# Documents to evaluate: everything, or only the first 20 in a test run.
validation_set = userdocs
if TEST_FLOW:
    validation_set = userdocs[0:20]


for i in range(0, len(validation_set), batch_size):
    # Slice the validation set itself. The batch used to be cut from the full
    # `userdocs`, so with TEST_FLOW up to `batch_size` documents were processed
    # instead of the intended 20.
    batch = validation_set[i:i+batch_size]
    actual_batch_size = len(batch)
    print(f'{actual_batch_size=}')

    # --- build the model inputs and record the expected subject
    batch_input_emb = []
    batch_input_token_f = []
    for _id in batch.index.values:
        dp = make_xyw(_id, userdocs)
        dp = umtm.trim_maxlen(dp, 0, maxlen)
        (emb, tok_f), (sm, subj), (sample_weight, subject_weight) = dp

        batch_input_emb.append(emb)
        batch_input_token_f.append(tok_f)

        expected_subj = decode_subj_prediction(subj)[0]
        userdocs.at[_id, 'subject_expected'] = expected_subj.name

    del _id
    del dp

    dps = [np.array(batch_input_emb), np.array(batch_input_token_f)]

    predictions = umodel.predict(x=dps, batch_size=actual_batch_size)
    # release the input lists before interpreting the predictions
    del batch_input_emb
    del batch_input_token_f

    # --- interpret the two model outputs: tag map and subject
    p_tags = predictions[0]
    p_subj = predictions[1]
    for k, _id in enumerate(batch.index.values):
        tagsmap = pd.DataFrame(p_tags[k], columns=semantic_map_keys_contract)
        interpret_prediction(_id, tagsmap, userdocs)

        subj_1hot = p_subj[k]
        predicted = decode_subj_prediction(subj_1hot)[0]
        userdocs.at[_id, 'subject_predicted'] = predicted.name

# userdocs[['org-1-name_expected','subject_expected','subject_predicted','date_expected','date_predicted', 'number_expected','number_predicted']].head(30)
userdocs[
  ['org-1-type_expected', 'org-1-type_predicted', 'org-1-name_expected', 'org-1-name_predicted', 'org-1-alias_expected', 'org-1-alias_predicted', 
   'org-2-type_expected', 'org-2-type_predicted', 'org-2-name_expected', 'org-2-name_predicted', 'org-2-alias_expected', 'org-2-alias_predicted', 
   'subject_expected', 'subject_predicted', 'date_expected',
   'date_predicted', 'number_expected', 'number_predicted']].head(30)




### Clean predicted/expected values

In [None]:


# userdocs['amount_expected']=userdocs['amount_expected'].replace('',np.NaN).astype(float)
# userdocs['amount_predicted']=userdocs['amount_predicted'].replace('',np.NaN).astype(float)

# userdocs['amount_brutto_predicted'].replace('',np.NaN).astype(float)
# userdocs['amount_brutto_expected'].replace('',np.NaN).astype(float)

# Normalise the numeric expected/predicted columns to int, with -1 standing
# for "no value" (empty string or NaN).
# np.nan replaces np.NaN: the upper-case alias was removed in NumPy 2.0.
columns = []
for v in semantic_keys_numeric:
    for s in ['_expected', '_predicted']:
        col = f'{v}{s}'
        userdocs[col] = userdocs[col].replace('', np.nan).replace(np.nan, -1).astype(int)
        columns.append(col)

userdocs['date_predicted'] = userdocs['date_predicted'].astype(str)
userdocs['date_expected'] = userdocs['date_expected'].astype(str)
userdocs[columns]

In [None]:
# Textual stand-ins for "no value" become empty strings everywhere.
for placeholder in ('None', 'none', 'nan'):
    userdocs = userdocs.replace(placeholder, '')

# Remaining NaNs in the string-valued attribute columns become empty strings
# too, so that expected/predicted comparison treats "missing" consistently.
# One loop replaces twelve copy-pasted assignments; np.nan replaces np.NaN,
# which was removed in NumPy 2.0.
for v in ('date', 'number', 'vat_unit', 'vat', 'sign', 'currency'):
    for s in ('_expected', '_predicted'):
        userdocs[f'{v}{s}'] = userdocs[f'{v}{s}'].replace(np.nan, '')

userdocs[['date_expected', 'date_predicted', 'number_expected', 'number_predicted']]

In [None]:
def select_wrong(userdocs, key):
    """Return the rows where `{key}_expected` differs from `{key}_predicted`.

    :param userdocs: frame holding both columns
    :param key: attribute name without the `_expected` / `_predicted` suffix
    :return: (frame with just the two columns for the mismatching rows,
      fraction of mismatching rows among all rows)
    """
    expected_col = f'{key}_expected'
    predicted_col = f'{key}_predicted'

    mismatch = userdocs[expected_col] != userdocs[predicted_col]
    wrong_rows = userdocs.loc[mismatch, [expected_col, predicted_col]]
    error_rate = float(len(wrong_rows)) / len(userdocs)
    return wrong_rows, error_rate

# Tags report
# accuracy report

In [None]:
# import mlflow

# from mlflow.models import Model
# model1 = mlflow.tensorflow.load_model("file:///root/artem/analyser/mlruns/0/c9389e6d6a87415c9488079fd46c09d2/artifacts/model")
# model1.summary()

In [None]:


 
# Per-attribute accuracy over `userdocs`, plus a per-document error counter.
userdocs['number_of_errors'] = 0
report = pd.DataFrame()

accuracies = []
for k in t_semantic_map_keys_common[1:] + t_semantic_map_keys_price + org_numbered_keys:
    s, p = select_wrong(userdocs, k)
    # every mismatching attribute adds one error to its document
    for _id in s.index:
        userdocs.at[_id, 'number_of_errors'] += 1

    acc = 1.0 - p
    report.at[k, 'accuracy'] = f"{acc:.1%}"
    mlflow.log_metric(f"accuracy_{k}", acc)
    accuracies.append(acc)

# unweighted mean over all reported attributes
n = len(accuracies)
avg = sum(accuracies) / n

report.at['ALL', 'accuracy'] = f"{avg:.1%}"
mlflow.log_metric("accuracy_TAGS", avg)

accuracy_csv = umtm.reports_dir / 'attributes_accuracy.csv'
report.to_csv(accuracy_csv)
mlflow.log_artifact(accuracy_csv)

# 81.6%
report

# Worst docs list

In [None]:
userdocs[['number_of_errors']].sort_values('number_of_errors', ascending=False)[:30]

In [None]:
s,p = select_wrong(userdocs, "date")
s

In [None]:

s,p = select_wrong(userdocs, "currency")
s

# Writing subject report

In [None]:
# Documents whose predicted subject differs from the expected one.
# .copy(): a column is added below, which must not write into a slice of `userdocs`.
_t = userdocs[userdocs.subject_predicted != userdocs.subject_expected].copy()

# Build the links in one assignment. The former row-by-row loop never created
# the 'link' column when there were no mismatches, so the selection below
# failed with KeyError.
_t['link'] = [f'https://gpn-audit.nemosoft.ru/#/pre-audit/edit/{i}' for i in _t.index]

_report = _t[['link','analyze_date']].copy()
_report['Предмет, выявленный Шайтан-Арбой']=_t.subject_predicted
_report['Предмет, выявленный роевым био-интеллектом']=_t.subject_expected
_report.to_csv(umtm.reports_dir /'subjects_to_check.csv')


In [None]:
_report

In [None]:
ev = userdocs.copy()

In [None]:
# _cols = [  'wrong' ]
# _tmp = ev[cols]
# errors_report = _tmp[ _tmp.wrong == True] #.sort_values('subject')
# print(len(errors_report), 'wrong subjects of', len(tags))
# errors_report 

# Keep only rows where both subjects are present. The two masks are combined
# before indexing: chaining `ev[mask1][mask2]` applied a mask built from the
# full frame to an already filtered one, which makes pandas reindex the mask
# and emit a warning.
has_both_subjects = pd.notna(ev.subject_predicted) & pd.notna(ev.subject_expected)
subj_pred = ev[has_both_subjects]
subj_df = subj_pred[['subject_predicted', 'subject_expected']].copy()
subj_df

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# print(subj_df['predicted_subj'].values)
labels = sorted(np.unique(subj_df['subject_expected'].values))
print (labels)

# cm = confusion_matrix(subj_df['expected_subj'].values, subj_df['predicted_subj'].values, labels=labels)
# cm

In [None]:
# Subject classification quality; the weighted averages go to MLflow both
# under their report keys and under explicit subject_* names.
report = classification_report(subj_df.subject_expected, subj_df.subject_predicted, digits=3, output_dict=True)
weighted_avg = report['weighted avg']
mlflow.log_metrics(weighted_avg)

for metric_name, report_key in (
    ('subject_F1', 'f1-score'),
    ('subject_F1_support', 'support'),
    ('subject_precision', 'precision'),
    ('subject_recall', 'recall'),
):
    mlflow.log_metric(metric_name, weighted_avg[report_key])

In [None]:
def make_report(umodel, subj_df):
  """Save and log the subject confusion matrix and classification report.

  Both files are written to `umtm.reports_dir`, named after `umodel.name`,
  and logged as MLflow artifacts; the text report is also printed.

  :param umodel: model whose `name` is used in file names and output
  :param subj_df: frame with `subject_expected` and `subject_predicted` columns
  """
  expected = subj_df.subject_expected
  predicted = subj_df.subject_predicted

  # confusion matrix image
  plot_cm(expected.values, predicted.values, figsize=(12, 12))
  cm_image = umtm.reports_dir / f'subjects-confusion-matrix-{umodel.name}.png'
  plt.savefig(cm_image, bbox_inches='tight')
  mlflow.log_artifact(cm_image)

  # plain-text classification report
  text_report = classification_report(expected, predicted, digits=3)
  print(umodel.name)
  print(text_report)

  report_file = umtm.reports_dir / f'subjects-classification_report-{umodel.name}.txt'
  with open(report_file, "w") as out:
    out.write(text_report)

  mlflow.log_artifact(report_file)


# subj_df = subj_df[['predicted_subj', 'expected_subj']].copy() #ev[~pd.isna(ev['predicted_subj'])]
make_report(umodel, subj_df)

In [None]:
# Close the MLflow run only if this notebook started it.
if sub_mlflow_run is not None:
    # `active_mlflow_run` was never defined (NameError); the handle created by
    # this notebook is `sub_mlflow_run`.
    print(sub_mlflow_run.info)
    mlflow.end_run()