In [None]:
import logging
import sys
import os

In [None]:
# Gates the optional "[Debug]" cells below (they additionally require work_dir/documents.json to exist).
DEBUG = False
# When True, a later cell sets CUDA_VISIBLE_DEVICES to '-1'.
DISABLE_GPU = False


In [None]:
# Both settings may already exist in the global namespace before this cell runs;
# the defaults are applied only to names that are still missing.
globals().setdefault('USE_CONTROL_SET', True)
globals().setdefault('COLLECTION_NAME', 'documents_temp')

print (f'{COLLECTION_NAME=}')
print (f'{USE_CONTROL_SET=}')

In [None]:

# Optionally mask all CUDA devices via the environment, then report the flag.
if DISABLE_GPU:
    os.environ.update({'CUDA_VISIBLE_DEVICES': '-1'})
print(f'{DISABLE_GPU=}')

In [None]:

logger = logging.getLogger('eval_ipynb')
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same logger object on every call, so the
# console handler is attached only once; re-running this cell would otherwise
# stack handlers and print every message several times.
if not logger.handlers:
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter('%(levelname)s - %(asctime)s - [%(filename)s:%(lineno)d] - %(name)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
logger.info('--=logging started=--')

# Make the parent of the current working directory importable so that the
# project packages imported below can be resolved.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import analyser.hyperparams

In [None]:
import mlflow
print('mlflow.active_run', mlflow.active_run())

In [None]:
# Start our own MLflow run only when no run is active yet. sub_mlflow_run stays
# None otherwise, which tells the last cell not to end a run it did not start.
sub_mlflow_run = None
if mlflow.active_run() is None:

    ml_flow_url = os.environ.get('MLFLOW_URL', "http://192.168.10.38:5000")
    mlflow.set_tracking_uri(ml_flow_url)
    # Logger.warn is a deprecated alias (removed in Python 3.13); use warning().
    logger.warning(f'set MLFLOW_URL env var to re-define; MLFLOW_URL={ml_flow_url}')

    mlflow.set_experiment("Обучение анализатора")
    sub_mlflow_run = mlflow.start_run(nested=True)

    print('sub_mlflow_run', sub_mlflow_run)

# Tag the active run with the analyser version and the test configuration.
mlflow.set_tag("release.version", analyser.__version__)
mlflow.set_tag("test_use_control_set", USE_CONTROL_SET)
mlflow.set_tag("test_db_collection", COLLECTION_NAME)


### Imports

In [None]:
%matplotlib inline

from pathlib import Path
 
import numpy as np
import pandas as pd
from pandas import DataFrame

import json
import re

from bson import json_util
from bson import ObjectId

import traceback


from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

from analyser.schemas import ContractPrice, merge_spans
from analyser.finalizer import get_doc_by_id
from analyser.documents import TextMap
from analyser.ml_tools import SemanticTag
from analyser.parsing import AuditContext
from analyser.persistence import DbJsonDoc
from analyser.legal_docs import find_value_sign
from analyser.transaction_values import ValueSpansFinder
from analyser.text_tools import to_float, span_len

from analyser.contract_parser import nn_get_tag_values, nn_find_contract_value
from analyser.contract_parser import nn_find_org_names, nn_get_subject, nn_get_contract_number, nn_get_contract_date
from analyser.contract_parser import fix_contract_number

from tf_support.tools import KerasTrainingContext
from tf_support.tf_subject_model import decode_subj_prediction
from tf_support.super_contract_model import make_xyw, semantic_map_keys_contract
from tf_support.super_contract_model import validate_datapoint

from integration.db import get_doc_by_id

from trainsets.retrain_contract_uber_model import UberModelTrainsetManager

from colab_support.renderer import plot_embedding, plot_cm
from colab_support.renderer import HtmlRenderer

from IPython.display import display, HTML, Markdown

from tf_support.super_contract_model import t_semantic_map_keys_price, semantic_map_keys, t_semantic_map_keys_common, t_semantic_map_keys_org


semantic_keys_numeric = ['amount', 'amount_brutto', 'amount_netto']
org_keys = ['name', 'alias', 'type']
# Column stems for the two organisations: all org-1-* keys first, then org-2-*.
org_numbered_keys = [f'org-{number}-{key}' for number in (1, 2) for key in org_keys]
org_numbered_keys

In [None]:
class DemoRenderer(HtmlRenderer):
  """HtmlRenderer subclass that shows colored token text in the notebook output."""

  def render_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Build the colored-text HTML for tokens/weights and display it in the cell output."""
    html = self.to_color_text(tokens, weights, colormap, print_debug, _range, separator=separator)
    display(HTML(html))

  def to_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Return the HTML string produced by HtmlRenderer._to_color_text.

    print_debug is accepted for signature compatibility but is not forwarded.
    """
    # Imported locally: the notebook binds the top-level `matplotlib` name only
    # in a later cell, so relying on the global would raise NameError when this
    # method is called before that cell has run.
    import matplotlib
    return super()._to_color_text(tokens, weights, matplotlib, colormap=colormap, _range=_range, separator=separator)

renderer_ = DemoRenderer()

# Prepare paths


In [None]:


# Resolve the directories used by the rest of the notebook: the configured work
# dir, and the 'training_reports' folder in the grandparent directory of the
# analyser.hyperparams module file.
work_dir = Path(analyser.hyperparams.work_dir)
training_reports_path = Path(analyser.hyperparams.__file__).parent.parent / 'training_reports/'

print(f'{training_reports_path=}')
print(f'{analyser.hyperparams.work_dir=}')
print(f'{work_dir=}')

In [None]:
t_semantic_map_keys_price

# Loading control test set from DB

In [None]:
from integration.db import get_mongodb_connection
from bson import ObjectId

def doc_as_table_row(jd:DbJsonDoc, df, col_suffix='_expected'):
    """Write the attributes of one document into row ``str(jd._id)`` of ``df``.

    Columns written (each name followed by ``col_suffix`` unless noted):
    ``number``, ``date``, ``subject``, an un-suffixed ``subject``,
    ``org-1-<part>`` / ``org-2-<part>`` for every part in ``org_keys``, every
    key of ``t_semantic_map_keys_price``, and ``analysis.version``.

    For each org attribute both values are lower-cased and sorted; the greater
    string is stored under ``org-1`` and the lesser under ``org-2``, so the
    result does not depend on the order of the orgs in the document.

    :param jd: document to read attributes from
    :param df: DataFrame modified in place (rows/columns are created on demand)
    :param col_suffix: suffix appended to the column names
    :return: ``jd``, unchanged
    """
    def _lowered(org, part):
        # A missing org, a missing attribute or an explicit None value all map
        # to '' instead of failing on None.lower().
        return ((org.get(part) or {}).get('value') or "").lower()

    _id = str(jd._id)
    tree = jd.get_attributes_tree()
    df.at[ _id, f'number{col_suffix}'] = jd.get_attribute_value('number')
    df.at[ _id, f'date{col_suffix}']   = jd.get_attribute_value('date')
    df.at[ _id, f'subject{col_suffix}']= jd.get_attribute_value('subject')
    df.at[ _id, 'subject']= jd.get_attribute_value('subject')

    orgs = tree.get('orgs') or []
    o1 = orgs[0] if len(orgs) > 0 else {}
    o2 = orgs[1] if len(orgs) > 1 else {}

    for part in org_keys:
        vv = sorted([_lowered(o1, part), _lowered(o2, part)])
        df.at[ _id, f'org-1-{part}{col_suffix}'] = vv[1]
        df.at[ _id, f'org-2-{part}{col_suffix}'] = vv[0]

    # NOTE(review): these placeholders skip index 4 and every one of them is
    # overwritten by the loop right below; presumably they only seed the
    # columns -- confirm before removing.
    for v in t_semantic_map_keys_price[0:4]:
        df.at[ _id, f'{v}{col_suffix}'] = -1
    for v in t_semantic_map_keys_price[5:]:
        df.at[ _id, f'{v}{col_suffix}'] = ''

    price = tree.get('price') or {}
    for v in t_semantic_map_keys_price:
        df.at[ _id, f'{v}{col_suffix}'] = (price.get(v) or {}).get('value')

    df.at[ _id, f'analysis.version{col_suffix}'] = jd.analysis['version']
    return jd



def load_doc_as_table_row(documents_collection, _id, df, col_suffix='_expected'):
    """Fetch the document with the given id and write it into ``df``.

    Looks the document up in ``documents_collection`` by ``ObjectId(_id)``,
    wraps it in ``DbJsonDoc`` and delegates to ``doc_as_table_row``.

    :return: the ``DbJsonDoc``, or ``None`` (after printing a notice) when the
             id is not present in the collection
    """
    record = documents_collection.find_one({'_id': ObjectId(_id)})
    if record is not None:
        json_doc = DbJsonDoc(record)
        doc_as_table_row(json_doc, df, col_suffix)
        return json_doc

    print (f'{_id} not found in db')
    return None

In [None]:


# Connect to MongoDB and select the collection named by COLLECTION_NAME.
db = get_mongodb_connection()
documents_collection = db[COLLECTION_NAME]



if USE_CONTROL_SET:
    # Control set = documents of the listed types that are marked with
    # subset == 'CONTROL_TEST'.
    query = {
      '$and': [
        {"parse.documentType":{ '$in': ["AGREEMENT", "CONTRACT", "SUPPLEMENTARY_AGREEMENT"] }  },
        { 'subset': 'CONTROL_TEST'}
      ]
    }

    # Only a few fields are projected here (at most 5000 hits); each document
    # is fetched again in full by load_doc_as_table_row below.
    res = documents_collection.find(filter=query,
                                    projection={'_id': True, 'user.updateDate':True, 'state':True, 'parse.documentType':True}
                                   ).limit(5000)


    # One row per control document, indexed by the document id as a string.
    test_meta = DataFrame()

    for i in res:
        load_doc_as_table_row(documents_collection, str(i["_id"]), test_meta)

    # Weights are assigned after the rows exist so that every row receives them.
    test_meta['sample_weight'] = 1
    test_meta['subject_weight'] = 1

    # test_ids  = [i["_id"] for i in res]
    # print(len(test_ids))

    _s = f"#### {len(test_meta)} -- total test docs in {COLLECTION_NAME} collection"
    display(Markdown(_s))

    # mlflow.log_param('test set', len(test_meta))

In [None]:
from trainsets.retrain_contract_uber_model import save_contract_data_arrays
from IPython.display import clear_output


def recreate_data_point(_id:str, test_meta):
    """Re-create the stored data arrays of one document and record the outcome.

    The document is read from ``documents_collection`` and passed to
    ``save_contract_data_arrays``. On success the row ``_id`` of ``test_meta``
    gets ``valid=True`` and an empty ``error``; on any exception the exception
    is logged with its traceback and the row gets ``valid=False`` together
    with the exception text.
    """
    try:
        record = documents_collection.find_one({'_id': ObjectId(_id)})
        save_contract_data_arrays(DbJsonDoc(record))

        test_meta.at[_id, 'valid'] = True
        test_meta.at[_id, 'error'] = ''

    except Exception as failure:
        logger.exception(failure)

        test_meta.at[_id, 'valid'] = False
        test_meta.at[_id, 'error'] = str(failure)
            
                        
# Validate every control-set document; a document that fails validation is
# marked invalid and then given one chance to be rebuilt by recreate_data_point,
# which overwrites 'valid'/'error' with the outcome of the rebuild.
if USE_CONTROL_SET:
    test_meta['error']=''
    test_meta['valid']=True


    for k, i in enumerate(test_meta.index):
      print(i, 'validating....')
      # Clear the cell output every 10 documents to keep it short.
      if k % 10 == 0:
          clear_output(wait=True)

      try:
        validate_datapoint(str(i), test_meta)
        print(i, 'is ok')
      except Exception as e:
        logger.error(e)
        # logger.exception(e)

        test_meta.at[i, 'valid'] = False
        test_meta.at[i, 'error'] = str(e)

        recreate_data_point(str(i), test_meta)
    

# Loading data set meta

In [None]:
# Alternative to the control set: use the contract trainset meta table
# managed by UberModelTrainsetManager.
if not USE_CONTROL_SET:

    umtm = UberModelTrainsetManager (work_dir, training_reports_path)
    umtm.load_contract_trainset_meta()
    # `stats` is the same object as umtm.stats, so the columns set here are
    # visible through umtm.stats as well.
    stats = umtm.stats
    stats['sample_weight']  = -1.0 #TODO: describe why?
    stats['subject_weight'] = -1.0

    _s = f"#### {len(stats)} -- total records in contract_trainset_meta"
    display(Markdown(_s))

## Validate data set 

In [None]:
# Validate every record of the trainset meta; failing records are marked
# invalid and then rebuilt via recreate_data_point.
if not USE_CONTROL_SET:

    # NOTE(review): 'valid' is not initialised here; the next cell filters on
    # stats['valid'], so the column presumably comes from the loaded meta or
    # from validate_datapoint -- confirm.
    # stats['valid'] = True
    stats['error'] = ''

    for i in stats.index:

      try:
        validate_datapoint(str(i), stats)

      except Exception as e:
        logger.error(e)

        stats.at[i, 'valid'] = False
        stats.at[i, 'error'] = str(e)

        recreate_data_point(str(i), stats)



In [None]:
# Summarise the validation: number of invalid records with their error
# messages, then the subset of valid records.
if not USE_CONTROL_SET:
    display(Markdown(f'####  {len(stats[stats.valid == False])} invalid records'))
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so the error column is displayed explicitly.
    display(stats.loc[stats.valid == False, 'error'])

    stats_valid = stats[stats['valid']]
    display(Markdown(f'####  {len(stats_valid)} valid records'))


## [Debug] Reading legacy docs from json

In [None]:
# The debug cells below run only when DEBUG is set and this legacy dump exists.
fn = work_dir / 'documents.json'
legacy_json_exists = fn.is_file()

if DEBUG and legacy_json_exists:

    with open(fn) as file:
        # json_util.object_hook is passed as the JSON object hook for decoding.
        file_data = json.load(file, object_hook=json_util.object_hook)
        display(Markdown(f'#### {len(file_data)} total docs in {fn}'))
    

### [Debug] Validating legacy data set

In [None]:
# Meta table for the legacy json documents; it stays empty unless the debug
# branch below fills it.
json_file_meta = DataFrame()

if DEBUG and legacy_json_exists:

    docs = {}
    errors = 0

    for fd in file_data:
        try:
            # validate_datapoint(str(fd['_id']), json_file_meta)
            jd = DbJsonDoc(fd)
            docs [fd['_id']] =  jd
            doc_as_table_row(jd, json_file_meta, col_suffix='' )

        except NameError:
            # A NameError means the notebook itself is broken (missing
            # definition), not the document: let it propagate untouched.
            raise

        except KeyError as e:
            errors += 1
            logger.error(f'No key in json_file_meta: {str(e)}')

        except Exception as e:
            errors += 1

            logger.error(f'{type(e)}, {str(e)}')
            traceback.print_exc()

    display(Markdown(f'#### {errors} invalid docs in in {fn}'))
    display(Markdown(f'#### {len( list( docs.values() )  )} valid docs in {fn}'))

# The weights are assigned after the rows have been added: a scalar assigned
# to an empty frame creates empty columns, and rows appended afterwards would
# get NaN weights instead of 1.
json_file_meta['sample_weight']=1
json_file_meta['subject_weight']=1

## [Debug] Get sample doc

In [None]:
if DEBUG and legacy_json_exists:
    # The second successfully loaded document is used as the sample
    # (index 1, so at least two valid documents are required).
    a_doc_from_json = list(docs.values())[1]
    print(a_doc_from_json.get_tokens_map_unchaged().text[:230])

    # NOTE(review): the returned tree is neither stored nor displayed here.
    a_doc_from_json.get_attributes_tree()

In [None]:
%matplotlib inline



if DEBUG and legacy_json_exists:
    SAMPLE_DOC_ID = str(a_doc_from_json.get_id()) # stats_valid.index[0]

    print(f'{SAMPLE_DOC_ID=}')

    # make_xyw returns three pairs: (embeddings, token features),
    # (semantic map, subject), (sample weight, subject weight).
    (emb, tok_f), (sm, subj), (sample_weight, subject_weight) = make_xyw(SAMPLE_DOC_ID, json_file_meta)


    print(f'semantic map shape {sm.shape=}')
    # Only the first _crop rows of each array are plotted.
    _crop = 700
    plot_embedding(tok_f[:_crop], title=f'Tokens features {SAMPLE_DOC_ID}') 
    plot_embedding(emb[:_crop], title=f'Embedding {SAMPLE_DOC_ID}') 
    plot_embedding(sm[:_crop], title=f'Semantic map {SAMPLE_DOC_ID}')

# Init Model 🦖

In [None]:
# TEST_FLOW may already be defined before this cell runs; default to False otherwise.
globals().setdefault('TEST_FLOW', False)
print(f'{TEST_FLOW=}')

In [None]:

# Use the model factory already present in the global namespace when there is
# one; otherwise fall back to make_att_model.
if 'model_factory_fn' in globals():
    print ('*'*80)
    print('model_factory_fn defined by external process as', model_factory_fn.__name__)
else:
    from tf_support.super_contract_model import make_att_model
    model_factory_fn = make_att_model

print(f'{model_factory_fn=}')

In [None]:

ctx = KerasTrainingContext(work_dir, session_index=21)
ctx.EVALUATE_ONLY = True


# Reuse a model that already exists in the global namespace; otherwise build
# one with model_factory_fn and load the weights file named after the factory
# function from the training reports directory.
if 'umodel' in globals() and umodel is not None:
    print ('*'*80)
    print('umodel defined as', umodel.name)
else:

    weights = training_reports_path / f'{model_factory_fn.__name__}.h5'
    mlflow.log_param('weights', str(weights))

    # Logger.warn is a deprecated alias (removed in Python 3.13); use warning().
    logger.warning (f'LOADING: {model_factory_fn} -- {weights}')


    umodel = ctx.init_model(model_factory_fn, trained=True, trainable=True, weights=weights)
    mlflow.log_param('model_name', str(umodel.name))
    mlflow.log_param('model_params', umodel.count_params())


# The model is only evaluated in this notebook, so it is frozen in both cases.
umodel.trainable = False
umodel.summary()

# [Debug] Check model on a single doc

## [Debug] Evaluate single doc 

In [None]:
# sample_index = umtm.stats [umtm.stats['value']>0].index[2]

# Run the model on the sample document prepared above and compare the
# predicted semantic map with the expected one.
if DEBUG and legacy_json_exists:
    print(f'{SAMPLE_DOC_ID=}')
    # The inputs get a leading axis of length 1, i.e. a batch with one document.
    prediction = umodel.predict(x=[np.expand_dims(emb, axis=0), np.expand_dims(tok_f, axis=0)], batch_size=1)


    # First model output, first (only) item of the batch.
    tagsmap = pd.DataFrame(prediction[0][0], columns=semantic_map_keys_contract)
    tagsmap_e = pd.DataFrame(sm, columns=semantic_map_keys_contract)
    delta = tagsmap - tagsmap_e 
    # .T
    plot_embedding(tagsmap[:_crop], f'Predicted Semantic Map {tagsmap.shape}')
    plot_embedding(delta[:_crop], title=f'DELTA Semantic map {tagsmap_e.shape}')
    plot_embedding(tagsmap_e[:_crop], title=f'EXPECTED Semantic map {tagsmap_e.shape}')

In [None]:
if DEBUG and legacy_json_exists:
    # Total absolute difference, first relative to the sum of the expected map, then absolute.
    print("mean delta", delta.abs().sum().sum() / tagsmap_e.sum().sum())
    print("sum of delatas", delta.abs().sum().sum())

In [None]:
import matplotlib
if DEBUG and legacy_json_exists:
    # Per-token weight = maximum over all tag columns of the predicted map.
    av = tagsmap.max(axis=1) #tagsmap['amount-begin'] + tagsmap['vat-begin'] + tagsmap['number-begin'] + tagsmap['org-name-begin']

    # av = tagsmap.sum(axis=1)
    # Render the first 600 tokens colored by that weight.
    renderer_.render_color_text(a_doc_from_json.get_tokens_map_unchaged().tokens[:600], av[:600])

### [Debug mode only] Getting tag values from inferred semantic map

In [None]:
# Audit context passed to nn_find_org_names in the cells below.
ac = AuditContext()

#### Orgs

In [None]:
# Print name, type and alias of up to two organisations found in the sample
# document, with an empty line between the two.
if DEBUG and legacy_json_exists:
    cas = nn_find_org_names(a_doc_from_json.get_tokens_map_unchaged(), tagsmap, ac)
    if cas:
        for position in range(min(len(cas), 2)):
            if position > 0:
                print()
            for field in ('name', 'type', 'alias'):
                print(getattr(cas[position], field))

#### Date/number

In [None]:
# Extract the contract date and number tags from the predicted map and print
# their values when found.
if DEBUG and legacy_json_exists:
    date_tag = nn_get_contract_date     (a_doc_from_json.get_tokens_map_unchaged(), tagsmap) 
    number_tag = nn_get_contract_number (a_doc_from_json.get_tokens_map_unchaged(), tagsmap)
    if date_tag:
        print( f'{date_tag.value=}')
    if number_tag:
        print( f'{number_tag.value=}' )


    attention = tagsmap['date' + '-begin'].values.copy()

    # NOTE(review): threshold is computed but not used in this cell.
    threshold = max(attention.max() * 0.8, 0.1)
    print(f'{attention.max()=}')

#### Amount

In [None]:
if DEBUG and legacy_json_exists:
    # Token map of the sample document, used by the amount cell below.
    textmap = a_doc_from_json.get_tokens_map_unchaged()

In [None]:
# Find contract values in the sample document and print the first one: its
# span, its child tags and the individual price components.
if DEBUG and legacy_json_exists:
    #---
    cps = nn_find_contract_value(textmap, tagsmap)
    if cps:
      print(str(cps[0].get_span()))
      for k in cps[0].list_children():
        print(str(k))

      print()
      print()

      print('brutto', str(cps[0].amount_brutto))
      print('netto', str(cps[0].amount_netto))
      print('amount', str(cps[0].amount))
      print('vat', str(cps[0].vat))
    else:
      print('nothing found')

#### Misc.

In [None]:
if DEBUG and legacy_json_exists:
    # Per-tag value equal to 80% of that tag's maximum in the predicted map.
    thresholds = dict(tagsmap.max()*.8)
    # NOTE(review): a bare expression inside `if` is not rendered by the notebook.
    thresholds

#### Subj

In [None]:
if DEBUG and legacy_json_exists:
    # Request a single 'subject' tag (limit=1, return_single=True) from the predicted map.
    subject_tag = nn_get_tag_values('subject',  a_doc_from_json.get_tokens_map_unchaged(), tagsmap, max_tokens=200, threshold=0.02, limit=1, return_single=True)

    print(subject_tag)

# Reporting

In [None]:

# Work on a copy of the meta table of the selected data source.
ev = None

if not USE_CONTROL_SET:
    ev =   umtm.stats.copy()
else:
    ev = test_meta.copy()


tags =          pd.DataFrame()
errors_report = pd.DataFrame()


# Candidate documents before the validity filter. The control set is used as
# is; the trainset meta is narrowed down first.
if USE_CONTROL_SET:
    candidates = ev
else:
    candidates = ev[ev.unseen==False]
    candidates = candidates[candidates.source=='db']
    candidates = candidates[candidates.score < 50000]

# .copy() makes userdocs an independent frame: later cells add columns and
# write cells into it, which on a filtered slice triggers SettingWithCopyWarning.
userdocs = candidates[candidates['valid']==True].copy()


display(Markdown(f'#### {len(userdocs)} (userdocs) in total for evaluation'))
mlflow.log_param('test set', len(userdocs))

# Count invalid docs against the candidates of the active data source;
# test_meta does not exist when USE_CONTROL_SET is False, so referring to it
# here would raise NameError in that mode.
n_invalid = len(candidates) - len(userdocs)
if n_invalid != 0:
    mlflow.log_param('test set invalid', n_invalid)
display(Markdown(f'#### {n_invalid} invalid docs'))

In [None]:


import seaborn as sns
%matplotlib inline
# Plot how often each subject occurs in the evaluation set, save the figure to
# the training reports directory and attach it to the MLflow run. The plot is
# best-effort: any failure is logged with its traceback and the notebook continues.
try:
    cnt = userdocs['subject'].value_counts()

    plt.figure(figsize=(12, 6 ))
    sns.barplot(x=cnt.values, y=cnt.index)

    print(  cnt )

    plt.title(f'test: Frequency Distribution of subjects; {len(userdocs)} total')
    plt.xlabel('Number of Occurrences')
    
    plt.savefig( training_reports_path / 'Distribution of subjects -test.png', bbox_inches='tight', pad_inches=0)
    plt.show()

    mlflow.log_artifact(training_reports_path / 'Distribution of subjects -test.png')


except Exception as e:
    logger.exception(e)

In [None]:

# Reset the expected/predicted columns before the prediction loop fills them:
# string columns for every semantic key ...
for v in semantic_map_keys:
    for s in ['_expected', '_predicted']:
        userdocs[f'{v}{s}'] = ''
        userdocs[f'{v}{s}'] = userdocs[f'{v}{s}'].astype(str)

# ... and float columns (NaN = no value yet) for the numeric keys.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for v in semantic_keys_numeric:
    for s in ['_expected', '_predicted']:
        userdocs[f'{v}{s}'] = np.nan
        userdocs[f'{v}{s}'] = userdocs[f'{v}{s}'].astype(float)
        


In [None]:
# Number of documents sent to the model per predict() call.
batch_size = 96
# Length limit passed to UberModelTrainsetManager.trim_maxlen in the prediction loop.
maxlen = 128 * 12


# Audit context passed to nn_find_org_names by interpret_prediction.
ac = AuditContext()

def interpret_prediction(_id, tagsmap, df):
    """Decode a predicted tag map into attribute values and store them in ``df``.

    The document is (re)loaded through ``load_doc_as_table_row``, which also
    writes its ``*_expected`` columns into row ``_id``. Then the org, price,
    number and date values found in ``tagsmap`` are written into the matching
    ``*_predicted`` columns of the same row.

    Org attributes are lower-cased and sorted per attribute (greater string
    under ``org-1``, lesser under ``org-2``), the same way the expected values
    are stored, so the two sides are comparable.

    Does nothing when the document is not found in the database.
    """
    col_suffix="_predicted"

    def _lowered(org, part):
        # A missing org, a missing tag or a tag whose value is None all map to ''
        # instead of failing on None.lower().
        tag = getattr(org, part, None)
        return (getattr(tag, 'value', None) or '').lower()

    doc = load_doc_as_table_row(documents_collection, _id, df)
    if doc is None:
        return

    tokens = doc.get_tokens_map_unchaged()

    # ORGS ------------------
    # The finder result is treated as possibly empty/None.
    orgs = nn_find_org_names(tokens, tagsmap, ac) or []
    o1 = orgs[0] if len(orgs) > 0 else None
    o2 = orgs[1] if len(orgs) > 1 else None

    for part in ['name', 'alias', 'type']:
        vv = sorted([_lowered(o1, part), _lowered(o2, part)])
        df.at[ _id, f'org-1-{part}{col_suffix}'] = vv[1]
        df.at[ _id, f'org-2-{part}{col_suffix}'] = vv[0]

    # PRICE ------------------
    # Only the first contract value found is used.
    cps = nn_find_contract_value(tokens, tagsmap)
    if cps:
        cps=cps[0]
        for v in t_semantic_map_keys_price:
            part = getattr(cps, v)
            if part:
                df.at[ _id, f'{v}{col_suffix}'] = part.value

    # DATE NUMBER ------------------
    number_tag = nn_get_contract_number(tokens, tagsmap)
    date_tag =   nn_get_contract_date  (tokens, tagsmap)

    # A missing tag is stored as an empty string.
    if number_tag:
        df.at[ _id, f'number{col_suffix}'] = str(number_tag.value)
    else:
        df.at[ _id, f'number{col_suffix}'] = ''

    if date_tag:
        df.at[ _id, f'date{col_suffix}'] = str(date_tag.value)
    else:
        df.at[ _id, f'date{col_suffix}'] = ''
 


In [None]:
# Documents to run through the model: everything, or the first 20 in test mode.
validation_set = userdocs

if TEST_FLOW:
    validation_set = userdocs[0:20]


for i in range(0, len(validation_set), batch_size):
    # Batches are sliced from validation_set (not from userdocs), so that the
    # TEST_FLOW limit really restricts the documents being processed.
    batch = validation_set[i:i+batch_size]
    actual_batch_size=len(batch)
    print(f'{actual_batch_size=}')
    batch_input_emb=[]
    batch_input_token_f=[]
    for _id in batch.index.values:

        # Build the model inputs of one document and trim them to maxlen.
        dp = make_xyw(_id, userdocs)
        dp = UberModelTrainsetManager.trim_maxlen(dp, 0, maxlen  )
        (emb, tok_f), (sm, subj), (sample_weight, subject_weight) = dp

        batch_input_emb.append(emb)
        batch_input_token_f.append(tok_f)

        # The expected subject is decoded from the stored target vector.
        expected_subj =  decode_subj_prediction(subj)[0]
        userdocs.at[_id, 'subject_expected'] = expected_subj.name


    del _id
    del dp


    dps =  [np.array(batch_input_emb), np.array(batch_input_token_f)]


    predictions = umodel.predict(x=dps, batch_size=actual_batch_size)
    del batch_input_emb
    del batch_input_token_f

    # Output 0 holds the tag maps, output 1 the subject predictions; both are
    # indexed by the position of the document inside the batch.
    p_tags = predictions[0]
    p_subj = predictions[1]
    for k,_id in enumerate(batch.index.values):

        tagsmap = pd.DataFrame(p_tags[k], columns=semantic_map_keys_contract)
        interpret_prediction(_id, tagsmap, userdocs)

        subj_1hot = p_subj[k]

        predicted = decode_subj_prediction(subj_1hot)[0]
        userdocs.at[_id, 'subject_predicted'] = predicted.name

userdocs[
  ['org-1-type_expected', 'org-1-type_predicted', 'org-1-name_expected', 'org-1-name_predicted', 'org-1-alias_expected', 'org-1-alias_predicted', 
   'org-2-type_expected', 'org-2-type_predicted', 'org-2-name_expected', 'org-2-name_predicted', 'org-2-alias_expected', 'org-2-alias_predicted', 
   'subject_expected', 'subject_predicted', 'date_expected',
   'date_predicted', 'number_expected', 'number_predicted']].head(10)



### Clean predicted/expected values

In [None]:

# Normalise the numeric expected/predicted columns to integers: empty strings
# and missing values become -1. np.nan is used because the np.NaN alias was
# removed in NumPy 2.0.
columns = []
for v in semantic_keys_numeric:
    for s in ['_expected', '_predicted' ]:
        col = f'{v}{s}'
        userdocs[col] = userdocs[col].replace('', np.nan).replace(np.nan, -1).astype(int)
        columns.append(col)

userdocs['date_predicted'] = userdocs['date_predicted'].astype(str)
userdocs['date_expected'] = userdocs['date_expected'].astype(str)
# userdocs[columns]

In [None]:
# Textual placeholders for "no value" are unified to the empty string ...
userdocs=userdocs.replace('None', '')
userdocs=userdocs.replace('none', '')
userdocs=userdocs.replace('nan', '')

# ... and so are real missing values in the string-valued attribute columns.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for _key in ('date', 'number', 'vat_unit', 'vat', 'sign', 'currency'):
    for _suffix in ('_expected', '_predicted'):
        _col = f'{_key}{_suffix}'
        userdocs[_col] = userdocs[_col].replace(np.nan, '')

# userdocs[['date_expected', 'date_predicted', 'number_expected', 'number_predicted']]

In [None]:
def select_wrong(userdocs, key):
    """Select the rows where the predicted value of ``key`` differs from the expected one.

    :param userdocs: DataFrame with the columns ``{key}_expected`` and ``{key}_predicted``
    :param key: attribute name without suffix
    :return: ``(s, p)`` where ``s`` holds the two columns of the mismatching
             rows and ``p`` is the share of mismatching rows among all rows;
             ``p`` is ``0.0`` for an empty frame (instead of dividing by zero)
    """
    expected_col = f'{key}_expected'
    predicted_col = f'{key}_predicted'

    mismatch = userdocs[expected_col] != userdocs[predicted_col]
    s = userdocs.loc[mismatch, [expected_col, predicted_col]]

    total = len(userdocs)
    p = float(len(s)) / total if total else 0.0
    return s, p

# Tags report
## accuracy report

In [None]:
# Per-attribute accuracy: for every evaluated key the share of documents whose
# predicted value equals the expected one. Every mismatch also increments the
# per-document error counter.
userdocs['number_of_errors'] = 0
report = pd.DataFrame()

evaluated_keys = t_semantic_map_keys_common[1:] + t_semantic_map_keys_price + org_numbered_keys

n   = 0
avg = 0

for k in evaluated_keys:
    s, p = select_wrong(userdocs, k)
    for _id in s.index:
        userdocs.at[_id, 'number_of_errors'] += 1

    acc = 1.0 - p
    mlflow.log_metric(f"accuracy_{k}", acc)
    report.at[k, 'accuracy'] = f"{acc:.1%}"

    avg += acc
    n += 1

# Unweighted mean over all evaluated keys.
avg = avg / n

report.at['ALL', 'accuracy'] = f"{avg:.1%}"
mlflow.log_metric("accuracy_TAGS", avg)

report.to_csv(training_reports_path /'attributes_accuracy.csv')
mlflow.log_artifact(training_reports_path / 'attributes_accuracy.csv')

# 81.6%
report

## Worst docs list

In [None]:
userdocs[['number_of_errors']].sort_values('number_of_errors', ascending=False)[:30]

# Subjects predictions reports

In [None]:
ev = userdocs.copy()

In [None]:
# Keep only the rows where both the predicted and the expected subject are
# present. One combined mask is used: indexing an already filtered frame with
# a mask built on the unfiltered one makes pandas warn and reindex the mask.
subj_pred = ev[pd.notna(ev.subject_predicted) & pd.notna(ev.subject_expected)]
subj_df = subj_pred[['subject_predicted', 'subject_expected']].copy()

In [None]:


# Sorted distinct expected subject names.
labels = sorted(np.unique(subj_df['subject_expected'].values))

In [None]:
# Classification metrics for the subject prediction, as a dict. Note that this
# rebinds `report`, which held the attributes accuracy table until now.
report = classification_report(subj_df.subject_expected, subj_df.subject_predicted, digits=3, output_dict=True)
report['weighted avg']
# Log the weighted averages twice: once under the keys of the report dict and
# once under explicit subject_* metric names.
mlflow.log_metrics(report['weighted avg'])
mlflow.log_metric('subject_F1',report['weighted avg']['f1-score'])
mlflow.log_metric('subject_F1_support',report['weighted avg']['support'])
mlflow.log_metric('subject_precision',report['weighted avg']['precision'])
mlflow.log_metric('subject_recall',report['weighted avg']['recall'])

In [None]:


def make_report(umodel, subj_df):
  """Produce the subject classification artifacts for ``umodel``.

  Plots the confusion matrix of expected vs. predicted subjects and saves it
  as a PNG, prints the textual classification report and writes it to a text
  file. Both files are named after ``umodel.name``, stored in
  ``training_reports_path`` and logged as MLflow artifacts.
  """
  model_name = umodel.name
  expected = subj_df.subject_expected
  predicted = subj_df.subject_predicted

  # Confusion matrix image
  plot_cm(expected.values, predicted.values, figsize=(12, 12))
  img_path = training_reports_path / f'subjects-confusion-matrix-{model_name}.png'
  plt.savefig(img_path, bbox_inches='tight')
  mlflow.log_artifact(img_path)

  # Textual report
  report = classification_report(expected, predicted, digits=3)
  print(model_name, report, sep='\n')

  fn = training_reports_path / f'subjects-classification_report-{model_name}.txt'
  with open(fn, "w") as text_file:
    text_file.write(report)

  mlflow.log_artifact(fn)


make_report(umodel, subj_df)

In [None]:
# End the MLflow run only if this notebook started it (sub_mlflow_run is set
# in the setup cell when no run was active).
if sub_mlflow_run is not None:
    mlflow.end_run()