In [None]:
import logging
import platform
import sys
import os

import numpy as np

from collections import Counter


from IPython.display import display, Markdown
from datetime import datetime
 

# Notebook-wide logger. logging.getLogger() hands back the same object every
# time this cell is executed, so the handler is attached only once; otherwise
# each re-run of the cell would add another StreamHandler and every message
# would be printed several times.
logger = logging.getLogger('retrain_ipynb')
logger.setLevel(logging.DEBUG)

if not logger.handlers:
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(levelname)s - %(asctime)s - %(name)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
logger.debug('--=logging started=--')
  

In [None]:
# Put the parent of the current working directory on sys.path (once), so the
# project packages imported below can be resolved from there.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
import analyser.hyperparams 


### Imports...

In [None]:

from bson import ObjectId

import gpn_config
import mlflow

from pandas import DataFrame

from pathlib import Path

from tf_support.super_contract_model import get_semantic_map_new, \
        semantic_map_keys_contract, t_semantic_map_keys_common, t_semantic_map_keys_org, t_semantic_map_keys_price

from analyser.persistence import DbJsonDoc

    

# Collection to read documents from; a value already present in the notebook
# namespace (e.g. injected before this cell runs) wins over the default.
COLLECTION_NAME = globals().get('COLLECTION_NAME', 'documents')
print(f'documents {COLLECTION_NAME=}')
    

def get_doc_by_id (objid):
    """Fetch one raw document from the ``COLLECTION_NAME`` collection by its id.

    Args:
        objid: anything whose ``str()`` is accepted by ``bson.ObjectId``.

    Returns:
        Whatever ``find_one`` returns for ``{'_id': ObjectId(...)}``
        (with pymongo that is the document dict, or None when nothing matches).
    """
    # Imported here on purpose: elsewhere in the notebook this name is only
    # imported inside the branch that queries the DB for ids, so when
    # `user_docs_ids` is supplied from outside it would otherwise be undefined.
    from integration.db import get_mongodb_connection

    db = get_mongodb_connection()
    documents_collection = db[COLLECTION_NAME]
    return documents_collection.find_one({'_id': ObjectId(str(objid))})

# Init MLflow

In [None]:

# mlflow.start_run(run_name='fetch trainset from db')
# Point the mlflow client at the tracking server taken from the project config.
ml_flow_url = gpn_config.configured('MLFLOW_URL')
mlflow.set_tracking_uri(ml_flow_url)
print(f'{ml_flow_url=}', 'set MLFLOW_URL env var to re-define')

# Select the experiment and open the run that the metric/artifact logging
# further down writes into; the run is closed by mlflow.end_run() in the last cell.
mlflow.set_experiment("–ü—Ä–æ–≤–µ—Ä–∫–∞ –∫–∞—á–µ—Å—Ç–≤–∞ —Ä–∞–∑–º–µ—Ç–∫–∏")
# NOTE(review): nested=True presumably lets this notebook run as a child of a
# run started by a calling pipeline -- confirm against the caller.
active_mlflow_run = mlflow.start_run(nested=True)


In [None]:

# Directory the CSV reports are written to; it is combined with file names via
# the `/` operator below, so it is presumably a pathlib.Path -- verify in analyser.hyperparams.
reports_path = analyser.hyperparams.reports_path
 

# 🤦 Find markup errors

In [None]:
%%time
# del user_docs_ids

# File-name stem of the CSV report; a value already present in the notebook
# namespace overrides the default.
errors_report_file_prefix = globals().get('errors_report_file_prefix', "user_markup_errors")

# Full path of the CSV report written (and logged as an mlflow artifact) later on.
reports_fn = reports_path / f"{errors_report_file_prefix}.csv"


    
# Ids of the documents to validate: either supplied from outside (already
# present in the notebook namespace) or queried from the database.
if 'user_docs_ids' in globals():
    user_docs_ids = globals()['user_docs_ids']
    print('Number of contract IDs for validation (set ouside) is', len(user_docs_ids))
else:
    print('Query DB for all user docs')

    from integration.db import get_mongodb_connection
    from pymongo import ASCENDING

    # Contract-like documents that carry a user-made attributes tree.
    query = {
      '$and': [
        {"parse.documentType":{ '$in': ["AGREEMENT", "CONTRACT", "SUPPLEMENTARY_AGREEMENT"] }  },
        {"user.attributes_tree": {"$ne": None}},
      ]
    }

    db = get_mongodb_connection()
    # Same collection that get_doc_by_id() reads from; a hard-coded 'documents'
    # here would list ids from one collection and fetch them from another
    # whenever COLLECTION_NAME is overridden.
    documents_collection = db[COLLECTION_NAME]
    sorting = [('analysis.analyze_timestamp', ASCENDING), ('user.updateDate', ASCENDING)]
    res = list(
        documents_collection.find(
            filter=query,
            sort=sorting,
            projection={'_id': True, 'user.updateDate':True, 'state':True, 'parse.documentType':True},
        ).limit(5000)
    )

    user_docs_ids = [i["_id"] for i in res]
    display(Markdown(f"#### {len(res)} -- –í—Å–µ–≥–æ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ —Ä–∞–∑–º–µ—á–µ–Ω—ã —á–µ–ª–æ–≤–µ–∫–æ–º"))
    

In [None]:



def get_span_val(jd:DbJsonDoc, tag):
    """Return the document text covered by ``tag['span']``.

    The span defaults to ``[0, 0]`` when the tag has no 'span' key.
    Returns None when the span has zero length, otherwise the result of
    ``text_range(span)`` on the document's tokens map.
    """
    tokens_map = jd.get_tokens_map_unchaged()
    bounds = tag.get('span', [0,0])
    width = bounds[1] - bounds[0]
    return tokens_map.text_range(bounds) if width != 0 else None



def add_error(userdocs, _id, error_message_tuple ):
    """Record one or several validation findings for document ``_id`` in ``userdocs``.

    ``error_message_tuple`` may be:
      * a falsy value (None, empty list, empty string) -- nothing is recorded;
      * a ``(message, missing, severity)`` triple;
      * any other value, typically a plain string -- recorded as ``str(value)``
        with ``missing=False`` and ``severity=1``;
      * a list of the above.

    For every finding the 'errors count' cell is incremented, 'errors severity'
    grows by ``severity`` and the message is prepended (joined with ';' + newline)
    to 'error missing' when ``missing`` equals True, otherwise to 'error'.
    Cells that do not exist yet, or hold NaN/None, are treated as 0 / ''.

    Raises:
        Exception: carrying ``_id`` and chained to the original error, when a
            finding cannot be written.
    """
    # Local import: the notebook itself only imports DataFrame from pandas.
    from pandas import isna

    def _cell(column, default):
        # Current value of the cell, or `default` when the row/column is not
        # there yet or the cell is empty (NaN/None).
        if _id in userdocs.index and column in userdocs.columns:
            value = userdocs.at[_id, column]
            if not isna(value):
                return value
        return default

    if isinstance(error_message_tuple, list):
        _errors = error_message_tuple
    elif error_message_tuple:
        _errors = [error_message_tuple]
    else:
        _errors = []

    for item in _errors:
        # Only a real triple is unpacked. A 3-character *string* (for example
        # str(KeyError('x')) == "'x'") also has len() == 3 and must be kept as
        # a plain message instead of being split into characters.
        if isinstance(item, (tuple, list)) and len(item) == 3:
            error_message, missing, severity = item
        else:
            error_message = str(item)
            missing = False
            severity = 1

        try:
            userdocs.at[_id, 'errors count'] = _cell('errors count', 0) + 1
            userdocs.at[_id, 'errors severity'] = _cell('errors severity', 0) + severity

            target = 'error missing' if missing == True else 'error'
            userdocs.at[_id, target] = ';\n'.join([error_message, _cell(target, '')])
        except Exception as exc:
            print(item)
            raise Exception(_id) from exc

In [None]:



def validate_date(jd:DbJsonDoc, df, i=0) -> str or None:
    """Check the 'date' attribute of a document.

    Writes the quoted text and the tag's 'value' into ``df`` (row = document id)
    when a quote exists. ``i`` is accepted but not used.

    Returns:
        A ``(message, missing, severity)`` triple for the first problem found,
        or None when there is nothing to report.
    """
    doc_key = str(jd._id)
    date_tag = jd.get_attributes_tree().get('date', {})
    quote = get_span_val(jd, date_tag)

    if not quote:
        return "–î–∞—Ç–∞ –æ—Ç—Å—É—Ç—Å–≤—É–µ—Ç", True, 1

    df.at[doc_key, '–î–∞—Ç–∞'] = quote
    df.at[doc_key, 'Date val.'] = date_tag.get('value')

    # (condition, finding) pairs in priority order; the first hit is reported
    checks = (
        ('\n' in quote, ("–î–∞—Ç–∞ —Å–æ–¥–µ—Ä–∂–∏—Ç –ø–µ—Ä–µ–Ω–æ—Å —Å—Ç—Ä–æ–∫–∏", False, 2)),
        (len(quote) > 25, ("–î–∞—Ç–∞ —Å–ª–∏—à–∫–æ–º –¥–ª–∏–Ω–Ω–∞—è", False, 4)),
    )
    return next((finding for failed, finding in checks if failed), None)
    
    
def validate_number(jd:DbJsonDoc, df, i=0) -> str or None:
    """Check the 'number' attribute of a document.

    Writes the quoted text into ``df`` (row = document id) when a quote exists.
    ``i`` is accepted but not used.

    Returns:
        A ``(message, missing, severity)`` triple for the first problem found,
        or None when there is nothing to report.
    """
    doc_key = str(jd._id)
    number_tag = jd.get_attributes_tree().get('number', {})
    bounds = number_tag.get('span', [0,0])
    quote = get_span_val(jd, number_tag)

    if not quote:
        return "–ù–æ–º–µ—Ä –æ—Ç—Å—É—Ç—Å–≤—É–µ—Ç", True, 1

    df.at[doc_key, '–ù–æ–º–µ—Ä'] = quote

    trimmed = quote.strip()
    # (condition, finding) pairs in priority order; the first hit is reported
    checks = (
        ('–¥–æ–≥–æ–≤–æ—Ä' in quote.lower(), ("–ù–æ–º–µ—Ä —Å–æ–¥–µ—Ä–∂–∏—Ç —Å–ª–æ–≤–æ", False, 3)),
        ('\n' in quote, ("–ù–æ–º–µ—Ä —Å–æ–¥–µ—Ä–∂–∏—Ç –ø–µ—Ä–µ–Ω–æ—Å —Å—Ç—Ä–æ–∫–∏", False, 2)),
        (trimmed != quote, ("–ù–æ–º–µ—Ä —Å–æ–¥–µ—Ä–∂–∏—Ç –ø—Ä–æ–±–µ–ª—ã –ø–æ –∫—Ä–∞—è–º", False, 2)),
        (trimmed == "‚Ññ", ("–ù–æ–º–µ—Ä –Ω–µ —è–≤–ª—è–µ—Ç—Å—è –Ω–æ–º–µ—Ä–æ–º", False, 3)),
        (bounds[1] - bounds[0] > 5, ("–ù–æ–º–µ—Ä –¥–æ–≥–æ–≤–æ—Ä–∞ –∫–∞–∫–æ–π-—Ç–æ –¥–ª–∏–Ω–Ω—ã–π", False, 2)),
    )
    return next((finding for failed, finding in checks if failed), None)

#     if val.strip()[0]=="‚Ññ":
#         return f"–ù–æ–º–µ—Ä —Å–æ–¥–µ—Ä–∂–∏—Ç –∑–Ω–∞–∫ ‚Ññ"
        
#     if len(val) > 30:
#         return f"–ù–æ–º–µ—Ä —Å–ª–∏—à–∫–æ–º –¥–ª–∏–Ω–Ω—ã–π"
    
    
def validate_alias(jd:DbJsonDoc, df, i=0) -> str or None:
    """Check the 'alias' of the ``i``-th organisation (0-based) of a document.

    Writes the quoted alias into ``df`` (row = document id) when a quote exists.

    Returns:
        A ``(message, missing, severity)`` triple for the first problem found,
        or None when there is nothing to report.
    """
    doc_key = str(jd._id)
    label = f"–ü—Å–µ–≤–¥–æ–Ω–∏–º {i+1}"
    absent = (f"{label} –æ—Ç—Å—É—Ç—Å–≤—É–µ—Ç", True, 1)

    orgs = jd.get_attributes_tree().get('orgs', [{},{}])
    if len(orgs) < i+1:
        return absent

    quote = get_span_val(jd, orgs[i].get('alias', {}))
    if not quote:
        return absent

    df.at[doc_key, label] = quote

    # (condition, finding) pairs in priority order; the first hit is reported
    checks = (
        ('¬´' in quote or '¬ª' in quote, (f"{label} —Å–æ–¥–µ—Ä–∂–∏—Ç –∫–∞–≤—ã—á–∫–∏", False, 2)),
        ('\n' in quote, (f"{label} —Å–æ–¥–µ—Ä–∂–∏—Ç –ø–µ—Ä–µ–Ω–æ—Å —Å—Ç—Ä–æ–∫–∏", False, 2)),
        (len(quote) > 25, (f"{label} —Å–ª–∏—à–∫–æ–º –¥–ª–∏–Ω–Ω—ã–π", False, 5)),
    )
    return next((finding for failed, finding in checks if failed), None)
    
    
def validate_org_name(jd:DbJsonDoc, df, i=0) -> str or None:
    """Check the 'name' of the ``i``-th organisation (0-based) of a document.

    Writes the quoted name into ``df`` (row = document id) when a quote exists.

    Returns:
        A single ``(message, missing, severity)`` triple when the name is
        absent, otherwise a (possibly empty) list of such triples -- one per
        problem found.
    """
    doc_key = str(jd._id)
    label = f"–ù–∞–∏–º–µ–Ω–æ–≤–∞–Ω–∏–µ {i+1}"
    absent = (f"{label} –æ—Ç—Å—É—Ç—Å–≤—É–µ—Ç", True, 1)

    orgs = jd.get_attributes_tree().get('orgs', [{},{}])
    if len(orgs) < i+1:
        return absent

    quote = get_span_val(jd, orgs[i].get('name', {}))
    if not quote:
        return absent

    df.at[doc_key, label] = quote

    has_open = '¬´' in quote
    has_close = '¬ª' in quote
    # every failing check contributes a finding, in this order
    checks = (
        ('\n' in quote, (f"{label} —Å–æ–¥–µ—Ä–∂–∏—Ç –ø–µ—Ä–µ–Ω–æ—Å —Å—Ç—Ä–æ–∫–∏", False, 2)),
        (has_open and not has_close, (f"{label}: –∫–∞–≤—ã—á–∫–∞ –Ω–µ –∑–∞–∫—Ä—ã—Ç–∞", False, 3)),
        (has_close and not has_open, (f"{label}: –∫–∞–≤—ã—á–∫–∞ –Ω–µ –æ—Ç–∫—Ä—ã—Ç–∞", False, 3)),
        (quote[0] == '¬´', (f"{label} —Å–æ–¥–µ—Ä–∂–∏—Ç –∫–∞–≤—ã—á–∫–∏", False, 2)),
    )
    return [finding for failed, finding in checks if failed]
    
    
def validate_org_type(jd:DbJsonDoc, df, i=0) -> str or None:
    """Check the 'type' of the ``i``-th organisation (0-based) of a document.

    Unlike the other validators this one inspects the tag's 'value', not the
    quoted span. A non-empty value is written into ``df`` (row = document id).

    Returns:
        A single ``(message, missing, severity)`` triple when the organisation
        or its 'type' tag is absent, otherwise a (possibly empty) list of such
        triples.
    """
    _id = str(jd._id)
    _prefix = '–§–æ—Ä–º–∞ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω–æ—Å—Ç–∏'

    orgs = jd.get_attributes_tree().get('orgs', [{},{}])

    if len(orgs) < i+1:
        return f"{_prefix} {i+1} –æ—Ç—Å—É—Ç—Å–≤—É–µ—Ç", True, 1

    tag = orgs[i].get('type',{})
    if not tag:
        return f"{_prefix} {i+1} –æ—Ç—Å—É—Ç—Å–≤—É–µ—Ç", True, 1

    # .get() so that a tag without a 'value' key is reported instead of raising KeyError
    val = tag.get('value')

    if not val:
        # Stop here: the checks below index into / search the value and would
        # raise on None or '' -- which used to replace this finding with a
        # generic exception message.
        return [(f"{_prefix} {i+1} –æ—Ç—Å—É—Ç—Å–≤—É–µ—Ç", True, 100)]

    df.at[_id, f'{_prefix} {i+1}'] = val

    errors = list()

    if '\n' in val:
        errors.append((f"{_prefix} {i+1}: —Å–æ–¥–µ—Ä–∂–∏—Ç –ø–µ—Ä–µ–Ω–æ—Å —Å—Ç—Ä–æ–∫–∏", False, 2))

    if '¬´' in val and '¬ª' not in val:
        errors.append(( f"{_prefix} {i+1}: –∫–∞–≤—ã—á–∫–∞ –Ω–µ –∑–∞–∫—Ä—ã—Ç–∞", False, 3))

    if '¬ª' in val and '¬´' not in val:
        errors.append(( f"{_prefix} {i+1}: –∫–∞–≤—ã—á–∫–∞ –Ω–µ –æ—Ç–∫—Ä—ã—Ç–∞", False, 3))

    if val[0]=='¬´':
        errors.append(( f"{_prefix} {i+1}: —Å–æ–¥–µ—Ä–∂–∏—Ç –∫–∞–≤—ã—á–∫–∏", False, 3))

    return errors
    

def validate_subject_len(jd:DbJsonDoc, df) -> str or None:
    """Check the 'subject' attribute of a document.

    Writes 'subject len' (span length) and 'subject kind' (the tag's 'value')
    into ``df`` (row = document id) whenever the tag carries a span or a kind;
    for an over-long subject the first 200 characters of the quote are stored
    in the 'subject' column as well.

    Returns:
        A ``(message, missing, severity)`` triple for the first problem found,
        or None when there is nothing to report.
    """
    _id = str(jd._id)

    tag = jd.get_attributes_tree().get('subject', {})
    span = tag.get('span', [0,0])
    kind = tag.get('value', None)
    subject_len = span[1]-span[0]

    val = get_span_val(jd, tag)

    # A kind was chosen but no text span was marked. This case has to be tested
    # before the generic "missing" check: get_span_val() returns None for an
    # empty span, so testing it afterwards can never be reached.
    if subject_len == 0 and kind is not None:
        df.at[_id,'subject len'] = subject_len
        df.at[_id,'subject kind'] = kind
        return f"—Ä–∞–∑–¥–µ–ª –æ –ø—Ä–µ–¥–º–µ—Ç–µ –¥–æ–≥–æ–≤–æ—Ä–∞ –Ω–µ —É–∫–∞–∑–∞–Ω; –æ–ø—Ä–µ–¥–µ–ª–µ–Ω –∫–∞–∫ [{kind}]", False, 10

    if not val:
        return f"–ø—Ä–µ–¥–º–µ—Ç –æ—Ç—Å—É—Ç—Å–≤—É–µ—Ç", True, 10

    df.at[_id,'subject len'] = subject_len
    df.at[_id,'subject kind'] = kind

    if subject_len > 150:
        df.at[_id,'subject'] = val[:200]
        return f"–ø—Ä–µ–¥–º–µ—Ç –¥–æ–≥–æ–≤–æ—Ä–∞ —Å–ª–∏—à–∫–æ–º –¥–ª–∏–Ω–Ω—ã–π: {subject_len} —Å–ª–æ–≤", False, 5
    
    
def validate_price(jd:DbJsonDoc, df):
    """Check the 'price' attribute of a document and copy its parts into ``df``.

    The quoted text of the price tag and of each present sub-tag (vat,
    vat_unit, amount_netto, amount, amount_brutto, currency) is written into
    ``df`` (row = document id). The surrounding sentence(s) are inspected for
    keywords hinting that a wrong amount was marked.

    Returns:
        A single ``(message, missing, severity)`` triple when the price tag is
        absent, otherwise a (possibly empty) list of such triples.
    """
    errors = list()

    _id = str(jd._id)

    tag = jd.get_attributes_tree().get('price', {})
    if not tag:
        return f"—Å—É–º–º–∞ –¥–æ–≥–æ–≤–æ—Ä–∞ –æ—Ç—Å—É—Ç—Å–≤—É–µ—Ç", True, 1

    span = tag.get('span', [0,0])
    _len = span[1]-span[0]

    tm = jd.get_tokens_map_unchaged()

    price_q = get_span_val(jd, tag)
    df.at[_id,'—Å—É–º–º–∞'] = f"{price_q}"

    # Text from the start of the sentence holding the first token of the span
    # to the end of the sentence holding its last index.
    # NOTE(review): sentence_at_index() presumably returns (start, end) bounds -- confirm.
    sentence_span1 = tm.sentence_at_index( span[0])
    sentence_span2 = tm.sentence_at_index( span[1])
    sentence_span = [ sentence_span1[0], sentence_span2[1]]
    sentence = tm.text_range(sentence_span)
    sentence_lc = sentence.lower()

    amount_name = '—Å—É–º–º–∞ (—Å—Ç–∞—Ä–∞—è)'
    price_name = '—Å—É–º–º–∞ –¥–æ–≥–æ–≤–æ—Ä–∞'

    # --- copy the quoted text of every present sub-tag into its report column
    sub_columns = (
        ('vat', '–Ω–∞–ª–æ–≥'),
        ('vat_unit', 'vat_unit'),
        ('amount_netto', '—Å—É–º–º–∞ –±–µ–∑ –Ω–∞–ª–æ–≥–∞'),
        ('amount', amount_name),
        ('amount_brutto', '—Å—É–º–º–∞ —Å –Ω–∞–ª–æ–≥–æ–º'),
        ('currency', 'currency'),
    )
    parts = {}
    for key, column in sub_columns:
        parts[key] = tag.get(key)
        if parts[key]:
            df.at[_id, column] = f'{get_span_val(jd, parts[key])}'

    vat = parts['vat']
    vat_unit = parts['vat_unit']
    amount_netto = parts['amount_netto']
    amount = parts['amount']
    amount_brutto = parts['amount_brutto']
    currency = parts['currency']

    ## --- validation:

    if vat:
        # get_span_val() yields None for an empty span; treat that (and a quote
        # made only of '-') as "does not start with a digit" instead of crashing
        vat_text = (get_span_val(jd, vat) or '').lstrip('-')
        if not vat_text[:1].isdigit():
            errors.append((f"–Ω–∞–ª–æ–≥ –Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è –Ω–µ —Å —Ü–∏—Ñ—Ä—ã", False, 100))

    #----
    if amount_netto:
        _span = amount_netto.get('span', [0,0])
        if _span[1]-_span[0] > 4:
            errors.append((f"—Å—É–º–º–∞ –±–µ–∑ –Ω–∞–ª–æ–≥–∞ —Å–ª–∏—à–∫–æ–º –¥–ª–∏–Ω–Ω–∞—è, –¥–æ–ª–∂–Ω–∞ —Å–æ–¥–µ—Ä–∂–∞—Ç—å —Ç–æ–ª—å–∫–æ —á–∏—Å–ª–æ", False, 10))

    #----
    if amount_brutto:
        _span = amount_brutto.get('span', [0,0])
        if _span[1]-_span[0] > 4:
            errors.append((f"—Å—É–º–º–∞ c –Ω–∞–ª–æ–≥–æ–º —Å–ª–∏—à–∫–æ–º –¥–ª–∏–Ω–Ω–∞—è, –¥–æ–ª–∂–Ω–∞ —Å–æ–¥–µ—Ä–∂–∞—Ç—å —Ç–æ–ª—å–∫–æ —á–∏—Å–ª–æ", False, 10))

    #----
    if amount:
        _span = amount.get('span', [0,0])
        if _span[1]-_span[0] > 4:
            errors.append((f"{amount_name} —Å–ª–∏—à–∫–æ–º –¥–ª–∏–Ω–Ω–∞—è, –¥–æ–ª–∂–Ω–∞ —Å–æ–¥–µ—Ä–∂–∞—Ç—å —Ç–æ–ª—å–∫–æ —á–∏—Å–ª–æ", False, 10))

    if amount_brutto and not vat:
        errors.append((f"–Ω–∞–ª–æ–≥ –Ω–µ —É–∫–∞–∑–∞–Ω", False, 20))

    if '—à—Ç—Ä–∞—Ñ' in sentence_lc or "—Å—Ç—Ä–∞—Ö–æ–≤–∞—è —Å—É–º–º–∞" in sentence_lc or " –ø–µ–Ω–∏ " in sentence_lc:
        print('-'*80)
        print(_id, '—Å—É–º–º–∞ –Ω–µ —Ç–∞!', 'sentence_span', sentence_span, sentence[:200])
        errors.append((f"—Å—É–º–º–∞ –≤–æ–æ–±—â–µ –Ω–µ —Ç–∞!!", False, 100))

    vat_exempt = '–Ω–¥—Å –Ω–µ –æ–±–ª–∞–≥–∞–µ—Ç—Å—è' in sentence_lc or '–Ω–¥—Å –Ω–µ —É–ø–ª–∞—á–∏–≤–∞–µ—Ç—Å—è' in sentence_lc
    if not vat_exempt and '–Ω–¥—Å' in sentence_lc:
        if not (amount_netto or  amount_brutto):
            errors.append((f"—Å—É–º–º–∞ —Å —É—á–µ—Ç–æ–º –Ω–∞–ª–æ–≥–∞ –Ω–µ —É–∫–∞–∑–∞–Ω–∞", False, 1))

    if (amount_netto or amount_brutto or amount) and not currency:
        errors.append((f"–≤–∞–ª—é—Ç–∞ –Ω–µ —É–∫–∞–∑–∞–Ω–∞", False, 30))

    if vat and not vat_unit:
        errors.append((f"–≤–∞–ª—é—Ç–∞ –Ω–∞–ª–æ–≥–∞ –Ω–µ —É–∫–∞–∑–∞–Ω–∞", False, 40))

    # `isdigit` must be *called*: the bare bound method is always truthy, which
    # silently disabled this check. An empty/None quote is skipped here (the
    # "too short" finding below covers it).
    first_char = (price_q or '').strip()[:1]
    if first_char and not (first_char.isalpha() or first_char.isdigit()):
        errors.append((f"—Ä–∞–∑–¥–µ–ª –æ —Ü–µ–Ω–µ –Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è –Ω–µ —Å–æ —Å–ª–æ–≤–∞", False, 4))

    if _len > 150:
        errors.append((f"{price_name} —Å–ª–∏—à–∫–æ–º –¥–ª–∏–Ω–Ω–∞—è: {_len} —Å–ª–æ–≤", False, 10))

    if _len < 10:
        errors.append((f"{price_name} —Å–ª–∏—à–∫–æ–º –∫–æ—Ä–æ—Ç–∫–∞—è: {_len} —Å–ª–æ–≤, –Ω—É–∂–Ω–æ: {sentence_span1[1]- sentence_span2[0]}", False, 10))

    return errors

####################################### sentence_at_index

def validate_markup(user_docs_ids) -> DataFrame:
    """Run every markup validator over the given documents.

    Args:
        user_docs_ids: iterable of document ids (anything ``str()``-able into
            a value accepted by ``bson.ObjectId``).

    Returns:
        A DataFrame indexed by document id (as str) holding document metadata,
        the values extracted by the validators, the accumulated 'error' /
        'error missing' texts and the 'errors count' / 'errors severity'
        counters, sorted by 'errors severity' in descending order.
    """
    edit_date_col = "–¥–∞—Ç–∞ —Ä–µ–¥–∞–∫—Ç–∏—Ä–æ–≤–∞–Ω–∏—è"
    analysis_date_col = '–¥–∞—Ç–∞ –∞–Ω–∞–ª–∏–∑–∞'
    version_col = '–≤–µ—Ä—Å–∏—è'

    userdocs = DataFrame()
    userdocs['errors count']=0
    userdocs['errors severity']=0
    userdocs[edit_date_col]=None
    userdocs['–î–∞—Ç–∞']=None
    # Declared up front only to keep the column order of the report unchanged
    # now that the bookkeeping cells are filled before the metadata ones.
    for column in (analysis_date_col, version_col, 'filename', 'link', 'error', 'error missing'):
        userdocs[column] = None

    for oid in user_docs_ids:
        _id = str(oid)
        d = get_doc_by_id(ObjectId(_id))

        # Initialise the bookkeeping cells BEFORE anything that can fail:
        # the except-branch below records the failure through add_error(),
        # which needs these cells to exist.
        userdocs.at[_id,'link'] = f'https://gpn-audit.nemosoft.ru/#/audit/edit/{_id}'
        userdocs.at[_id,'error'] = ''
        userdocs.at[_id,'error missing'] = ''
        userdocs.at[_id,'errors count'] = 0
        userdocs.at[_id,'errors severity'] = 0

        try:
            jd = DbJsonDoc(d)

            userdocs.at[_id, edit_date_col] = jd.user['updateDate'] if jd.user else None
            userdocs.at[_id,'filename'] = jd.filename

            # Must be tested before jd.analysis is subscripted, otherwise a
            # document without analysis ends up in the except-branch and this
            # message is never produced.
            if not jd.analysis:
                userdocs.at[_id, 'error'] = "–ù–µ –∞–Ω–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω!!"
                continue

            userdocs.at[_id, analysis_date_col] = jd.analysis['analyze_timestamp']
            userdocs.at[_id, version_col] = jd.analysis['version']

            add_error(userdocs, _id, validate_subject_len(jd, userdocs) )

            for org_index in (0, 1):
                add_error(userdocs, _id, validate_org_type(jd, userdocs, org_index) )
                add_error(userdocs, _id, validate_org_name(jd, userdocs, org_index) )
                add_error(userdocs, _id, validate_alias(jd, userdocs, org_index) )

            add_error(userdocs, _id, validate_date(jd, userdocs) )
            add_error(userdocs, _id, validate_number(jd, userdocs) )

            add_error(userdocs, _id, validate_price(jd, userdocs) )
        except Exception as e:
            # best effort: a document that cannot be validated is reported, not fatal
            add_error(userdocs, _id, str(e) )

    return userdocs.sort_values(['errors severity'], ascending=False)


In [None]:
# %%time

# Validate every selected document and show the headline numbers.
userdocs = validate_markup(user_docs_ids)

total_errors = userdocs['errors count'].sum()
docs_checked = len(userdocs)
_s = f"#### {total_errors} -- –≤—Å–µ–≥–æ –æ—à–∏–±–æ–∫/–Ω–µ–¥–æ—á–µ—Ç–æ–≤ —Ä–∞–∑–º–µ—Ç–∫–∏ –≤ {docs_checked} –¥–æ–∫—É–º–µ–Ω—Ç–∞—Ö -- ({total_errors/docs_checked} –Ω–∞ –¥–æ–∫—É–º–µ–Ω—Ç)"
display(Markdown(_s))



# Finding statistical errors

In [None]:
# Switch for the optional statistical checks in the cells below (rare
# currencies / aliases / ownership forms and unusual combinations of them).
# They only run when this is True.
VALIDATE_STATS = False

In [None]:


if VALIDATE_STATS:

    def _rare_values(frame, columns, min_count=2):
        """Pool ``columns`` of ``frame`` and return the values seen fewer than ``min_count`` times.

        Values are compared as lower-cased ``str(value)``. Columns that do not
        exist in ``frame`` are skipped (validators only create a column once
        some document has a value for it).
        """
        counts = Counter(
            str(value).lower()
            for column in columns
            if column in frame.columns
            for value in frame[column].values
        )
        return {value for value, seen in counts.items() if seen < min_count}

    def find_statistical_errors(userdocs):
        """Flag documents whose currency, ownership form or alias is (almost) unique.

        A value is "rare" when it occurs fewer than 2 times across the pooled
        pair of columns it belongs to. Every hit is printed and recorded via
        ``add_error`` (default severity).
        """
        alias_1, alias_2 = '–ü—Å–µ–≤–¥–æ–Ω–∏–º 1', '–ü—Å–µ–≤–¥–æ–Ω–∏–º 2'
        form_1, form_2 = '–§–æ—Ä–º–∞ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω–æ—Å—Ç–∏ 1', '–§–æ—Ä–º–∞ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω–æ—Å—Ç–∏ 2'

        rare_currency = _rare_values(userdocs, ('vat_unit', 'currency'))
        rare_aliases = _rare_values(userdocs, (alias_1, alias_2))
        rare_forms = _rare_values(userdocs, (form_1, form_2))

        # (label shown in the printout, column, rare values, message), in reporting order
        checks = (
            ('cs1', 'vat_unit', rare_currency, "–í–∞–ª—é—Ç–∞ –Ω–∞–ª–æ–≥–∞ –æ—á–µ–Ω—å —Å—Ç—Ä–∞–Ω–Ω–∞—è"),
            ('cs2', 'currency', rare_currency, "–í–∞–ª—é—Ç–∞ –æ—á–µ–Ω—å —Å—Ç—Ä–∞–Ω–Ω–∞—è"),
            ('fs1', form_1, rare_forms, "–§–æ—Ä–º–∞ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω–æ—Å—Ç–∏ 1 –æ—á–µ–Ω—å —Å—Ç—Ä–∞–Ω–Ω–∞—è"),
            ('fs2', form_2, rare_forms, "–§–æ—Ä–º–∞ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω–æ—Å—Ç–∏ 2 –æ—á–µ–Ω—å —Å—Ç—Ä–∞–Ω–Ω–∞—è"),
            ('as1', alias_1, rare_aliases, "–ü—Å–µ–≤–¥–æ–Ω–∏–º 1 –æ—á–µ–Ω—å —Å—Ç—Ä–∞–Ω–Ω—ã–π"),
            ('as2', alias_2, rare_aliases, "–ü—Å–µ–≤–¥–æ–Ω–∏–º 2 –æ—á–µ–Ω—å —Å—Ç—Ä–∞–Ω–Ω—ã–π"),
        )

        for i, row in userdocs.iterrows():
            for label, column, rare, message in checks:
                if column not in row.index:
                    continue
                value = str(row[column]).lower()
                if value in rare:
                    print(i, f'[{label}={value!r}]')
                    add_error(userdocs, i, message)

    find_statistical_errors(userdocs)



In [None]:


if VALIDATE_STATS:

    userdocs_subj = userdocs[ ['subject kind', '–ü—Å–µ–≤–¥–æ–Ω–∏–º 1', '–ü—Å–µ–≤–¥–æ–Ω–∏–º 2']]

    def get_alias_pair(v):
        """Both aliases of a row, lower-cased and sorted, joined with ' -vs- '."""
        return ' -vs- '.join(sorted([str(v['–ü—Å–µ–≤–¥–æ–Ω–∏–º 1']).lower(), str(v['–ü—Å–µ–≤–¥–æ–Ω–∏–º 2']).lower()]))

    def get_alias_subject_pair(v, i):
        """Alias number ``i`` (1 or 2) and the subject kind of a row, lower-cased, joined with ' / '."""
        return ' / '.join([str(v[f'–ü—Å–µ–≤–¥–æ–Ω–∏–º {i}']).lower(), str(v['subject kind']).lower()])

    # --- alias pairs that occur only once
    c = Counter(get_alias_pair(v) for _, v in userdocs_subj.iterrows())
    rare_aliases_pairs = {k for k in c if c[k] < 2}
    for i, row in userdocs.iterrows():
        d = get_alias_pair(row)
        if d in rare_aliases_pairs:
            print(i, f'[{d}]')
            userdocs.at[i, 'strange alias combo'] =f"–ù–µ–æ–±—ã—á–Ω–∞—è –ø–∞—Ä–∞ –ø—Å–µ–≤–¥–æ–Ω–∏–º–æ–≤: {d}"

    print('-'*20)

    # --- alias / subject-kind combinations that occur fewer than 3 times
    c = Counter(
        get_alias_subject_pair(v, alias_no)
        for alias_no in (1, 2)
        for _, v in userdocs_subj.iterrows()
    )
    rare_alias_subj_pairs = {k for k in c if c[k] < 3}

    for i, row in userdocs.iterrows():
        unusual = [
            pair
            for pair in (get_alias_subject_pair(row, 1), get_alias_subject_pair(row, 2))
            if pair in rare_alias_subj_pairs
        ]
        for pair in unusual:
            print(i, f'[{pair}]')
        if unusual:
            # Keep every hit: writing the cell once per alias would let the
            # second alias overwrite the finding of the first one.
            userdocs.at[i, 'strange alias-subject'] = '; '.join(
                f"–ù–µ–æ–±—ã—á–Ω–∞—è –ø–∞—Ä–∞ –ø—Å–µ–≤–¥–æ–Ω–∏–º-–ø—Ä–µ–¥–º–µ—Ç: {pair}" for pair in unusual
            )

    print('-'*20)


In [None]:
if VALIDATE_STATS:
    # Headline numbers again, now including the findings of the statistical checks.
    report_day = datetime.today().strftime('%d.%m.%Y')
    total_errors = userdocs['errors count'].sum()
    docs_checked = len(userdocs)
    _s = f"#### {report_day} *–í—Å–µ–≥–æ* –æ—à–∏–±–æ–∫, –Ω–µ–¥–æ—á–µ—Ç–æ–≤ –∏ –ø—Ä–æ—á.: {total_errors} –≤ {docs_checked} –¥–æ–∫—É–º–µ–Ω—Ç–∞—Ö -- ({total_errors/docs_checked:0.2f} –Ω–∞ –¥–æ–∫—É–º–µ–Ω—Ç)"
    display(Markdown(_s))

In [None]:
if VALIDATE_STATS:
    report_columns = ['link','subject kind', '–ü—Å–µ–≤–¥–æ–Ω–∏–º 1', '–ü—Å–µ–≤–¥–æ–Ω–∏–º 2', 'strange alias combo', 'strange alias-subject' ]

    # reindex() instead of userdocs[[...]]: the two 'strange ...' columns only
    # exist when at least one unusual combination was found, and a plain column
    # selection would raise KeyError otherwise. Missing columns come back as NaN.
    userdocs_subj = userdocs.reindex(columns=report_columns)

    is_flagged = userdocs_subj['strange alias combo'].notnull() | userdocs_subj['strange alias-subject'].notnull()
    errors_subjects = userdocs_subj[is_flagged]
    errors_subjects.to_csv(reports_path / "errors_subjects.csv", index=True)
    # a bare expression inside an `if` block is not rendered by the notebook
    display(errors_subjects)

In [None]:
# Documents whose accumulated severity exceeds 10; their number is logged as
# the 'severe docs' metric further down.
is_severe = userdocs['errors severity'] > 10
very_bad_docs = userdocs[is_severe]
very_bad_docs

# Save reports

In [None]:
# yseterday_userdocs = validate_markup(yseterday_ids)

# yseterday_userdocs['errors severity'] = yseterday_userdocs['errors severity'].astype('int')
# yseterday_userdocs['errors count'] = yseterday_userdocs['errors count'].astype('int')


# Store both counters as integers before the report is sorted and saved.
for counter_column in ('errors severity', 'errors count'):
    userdocs[counter_column] = userdocs[counter_column].astype('int')

In [None]:
# Label of the validated subset; a value already present in the notebook
# namespace overrides the default. It is logged as the 'subset code' param.
errors_report_metric_prefix = globals().get('errors_report_metric_prefix', "user")

# Worst documents first, then write the CSV report.
userdocs = userdocs.sort_values(["errors severity", "errors count", "–¥–∞—Ç–∞ —Ä–µ–¥–∞–∫—Ç–∏—Ä–æ–≤–∞–Ω–∏—è"], ascending=False)
userdocs.to_csv(reports_fn, index=True)

severity_total = userdocs['errors severity'].sum()
errors_total = userdocs['errors count'].sum()
docs_total = len(userdocs)

# Totals and per-document averages for the active mlflow run.
mlflow.log_metric('severity', severity_total)
mlflow.log_metric('severity per doc', severity_total/docs_total)

mlflow.log_metric('errors', errors_total)
mlflow.log_metric('errors per doc', errors_total/docs_total)

mlflow.log_metric('docs count', docs_total)
mlflow.log_metric('severe docs', len(very_bad_docs))
mlflow.log_param('subset code', str(errors_report_metric_prefix))
mlflow.log_artifact(reports_fn)

 


# End mlflow logging

In [None]:

# Print where the results of this run can be found, then close the run.
run_info = active_mlflow_run.info
print(run_info)

print('see results at')
print(f'{mlflow.get_registry_uri()}/#/experiments/{run_info.experiment_id}/runs/{run_info.run_id}')


mlflow.end_run()