Этот ноутбук — пример инференса модели. Он не содержит анализа и служит только раннером: по сути, это копия класса модели из веб-части нашего решения. Ноутбук позволяет быстро запустить модель и оценить качество её работы.

В ячейке ниже необходимо указать пути к весам модели, к тестовым данным и к папке для сохранения вывода (путь к папке вывода опционален: он нужен, только если вы запускаете ноутбук целиком).
Ноутбук сохраняет следующие предсказания:
- `submission.csv` - Предсказание моделью вероятности дефолта подрядчика, содержит столбцы contract_id, report_date и score
- `explanations.json` - JSON файл с интерпретациями предсказанной вероятности из submission.csv. Используется для веб-части решения.
- `submission_final.csv` - CSV файл с интерпретацией (объединение submission.csv и explanations_descript.json). Содержит дополнительное поле interception

Учтите, что `submission.csv` формируется довольно быстро, а расчёт интерпретаций может занять продолжительное время. Мы сознательно пошли на это: как правило, интерпретации должны вычисляться лениво — только тогда, когда они действительно понадобятся.

In [1]:
# User-editable notebook configuration (plain string paths).
# Directory that holds the model weights.
WEIGHTS_PATH = "./backend/model/weights"
# CSV file with the test data to run inference on.
TEST_DATAFRAME_PATH = "./test2_X.csv"
# Directory where the notebook saves its output files.
OUTPUT_DIR_PATH = "./output/"

In [2]:
import pickle
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm
import lime.lime_tabular
from typing import Any
import json
import holidays
import re
import warnings

# Enable tqdm's pandas integration (registers the progress_* methods such as
# `progress_apply` on pandas objects).
tqdm.pandas()

# Ignore all warnings for the rest of the session.
# One catch-all "ignore" filter is enough: `warnings.simplefilter('ignore')`
# and `warnings.filterwarnings("ignore")` install the same filter entry, so
# the second call was redundant.
# NOTE(review): this hides every warning, including ones that may matter when
# debugging; remove this line to see them again.
warnings.filterwarnings("ignore")

# Set the initial value (seed) of NumPy's global random number generator.
seed = 42
np.random.seed(seed)

In [3]:
# Some hardcoded column names used by the model
# inference_nan_par: contractor-level aggregate columns (every name contains
# "__g_contractor__"). ScoringModel.__init__ stores this list as `self.nan_par`.
# NOTE(review): the name suggests these are the columns that receive NaN
# handling; the use site is outside this view — confirm.
inference_nan_par = ['agg_scontrol__g_contractor__close_delay__defect_type_repair__mean__ALL_TIME',     'agg_scontrol__g_contractor__close_delay__defect_type_author_supervision__mean__ALL_TIME',     'agg_scontrol__g_contractor__close_delay__defect_type_GR__mean__ALL_TIME',     'agg_scontrol__g_contractor__close_delay__defect_type_labour_protection__mean__ALL_TIME',     'agg_scontrol__g_contractor__close_delay__defect_type_general_contractor__mean__ALL_TIME',     'agg_scontrol__g_contractor__close_delay__defect_type_arch_supervision__mean__ALL_TIME',     'agg_scontrol__g_contractor__close_delay__defect_type_tech_supervision__mean__ALL_TIME',     'agg_scontrol__g_contractor__close_delay__defect_type_app__mean__ALL_TIME',     'agg_sroomer__g_contractor__sroomer_id__count__3M',     'agg_sroomer__g_contractor__sroomer_id__count__6M',     'agg_sroomer__g_contractor__sroomer_id__count__12M',     'agg_sroomer__g_contractor__sroomer_id__count__ALL_TIME',     'agg_BoardOfDirectors__g_contractor__Name__count__ALL_TIME',     'agg_ConsolidatedIndicator__g_contractor__Index__Overall__mean__ALL_TIME',     'agg_ConsolidatedIndicator__g_contractor__Index__FailureScore__mean__ALL_TIME',     'agg_ConsolidatedIndicator__g_contractor__Index__PaymentIndex__mean__ALL_TIME',     'agg_ConsolidatedIndicator__g_contractor__Index__IndexOfDueDiligence__mean__ALL_TIME',     'agg_spark_extended_report__g_contractor__EstimatedClaimsSum__last__ALL_TIME',     'agg_spark_extended_report__g_contractor__EstimatedLiabilitiesSum__last__ALL_TIME',     'agg_spark_extended_report__g_contractor__EstimatedNetLiabilitiesSum__last__ALL_TIME',     'agg_spark_extended_report__g_contractor__PledgeeActiveCount__last__ALL_TIME',     'agg_spark_extended_report__g_contractor__PledgeeCeasedCount__last__ALL_TIME',     'agg_spark_extended_report__g_contractor__PledgerActiveCount__last__ALL_TIME',     'agg_spark_extended_report__g_contractor__PledgerCeasedCount__last__ALL_TIME',     
'agg_spark_extended_report__g_contractor__CompanySizeRevenue__last__ALL_TIME',     'agg_spark_extended_report__g_contractor__CreditLimitSum__last__ALL_TIME',     'agg_Finance__g_contractor__Value__FixedAssets__last__ALL_TIME',     'agg_Finance__g_contractor__Value__CurrentAssets__last__ALL_TIME',     'agg_Finance__g_contractor__Value__Capital__last__ALL_TIME',     'agg_Finance__g_contractor__Value__LongLiabilities__last__ALL_TIME',     'agg_Finance__g_contractor__Value__ShortLiabilities__last__ALL_TIME',     'agg_Finance__g_contractor__Value__Balance__last__ALL_TIME',     'agg_Finance__g_contractor__Value__Revenue_y__last__ALL_TIME',     'agg_Finance__g_contractor__Value__NetProfit_y__last__ALL_TIME',     'agg_Finance__g_contractor__Value__CostPrice_y__last__ALL_TIME',     'agg_FinanceAndTaxesFTS__g_contractor__Expenses__last__ALL_TIME',     'agg_FinanceAndTaxesFTS__g_contractor__Income__last__ALL_TIME',     'agg_FinanceAndTaxesFTS__g_contractor__TaxArrearsSum__last__ALL_TIME',     'agg_FinanceAndTaxesFTS__g_contractor__TaxesSum__last__ALL_TIME',     'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12M',     'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_24M',     'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_36M',     'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_48M',     'agg_ArbitrationCases__g_contractor__DefendantSum__sum__ALL_TIME',     'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12M',     'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12_24M',     'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12_36M',     'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12_48M',     'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__ALL_TIME',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__1W',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__2W',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__4W',     
'agg_tender_proposal__g_contractor__id__ALL__countDistinct__8W',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__12W',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__26W',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__52W',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__ALL_TIME']
# inference_data_del_par: five column names that ScoringModel.__init__ stores
# as `self.data_del_par`; each of them is also listed in DATA_ALLOWED_COLUMNS.
# NOTE(review): the name suggests these columns are deleted from the input
# before scoring; the use site is outside this view — confirm.
inference_data_del_par = ['agg_all_contracts__g_contract__abs_change_price_last_ds__isMain__last__ALL_TIME',     'agg_all_contracts__g_contract__abs_change_price_last_ds__isMain__mean__ALL_TIME',     'agg_all_contracts__g_contract__rel_change_price_last_ds__isMain__last__ALL_TIME',     'agg_all_contracts__g_contract__rel_change_price_last_ds__isMain__mean__ALL_TIME',     'agg_FinanceAndTaxesFTS__g_contractor__TaxPenaltiesSum__last__ALL_TIME']
# inference_final_cols: feature names, mixing raw aggregate columns with names
# that are not in DATA_ALLOWED_COLUMNS (e.g. 'debt_to_equity', 'winter',
# 'building_id_915').
# NOTE(review): presumably the final feature set passed to the models, with the
# extra names being engineered / one-hot features built elsewhere — confirm
# against the code that consumes this list.
inference_final_cols = ['contract_sum_change_percentage',  'contract_current_sum',  'Distance_to_contractor_mean',  'Distance_to_contractor_sum',  'counteragent_mean_agg_ks2__g_contract__total_sum__all__sum__ALL_TIME',  'total_claims_last_12_months',  'agg_spass_applications__g_contract__appl_count_week__mean__ALL_TIME',  'specialization_sum_agg_payments__g_contract__sum__all__sum__ALL_TIME',  'agg_spark_extended_report__g_contractor__CreditLimitSum__last__ALL_TIME',  'agg_materials__g_contract__material_id__countDistinct__ALL_TIME',  'agg_spass_applications__g_contract__appl_count_week__mean__26W',  'claims_change_12_24',  'reliability_index',  'Distance_from_contractor_median',  'spring',  'agg_payments__g_contract__sum__all__sum__ALL_TIME',  'agg_ArbitrationCases__g_contractor__DefendantSum__sum__ALL_TIME',  'contract_sum_change',  'agg_scontrol__g_contractor__close_delay__defect_type_tech_supervision__mean__ALL_TIME',  'counteragent_sum_agg_cec_requests__g_contract__total_sum_accepted__all__sum__ALL_TIME',  'agg_ConsolidatedIndicator__g_contractor__Index__Overall__mean__ALL_TIME',  'agg_materials__g_contract__order_id__countDistinct__ALL_TIME',  'agg_scontrol__g_contractor__close_delay__defect_type_labour_protection__mean__ALL_TIME',  'debt_to_equity',  'agg_Finance__g_contractor__Value__Capital__last__ALL_TIME',  'Distance_from_contractor_count',  'winter',  'agg_cec_requests__g_contract__total_sum_accepted__all__sum__ALL_TIME',  'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_36M',  'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_24M',  'agg_payments__g_contract__sum__all__sum__12W',  'building_id_915',  'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12M',  'agg_spark_extended_report__g_contractor__EstimatedLiabilitiesSum__last__ALL_TIME',  'agg_scontrol__g_contractor__close_delay__defect_type_general_contractor__mean__ALL_TIME',  'agg_payments__g_contract__sum__all__countDistinct__ALL_TIME',  
'agg_Finance__g_contractor__Value__LongLiabilities__last__ALL_TIME',  'agg_cec_requests__g_contract__request_id__all__count__ALL_TIME',  'claims_change_24_36',  'agg_ks2__g_contract__total_sum__all__sum__ALL_TIME',  'project_id_34',  'weekends',  'agg_FinanceAndTaxesFTS__g_contractor__TaxArrearsSum__last__ALL_TIME',  'project_id_49',  'current_ratio',  'pledger_activity',  'project_id_47',  'total_claims_last_24_months',  'specialization_id_12',  'project_id_45',  'holidays',  'profit_change',  'agg_cec_requests__g_contract__request_id__all__count__12W',  'agg_scontrol__g_contractor__close_delay__defect_type_app__mean__ALL_TIME',  'specialization_id_9',  'project_id_13',  'agg_cec_requests__g_contract__time_btw_requests__all__mean__4M',  'agg_sroomer__g_contractor__sroomer_id__count__3M',  'specialization_id_20',  'project_id_25',  'specialization_id_14',  'agg_ks2__g_contract__id__all__count__8W',  'project_id_42',  'project_id_44',  'specialization_id_21',  'contractor_id_888',  'specialization_id_3',  'building_id_361',  'building_id_448',  'building_id_414',  'building_id_345',  'building_id_727',  'building_id_679',  'contractor_id_280',  'building_id_470',  'building_id_701',  'project_id_36',  'building_id_9',  'contractor_id_368',  'building_id_804',  'building_id_403',  'contractor_id_213',  'contractor_id_223',  'contractor_id_860',  'contractor_id_246',  'building_id_805',  'contractor_id_42',  'specialization_id_2',  'counteragent_sum_agg_ks2__g_contract__total_sum__all__sum__ALL_TIME',  'contractor_id_97',  'contractor_id_206',  'contractor_id_366',  'contractor_id_851',  'contractor_id_47',  'contractor_id_218',  'agg_materials__g_contract__order_id__countDistinct__8W',  'contractor_id_745',  'contractor_id_793',  'contractor_id_63',  'contractor_id_282']
# DATA_ALLOWED_COLUMNS: starts with the identifier / date / contract columns
# ('contract_id', 'report_date', 'specialization_id', ...) followed by the
# aggregate feature columns.
# NOTE(review): presumably the whitelist of columns accepted from the input
# dataframe; the use site is outside this view — confirm.
DATA_ALLOWED_COLUMNS = ['contract_id', 'report_date', 'specialization_id', 'contract_init_sum', 'contract_date', 'project_id', 'building_id', 'contractor_id', 'contract_current_sum', 'agg_all_contracts__g_contract__bit_da_guid__isMain__count__ALL_TIME', 'agg_all_contracts__g_contract__abs_change_price_last_ds__isMain__last__ALL_TIME', 'agg_all_contracts__g_contract__abs_change_price_last_ds__isMain__mean__ALL_TIME', 'agg_all_contracts__g_contract__rel_change_price_last_ds__isMain__last__ALL_TIME', 'agg_all_contracts__g_contract__rel_change_price_last_ds__isMain__mean__ALL_TIME', 'agg_cec_requests__g_contract__request_id__all__count__1W', 'agg_cec_requests__g_contract__request_id__all__count__2W', 'agg_cec_requests__g_contract__request_id__all__count__3W', 'agg_cec_requests__g_contract__request_id__all__count__4W', 'agg_cec_requests__g_contract__request_id__all__count__5W', 'agg_cec_requests__g_contract__request_id__all__count__6W', 'agg_cec_requests__g_contract__request_id__all__count__7W', 'agg_cec_requests__g_contract__request_id__all__count__8W', 'agg_cec_requests__g_contract__request_id__all__count__12W', 'agg_cec_requests__g_contract__request_id__all__count__ALL_TIME', 'counteragent_sum_agg_cec_requests__g_contract__request_id__all__count__ALL_TIME', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__1W', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__2W', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__3W', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__4W', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__5W', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__6W', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__7W', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__8W', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__12W', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__ALL_TIME', 
'counteragent_sum_agg_cec_requests__g_contract__total_sum_accepted__all__sum__ALL_TIME', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__1M', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__2M', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__3M', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__4M', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__5M', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__6M', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__7M', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__8M', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__12M', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__ALL_TIME', 'agg_cec_requests__g_contract__created_dt__all__min__ALL_TIME', 'agg_cec_requests__g_contract__created_dt__accepted__min__ALL_TIME', 'agg_payments__g_contract__sum__all__countDistinct__1W', 'agg_payments__g_contract__sum__all__countDistinct__2W', 'agg_payments__g_contract__sum__all__countDistinct__4W', 'agg_payments__g_contract__sum__all__countDistinct__8W', 'agg_payments__g_contract__sum__all__countDistinct__12W', 'agg_payments__g_contract__sum__all__countDistinct__ALL_TIME', 'agg_payments__g_contract__sum__all__sum__1W', 'agg_payments__g_contract__sum__all__sum__2W', 'agg_payments__g_contract__sum__all__sum__4W', 'agg_payments__g_contract__sum__all__sum__8W', 'agg_payments__g_contract__sum__all__sum__12W', 'agg_payments__g_contract__sum__all__sum__ALL_TIME', 'counteragent_mean_agg_payments__g_contract__sum__all__sum__ALL_TIME', 'counteragent_sum_agg_payments__g_contract__sum__all__sum__ALL_TIME', 'specialization_sum_agg_payments__g_contract__sum__all__sum__ALL_TIME', 'agg_payments__g_contract__date__advance__min__ALL_TIME', 'agg_ks2__g_contract__id__all__count__1W', 'agg_ks2__g_contract__id__all__count__2W', 'agg_ks2__g_contract__id__all__count__4W', 'agg_ks2__g_contract__id__all__count__8W', 
'agg_ks2__g_contract__id__all__count__12W', 'agg_ks2__g_contract__id__all__count__ALL_TIME', 'agg_ks2__g_contract__total_sum__all__sum__1W', 'agg_ks2__g_contract__total_sum__all__sum__2W', 'agg_ks2__g_contract__total_sum__all__sum__4W', 'agg_ks2__g_contract__total_sum__all__sum__8W', 'agg_ks2__g_contract__total_sum__all__sum__12W', 'agg_ks2__g_contract__total_sum__all__sum__ALL_TIME', 'counteragent_mean_agg_ks2__g_contract__total_sum__all__sum__ALL_TIME', 'counteragent_sum_agg_ks2__g_contract__total_sum__all__sum__ALL_TIME', 'specialization_sum_agg_ks2__g_contract__total_sum__all__sum__ALL_TIME', 'agg_spass_applications__g_contract__appl_count_week__max__ALL_TIME', 'agg_spass_applications__g_contract__appl_count_week__mean__1W', 'agg_spass_applications__g_contract__appl_count_week__mean__2W', 'agg_spass_applications__g_contract__appl_count_week__mean__3W', 'agg_spass_applications__g_contract__appl_count_week__mean__4W', 'agg_spass_applications__g_contract__appl_count_week__mean__5W', 'agg_spass_applications__g_contract__appl_count_week__mean__6W', 'agg_spass_applications__g_contract__appl_count_week__mean__8W', 'agg_spass_applications__g_contract__appl_count_week__mean__12W', 'agg_spass_applications__g_contract__appl_count_week__mean__26W', 'agg_spass_applications__g_contract__appl_count_week__mean__ALL_TIME', 'agg_spass_applications__g_specialization__appl_count_week__mean__ALL_TIME', 'counteragent_mean_agg_spass_applications__g_contract__appl_count_week__mean__ALL_TIME', 'agg_workers__g_contract__fact_workers__all__mean__1W', 'agg_workers__g_contract__fact_workers__all__mean__2W', 'agg_workers__g_contract__fact_workers__all__mean__3W', 'agg_workers__g_contract__fact_workers__all__mean__4W', 'agg_workers__g_contract__fact_workers__all__mean__5W', 'agg_workers__g_contract__fact_workers__all__mean__6W', 'agg_workers__g_contract__fact_workers__all__mean__8W', 'agg_workers__g_contract__fact_workers__all__mean__12W', 
'agg_workers__g_contract__fact_workers__all__mean__26W', 'agg_workers__g_contract__fact_workers__all__mean__ALL_TIME', 'agg_materials__g_contract__order_id__countDistinct__1W', 'agg_materials__g_contract__order_id__countDistinct__2W', 'agg_materials__g_contract__order_id__countDistinct__4W', 'agg_materials__g_contract__order_id__countDistinct__8W', 'agg_materials__g_contract__order_id__countDistinct__12W', 'agg_materials__g_contract__order_id__countDistinct__ALL_TIME', 'agg_materials__g_contract__order_dt__min__ALL_TIME', 'agg_materials__g_contract__material_type_id__countDistinct__ALL_TIME', 'agg_materials__g_contract__material_id__countDistinct__ALL_TIME', 'agg_scontrol__g_contractor__close_delay__defect_type_repair__mean__ALL_TIME', 'agg_scontrol__g_contractor__close_delay__defect_type_author_supervision__mean__ALL_TIME', 'agg_scontrol__g_contractor__close_delay__defect_type_GR__mean__ALL_TIME', 'agg_scontrol__g_contractor__close_delay__defect_type_labour_protection__mean__ALL_TIME', 'agg_scontrol__g_contractor__close_delay__defect_type_general_contractor__mean__ALL_TIME', 'agg_scontrol__g_contractor__close_delay__defect_type_arch_supervision__mean__ALL_TIME', 'agg_scontrol__g_contractor__close_delay__defect_type_tech_supervision__mean__ALL_TIME', 'agg_scontrol__g_contractor__close_delay__defect_type_app__mean__ALL_TIME', 'agg_sroomer__g_contractor__sroomer_id__count__3M', 'agg_sroomer__g_contractor__sroomer_id__count__6M', 'agg_sroomer__g_contractor__sroomer_id__count__12M', 'agg_sroomer__g_contractor__sroomer_id__count__ALL_TIME', 'agg_BoardOfDirectors__g_contractor__Name__count__ALL_TIME', 'agg_ConsolidatedIndicator__g_contractor__Index__Overall__mean__ALL_TIME', 'agg_ConsolidatedIndicator__g_contractor__Index__FailureScore__mean__ALL_TIME', 'agg_ConsolidatedIndicator__g_contractor__Index__PaymentIndex__mean__ALL_TIME', 'agg_ConsolidatedIndicator__g_contractor__Index__IndexOfDueDiligence__mean__ALL_TIME', 
'agg_spark_extended_report__g_contractor__EstimatedClaimsSum__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__EstimatedLiabilitiesSum__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__EstimatedNetLiabilitiesSum__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__PledgeeActiveCount__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__PledgeeCeasedCount__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__PledgerActiveCount__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__PledgerCeasedCount__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__CompanySizeRevenue__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__CreditLimitSum__last__ALL_TIME', 'agg_Finance__g_contractor__Value__FixedAssets__last__ALL_TIME', 'agg_Finance__g_contractor__Value__CurrentAssets__last__ALL_TIME', 'agg_Finance__g_contractor__Value__Capital__last__ALL_TIME', 'agg_Finance__g_contractor__Value__LongLiabilities__last__ALL_TIME', 'agg_Finance__g_contractor__Value__ShortLiabilities__last__ALL_TIME', 'agg_Finance__g_contractor__Value__Balance__last__ALL_TIME', 'agg_Finance__g_contractor__Value__Revenue_y__last__ALL_TIME', 'agg_Finance__g_contractor__Value__NetProfit_y__last__ALL_TIME', 'agg_Finance__g_contractor__Value__CostPrice_y__last__ALL_TIME', 'agg_FinanceAndTaxesFTS__g_contractor__Expenses__last__ALL_TIME', 'agg_FinanceAndTaxesFTS__g_contractor__Income__last__ALL_TIME', 'agg_FinanceAndTaxesFTS__g_contractor__TaxArrearsSum__last__ALL_TIME', 'agg_FinanceAndTaxesFTS__g_contractor__TaxPenaltiesSum__last__ALL_TIME', 'agg_FinanceAndTaxesFTS__g_contractor__TaxesSum__last__ALL_TIME', 'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12M', 'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_24M', 'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_36M', 'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_48M', 'agg_ArbitrationCases__g_contractor__DefendantSum__sum__ALL_TIME', 
'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12M', 'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12_24M', 'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12_36M', 'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12_48M', 'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__ALL_TIME', 'agg_tender_proposal__g_contractor__id__ALL__countDistinct__1W', 'agg_tender_proposal__g_contractor__id__ALL__countDistinct__2W', 'agg_tender_proposal__g_contractor__id__ALL__countDistinct__4W', 'agg_tender_proposal__g_contractor__id__ALL__countDistinct__8W', 'agg_tender_proposal__g_contractor__id__ALL__countDistinct__12W', 'agg_tender_proposal__g_contractor__id__ALL__countDistinct__26W', 'agg_tender_proposal__g_contractor__id__ALL__countDistinct__52W', 'agg_tender_proposal__g_contractor__id__ALL__countDistinct__ALL_TIME']
# graph_del_col: column names that have no overlap with graph_x_cols.
# NOTE(review): presumably the columns dropped before building the input of
# the graph sub-model; the use site is outside this view — confirm.
graph_del_col = ['agg_BoardOfDirectors__g_contractor__Name__count__ALL_TIME',     'agg_cec_requests__g_contract__time_btw_requests__all__mean__1M',     'agg_cec_requests__g_contract__time_btw_requests__all__mean__2M',     'agg_cec_requests__g_contract__time_btw_requests__all__mean__3M',     'agg_cec_requests__g_contract__time_btw_requests__all__mean__4M',     'agg_cec_requests__g_contract__time_btw_requests__all__mean__5M',     'agg_cec_requests__g_contract__time_btw_requests__all__mean__6M',     'agg_cec_requests__g_contract__time_btw_requests__all__mean__7M',     'agg_cec_requests__g_contract__time_btw_requests__all__mean__8M',     'agg_cec_requests__g_contract__time_btw_requests__all__mean__12M',     'agg_cec_requests__g_contract__time_btw_requests__all__mean__ALL_TIME',     'agg_cec_requests__g_contract__created_dt__all__min__ALL_TIME',     'agg_cec_requests__g_contract__created_dt__accepted__min__ALL_TIME',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__1W',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__2W',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__4W',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__8W',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__26W',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__52W',     'agg_tender_proposal__g_contractor__id__ALL__countDistinct__ALL_TIME']
# graph_x_cols: column names that have no overlap with graph_del_col or
# inference_data_del_par.
# NOTE(review): presumably the feature columns of the graph sub-model whose
# files ScoringModel.__init__ loads (graph.csv and
# RandomForestClassifier_SUB_model_scoring.pickle); the use site is outside
# this view — confirm.
graph_x_cols = ['specialization_id',  'contract_init_sum',  'contract_current_sum',  'agg_all_contracts__g_contract__bit_da_guid__isMain__count__ALL_TIME',  'agg_cec_requests__g_contract__request_id__all__count__1W',  'agg_cec_requests__g_contract__request_id__all__count__2W',  'agg_cec_requests__g_contract__request_id__all__count__3W',  'agg_cec_requests__g_contract__request_id__all__count__4W',  'agg_cec_requests__g_contract__request_id__all__count__5W',  'agg_cec_requests__g_contract__request_id__all__count__6W',  'agg_cec_requests__g_contract__request_id__all__count__7W',  'agg_cec_requests__g_contract__request_id__all__count__8W',  'agg_cec_requests__g_contract__request_id__all__count__12W',  'agg_cec_requests__g_contract__request_id__all__count__ALL_TIME',  'counteragent_sum_agg_cec_requests__g_contract__request_id__all__count__ALL_TIME',  'agg_cec_requests__g_contract__total_sum_accepted__all__sum__1W',  'agg_cec_requests__g_contract__total_sum_accepted__all__sum__2W',  'agg_cec_requests__g_contract__total_sum_accepted__all__sum__3W',  'agg_cec_requests__g_contract__total_sum_accepted__all__sum__4W',  'agg_cec_requests__g_contract__total_sum_accepted__all__sum__5W',  'agg_cec_requests__g_contract__total_sum_accepted__all__sum__6W',  'agg_cec_requests__g_contract__total_sum_accepted__all__sum__7W',  'agg_cec_requests__g_contract__total_sum_accepted__all__sum__8W',  'agg_cec_requests__g_contract__total_sum_accepted__all__sum__12W',  'agg_cec_requests__g_contract__total_sum_accepted__all__sum__ALL_TIME',  'counteragent_sum_agg_cec_requests__g_contract__total_sum_accepted__all__sum__ALL_TIME',  'agg_payments__g_contract__sum__all__countDistinct__1W',  'agg_payments__g_contract__sum__all__countDistinct__2W',  'agg_payments__g_contract__sum__all__countDistinct__4W',  'agg_payments__g_contract__sum__all__countDistinct__8W',  'agg_payments__g_contract__sum__all__countDistinct__12W',  'agg_payments__g_contract__sum__all__countDistinct__ALL_TIME',  
'agg_payments__g_contract__sum__all__sum__1W',  'agg_payments__g_contract__sum__all__sum__2W',  'agg_payments__g_contract__sum__all__sum__4W',  'agg_payments__g_contract__sum__all__sum__8W',  'agg_payments__g_contract__sum__all__sum__12W',  'agg_payments__g_contract__sum__all__sum__ALL_TIME',  'counteragent_mean_agg_payments__g_contract__sum__all__sum__ALL_TIME',  'counteragent_sum_agg_payments__g_contract__sum__all__sum__ALL_TIME',  'specialization_sum_agg_payments__g_contract__sum__all__sum__ALL_TIME',  'agg_payments__g_contract__date__advance__min__ALL_TIME',  'agg_ks2__g_contract__id__all__count__1W',  'agg_ks2__g_contract__id__all__count__2W',  'agg_ks2__g_contract__id__all__count__4W',  'agg_ks2__g_contract__id__all__count__8W',  'agg_ks2__g_contract__id__all__count__12W',  'agg_ks2__g_contract__id__all__count__ALL_TIME',  'agg_ks2__g_contract__total_sum__all__sum__1W',  'agg_ks2__g_contract__total_sum__all__sum__2W',  'agg_ks2__g_contract__total_sum__all__sum__4W',  'agg_ks2__g_contract__total_sum__all__sum__8W',  'agg_ks2__g_contract__total_sum__all__sum__12W',  'agg_ks2__g_contract__total_sum__all__sum__ALL_TIME',  'counteragent_mean_agg_ks2__g_contract__total_sum__all__sum__ALL_TIME',  'counteragent_sum_agg_ks2__g_contract__total_sum__all__sum__ALL_TIME',  'specialization_sum_agg_ks2__g_contract__total_sum__all__sum__ALL_TIME',  'agg_spass_applications__g_contract__appl_count_week__max__ALL_TIME',  'agg_spass_applications__g_contract__appl_count_week__mean__1W',  'agg_spass_applications__g_contract__appl_count_week__mean__2W',  'agg_spass_applications__g_contract__appl_count_week__mean__3W',  'agg_spass_applications__g_contract__appl_count_week__mean__4W',  'agg_spass_applications__g_contract__appl_count_week__mean__5W',  'agg_spass_applications__g_contract__appl_count_week__mean__6W',  'agg_spass_applications__g_contract__appl_count_week__mean__8W',  'agg_spass_applications__g_contract__appl_count_week__mean__12W',  
'agg_spass_applications__g_contract__appl_count_week__mean__26W',  'agg_spass_applications__g_contract__appl_count_week__mean__ALL_TIME',  'agg_spass_applications__g_specialization__appl_count_week__mean__ALL_TIME',  'counteragent_mean_agg_spass_applications__g_contract__appl_count_week__mean__ALL_TIME',  'agg_workers__g_contract__fact_workers__all__mean__1W',  'agg_workers__g_contract__fact_workers__all__mean__2W',  'agg_workers__g_contract__fact_workers__all__mean__3W',  'agg_workers__g_contract__fact_workers__all__mean__4W',  'agg_workers__g_contract__fact_workers__all__mean__5W',  'agg_workers__g_contract__fact_workers__all__mean__6W',  'agg_workers__g_contract__fact_workers__all__mean__8W',  'agg_workers__g_contract__fact_workers__all__mean__12W',  'agg_workers__g_contract__fact_workers__all__mean__26W',  'agg_workers__g_contract__fact_workers__all__mean__ALL_TIME',  'agg_materials__g_contract__order_id__countDistinct__1W',  'agg_materials__g_contract__order_id__countDistinct__2W',  'agg_materials__g_contract__order_id__countDistinct__4W',  'agg_materials__g_contract__order_id__countDistinct__8W',  'agg_materials__g_contract__order_id__countDistinct__12W',  'agg_materials__g_contract__order_id__countDistinct__ALL_TIME',  'agg_materials__g_contract__order_dt__min__ALL_TIME',  'agg_materials__g_contract__material_type_id__countDistinct__ALL_TIME',  'agg_materials__g_contract__material_id__countDistinct__ALL_TIME',  'agg_scontrol__g_contractor__close_delay__defect_type_repair__mean__ALL_TIME',  'agg_scontrol__g_contractor__close_delay__defect_type_author_supervision__mean__ALL_TIME',  'agg_scontrol__g_contractor__close_delay__defect_type_GR__mean__ALL_TIME',  'agg_scontrol__g_contractor__close_delay__defect_type_labour_protection__mean__ALL_TIME',  'agg_scontrol__g_contractor__close_delay__defect_type_general_contractor__mean__ALL_TIME',  'agg_scontrol__g_contractor__close_delay__defect_type_arch_supervision__mean__ALL_TIME',  
'agg_scontrol__g_contractor__close_delay__defect_type_tech_supervision__mean__ALL_TIME',  'agg_scontrol__g_contractor__close_delay__defect_type_app__mean__ALL_TIME',  'agg_sroomer__g_contractor__sroomer_id__count__3M',  'agg_sroomer__g_contractor__sroomer_id__count__6M',  'agg_sroomer__g_contractor__sroomer_id__count__12M',  'agg_sroomer__g_contractor__sroomer_id__count__ALL_TIME',  'agg_ConsolidatedIndicator__g_contractor__Index__Overall__mean__ALL_TIME',  'agg_ConsolidatedIndicator__g_contractor__Index__FailureScore__mean__ALL_TIME',  'agg_ConsolidatedIndicator__g_contractor__Index__PaymentIndex__mean__ALL_TIME',  'agg_ConsolidatedIndicator__g_contractor__Index__IndexOfDueDiligence__mean__ALL_TIME',  'agg_spark_extended_report__g_contractor__EstimatedClaimsSum__last__ALL_TIME',  'agg_spark_extended_report__g_contractor__EstimatedLiabilitiesSum__last__ALL_TIME',  'agg_spark_extended_report__g_contractor__EstimatedNetLiabilitiesSum__last__ALL_TIME',  'agg_spark_extended_report__g_contractor__PledgeeActiveCount__last__ALL_TIME',  'agg_spark_extended_report__g_contractor__PledgeeCeasedCount__last__ALL_TIME',  'agg_spark_extended_report__g_contractor__PledgerActiveCount__last__ALL_TIME',  'agg_spark_extended_report__g_contractor__PledgerCeasedCount__last__ALL_TIME',  'agg_spark_extended_report__g_contractor__CompanySizeRevenue__last__ALL_TIME',  'agg_spark_extended_report__g_contractor__CreditLimitSum__last__ALL_TIME',  'agg_Finance__g_contractor__Value__FixedAssets__last__ALL_TIME',  'agg_Finance__g_contractor__Value__CurrentAssets__last__ALL_TIME',  'agg_Finance__g_contractor__Value__Capital__last__ALL_TIME',  'agg_Finance__g_contractor__Value__LongLiabilities__last__ALL_TIME',  'agg_Finance__g_contractor__Value__ShortLiabilities__last__ALL_TIME',  'agg_Finance__g_contractor__Value__Balance__last__ALL_TIME',  'agg_Finance__g_contractor__Value__Revenue_y__last__ALL_TIME',  'agg_Finance__g_contractor__Value__NetProfit_y__last__ALL_TIME',  
'agg_Finance__g_contractor__Value__CostPrice_y__last__ALL_TIME',  'agg_FinanceAndTaxesFTS__g_contractor__Expenses__last__ALL_TIME',  'agg_FinanceAndTaxesFTS__g_contractor__Income__last__ALL_TIME',  'agg_FinanceAndTaxesFTS__g_contractor__TaxArrearsSum__last__ALL_TIME',  'agg_FinanceAndTaxesFTS__g_contractor__TaxesSum__last__ALL_TIME',  'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12M',  'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_24M',  'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_36M',  'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_48M',  'agg_ArbitrationCases__g_contractor__DefendantSum__sum__ALL_TIME',  'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12M',  'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12_24M',  'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12_36M',  'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12_48M',  'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__ALL_TIME',  'agg_tender_proposal__g_contractor__id__ALL__countDistinct__12W']

In [4]:
# Main model class
class ScoringModel:
    def __init__(self, weights_path: Path) -> None:
        """Load every trained artifact from ``weights_path`` and build the LIME explainer.

        Args:
            weights_path: Directory containing the pickled estimators and imputer,
                ``graph.csv``, ``data_columns.json``, ``unic_data.csv``,
                ``distance_dict.json`` and ``train_prf.csv``.
        """
        def load_pickle(file_name: str):
            # pickle can run arbitrary code while loading: only use trusted weight files.
            with open(weights_path / file_name, "rb") as fd:
                return pickle.load(fd)

        def load_json(file_name: str):
            with open(weights_path / file_name, "r") as fd:
                return json.load(fd)

        # Estimators and the imputer (the three sub-models are blended in predict_base_model).
        self.sub_model_rfc = load_pickle("RandomForestClassifier_scoring.pickle")
        self.sub_model_cat = load_pickle("CatBoostClassifier_scoring.pickle")
        self.sub_model_xgb = load_pickle("XGBClassifier_scoring.pickle")
        self.imputer = load_pickle("MeanMedianImputer.pickle")
        self.rfc_graph_scoring = load_pickle("RandomForestClassifier_SUB_model_scoring.pickle")

        # Tabular and JSON side data.
        self.graph = pd.read_csv(weights_path / "graph.csv").drop(columns=['Unnamed: 0'])
        self.data_columns = list(load_json("data_columns.json").keys())
        self.unic_data_df = pd.read_csv(weights_path / "unic_data.csv")
        # JSON object keys are always strings; convert them back to integer ids.
        self.distance_dict: dict = {
            int(key): value for key, value in load_json("distance_dict.json").items()
        }
        self.train_prf = pd.read_csv(weights_path / "train_prf.csv")

        # Column lists defined at module level.
        self.data_del_par = inference_data_del_par
        self.nan_par = inference_nan_par
        self.final_cols = inference_final_cols
        self.graph_del_col = graph_del_col
        self.graph_x_cols = graph_x_cols

        # LIME explainer fitted on the preprocessed training features.
        training_matrix = self.train_prf[self.final_cols].to_numpy()
        self.explainer = lime.lime_tabular.LimeTabularExplainer(
            training_matrix, feature_names=self.final_cols)

        # Russian public-holiday calendar used by create_time_features.
        self.ru_holidays = holidays.Russia()

    def preproc_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Placeholder hook that returns ``data`` unchanged.

        No column removal, time features, imputation or memory reduction is
        performed here; the preprocessing pipeline used for inference is
        implemented in ``preprocess_base_model``.
        """
        
        return data
    

    def preprocess_base_model(self, test_df: pd.DataFrame):
        data = test_df.copy()

        # Удаление колонок с пустыми или неинформативными данными
        data = data.drop(columns=self.data_del_par)

        # Импутация отсутствующих значений
        data[self.nan_par] = self.imputer.transform(data[self.nan_par])

        # Создание временных признаков
        data = self.create_time_features(data, 'contract_date', 'report_date')

        # Создание расширенных признаков
        data = self.create_advanced_features(data)

        data = self.work_graph(data)

        # Снижение использования памяти
        data = self.reduce_mem_usage(data)

        data = pd.get_dummies(data, columns=['specialization_id', 'project_id', 'building_id', 'contractor_id'], dtype=int)
        null_cols = set(self.data_columns) - set(data.columns)
        data[list(null_cols)] = 0

        return data

    def predict_base_model(self, data: pd.DataFrame) -> pd.DataFrame:
        """ 
        Предсказание вероятностей на основе предобработанных данных
        с помощью блендинга базовых моделей.
        """

        # Предсказание вероятностей для положительного класса
        pred_rfc = self.sub_model_rfc.predict_proba(data[self.final_cols])[:, 1]
        pred_cat = self.sub_model_cat.predict_proba(data[self.final_cols])[:, 1]
        pred_xgb = self.sub_model_xgb.predict_proba(data[self.final_cols])[:, 1]

        # Комбинирование предсказаний
        pred = pred_rfc * 0.9 + pred_xgb * 0.05 + pred_cat * 0.05

        result = pd.DataFrame({'contract_id': data['contract_id'], 'report_date': data['report_date'], 'score': pred})
        
        return result

    def predict_graph_scoring(self, test_df: pd.DataFrame, preds_df: pd.DataFrame) -> pd.DataFrame:
        """Refine base-model scores with the graph sub-model.

        For every row whose ``contractor_id`` is a key of ``self.distance_dict``,
        the feature columns whose name contains ``contractor`` are replaced by
        values taken from the per-contractor median table (row selected via the
        first entry of ``distance_dict[contractor_id]``). ``self.rfc_graph_scoring``
        scores the modified row and the final score is the maximum of that score
        and the base score. Rows without a match keep their base score.

        Args:
            test_df: Raw input frame; joined with ``preds_df`` by index.
            preds_df: Base predictions containing a ``score`` column.

        Returns:
            ``preds_df`` itself, with its ``score`` column overwritten in place.
        """
        data = test_df.copy()
        data = data.drop(columns=['contract_id', 'project_id', 'building_id'])
        data[self.nan_par] = self.imputer.transform(data[self.nan_par])
        data.fillna(0, inplace=True)
        data = data.drop(columns=self.graph_del_col)
        data = self.reduce_mem_usage(data)
        data = data.join(preds_df, rsuffix="_r")

        unic_data = self.unic_data_df
        distance_dict = self.distance_dict
        unic_data_gb = unic_data.groupby(['contractor_id'], as_index=False).median()

        res = []          # base score of every row, in row order
        to_cnt_i = []     # positions of rows that have a graph match
        to_rs2_i = []     # matching row labels in unic_data_gb

        data = data.set_index(['contractor_id', 'score'])[self.graph_x_cols]

        # The lookup depends only on the neighbour id, so resolve each id once.
        neighbour_rows: dict = {}

        for i, ((cn_id, scr), _) in tqdm(enumerate(data.iterrows()), total=len(data)):
            if cn_id in distance_dict:
                neighbour_id = distance_dict[cn_id][0]
                if neighbour_id not in neighbour_rows:
                    # NOTE(review): the mask is built from the ungrouped `unic_data`
                    # but applied to `unic_data_gb`; the two only line up if
                    # unic_data has one sorted row per contractor. Kept as is
                    # because the pickled graph model may rely on it - confirm.
                    d = unic_data_gb[unic_data.contractor_id == neighbour_id]
                    res2 = d.filter(like='contractor')
                    neighbour_rows[neighbour_id] = res2.index[0] if len(res2) != 0 else None

                matched_row = neighbour_rows[neighbour_id]
                if matched_row is not None:
                    to_cnt_i.append(i)
                    to_rs2_i.append(matched_row)

            res.append(scr)

        res = np.array(res)

        # Without any match there is nothing to re-score; predict_proba would
        # raise on an empty frame, so the sub-model is only called when needed.
        if to_cnt_i:
            tcn_df = data.iloc[to_cnt_i].copy().reset_index()
            # Drop the row's own contractor columns ...
            tcn_df = tcn_df.filter(regex='^(?!.*contractor)', axis=1)
            # ... and substitute the ones taken from the median table.
            tcn_rsd = unic_data_gb.loc[to_rs2_i]
            tcn_rsd = tcn_rsd.filter(like='contractor')
            tcn_df[tcn_rsd.columns] = tcn_rsd.reset_index()[tcn_rsd.columns]
            preds_rfc = self.rfc_graph_scoring.predict_proba(tcn_df[self.graph_x_cols])[:, 1]
            # Never lower a score: keep the larger of graph and base prediction.
            res[to_cnt_i] = np.max([preds_rfc, res[to_cnt_i]], axis=0)

        preds_df['score'] = res
        return preds_df

    def predict_result(self, data: pd.DataFrame, negotiate: bool = False) -> pd.DataFrame:
        data_prf = self.preprocess_base_model(data)
        preds_df = self.predict_base_model(data_prf)
        preds_df = self.predict_graph_scoring(data, preds_df)

        preds_df = preds_df.merge(preds_df.groupby('contract_id')['score'].max().reset_index(), how="left", on="contract_id")
        preds_df = preds_df[['contract_id', 'report_date', 'score_y']].rename(columns={'score_y': 'score'})

        if negotiate:
            preds_df['score'] = 1 - preds_df['score']

        return preds_df

    
    def predict_scoring(self, data: pd.DataFrame, negotiate: bool = False) -> np.ndarray:
        return self.predict_result(data, negotiate)['score'].values

    def explanation_col_normalizer(self, condition):
        """
        Converts lime's as_list() column names to normal output: (column_name, more_than, less_than) as more_than < column_name < less_than 
        Returns more_than and less_than are float values or None if inf
        """
        pattern = r'(((?P<conv_1>-?\d*\.?\d*)\s*(?P<sign_1>(<=|>=|>|<)))?\s*(?P<column>\w+)\s*((?P<sign_2>(<=|>=|>|<))\s*(?P<conv_2>-?\d*\.?\d*))?)'

        match = re.search(pattern, condition)
        if match:
            conv1 = match.group('conv_1')
            sign1 = match.group('sign_1')
            conv2 = match.group('conv_2')
            sign2 = match.group('sign_2')
            column_name = match.group('column')

            more_than = -np.inf
            less_than = np.inf

            if sign1 in ['<', '<=']:
                if conv1 == None or conv1 == '':
                    conv1 = float("-inf")
                else:
                    conv1 = float(conv1)

                more_than = max(more_than, conv1)
            elif sign1 in ['>', '>=']:
                if conv1 == None or conv1 == '':
                    conv1 = float("inf")
                else:
                    conv1 = float(conv1)

                less_than = min(less_than, conv1)

            if sign2 in ['<', '<=']:
                if conv2 == None or conv2 == '':
                    conv2 = float("inf")
                else:
                    conv2 = float(conv2)

                less_than = min(less_than, conv2)
            elif sign2 in ['>', '>=']:
                if conv2 == None or conv2 == '':
                    conv2 = float("-inf")
                else:
                    conv2 = float(conv2)

                more_than = max(more_than, conv2)
            
            if more_than == float("-inf"):
                more_than = None
            
            if less_than == float("inf"):
                less_than = None

            return (column_name, more_than, less_than)
        else:
            return ("", None, None)
    
    def filter_explanation_columns(self, col_name: str):
        return not col_name.startswith(("contractor_id", "building_id", "project_id", "specialization_id"))

    def generate_explanation_text(self, exp_dict: dict) -> str:
        positive_explanations = []
        negative_explanations = []

        columns_explanation = sorted(exp_dict['columns_explanation'], key=lambda x: x['importance'], reverse=True)
        if len(columns_explanation) > 10:
            columns_explanation = columns_explanation[:5] + columns_explanation[-5:]

        for column in columns_explanation:
            column_name = column['column_name']
            more_than = column['more_than']
            less_than = column['less_than']
            importance = column['importance']
            real_value = column['real_value']

            if more_than is not None and less_than is not None:
                explanation = f"{column_name}, потому что значение столбца {real_value:.2f} находится в промежутке от {more_than:.2f} до {less_than:.2f}"
            elif more_than is not None:
                explanation = f"{column_name}, потому что значение столбца {real_value:.2f} {"больше или равно" if f"{real_value:.2f}" == f"{more_than:.2f}" else "больше чем" } {more_than:.2f}"
            elif less_than is not None:
                explanation = f"{column_name}, потому что значение столбца {real_value:.2f} {"меньше или равно" if f"{real_value:.2f}" == f"{less_than:.2f}" else "меньше чем" } {less_than:.2f}"
            else:
                explanation = ""

            if importance > 0:
                positive_explanations.append(explanation)
            else:
                negative_explanations.append(explanation)

        explanation_text = "Самыми важными колонками за увеличение вероятности дефолта были:\n"
        explanation_text += "\n".join(f"{i + 1}) {explanation}" for i, explanation in enumerate(positive_explanations))

        explanation_text += "\n\nА самыми важными против вероятности дефолта:\n"
        explanation_text += "\n".join(f"{i + 1}) {explanation}" for i, explanation in enumerate(negative_explanations))

        return explanation_text
    
    def explanation_to_dict(self, exp: Any, row: pd.Series) -> dict:
        vals = row[list(filter(self.filter_explanation_columns, self.final_cols))].to_dict()
        predict = exp.predict_proba
        contract_id = row['contract_id']
        report_date = row['report_date']

        def expli_mapper(x):
            nm = self.explanation_col_normalizer(x[0])
            return {
                "column_name": nm[0],
                "more_than": nm[1],
                "less_than": nm[2],
                "importance": x[1]
            }
        
        cols_list = exp.as_list()
        cols_list = list(filter(lambda x: self.filter_explanation_columns(x['column_name']), map(lambda x: expli_mapper(x), cols_list)))
        for cd in cols_list:
            cd['real_value'] = vals[cd['column_name']]

        exp_dict = {
            "contract_id": contract_id,
            "report_date": report_date,
            "predict_score_negative": predict[0],
            "predict_score_positive": predict[1],
            "columns_explanation": cols_list
        }

        return exp_dict

    
    def explain_row(self, row: pd.Series) -> dict:
        exp = self.explainer.explain_instance(row[self.final_cols].values, self.sub_model_rfc.predict_proba, num_features=len(self.final_cols))
        return self.explanation_to_dict(exp, row)
    
    def explain_(self, rows: pd.DataFrame) -> list[dict]:
        explains = [self.explain_row(row) for _, row in rows.iterrows()]
        return explains
    
    def explain_all(self, data: pd.DataFrame) -> list[dict]:
        data_prf = self.preprocess_base_model(data)
        explains = self.explain_(data_prf)

        return explains 
    
    # Функция для определения сезона по месяцу даты
    # Принимает на вход объект даты (date) и возвращает строку с названием сезона
    def get_season(self, date) -> str:
        if date.month in [12, 1, 2]:
            return 'winter'  # Зима: декабрь, январь, февраль
        elif date.month in [3, 4, 5]:
            return 'spring'  # Весна: март, апрель, май
        elif date.month in [6, 7, 8]:
            return 'summer'  # Лето: июнь, июль, август
        else:
            return 'autumn'  # Осень: сентябрь, октябрь, ноябрь

    # Основная функция для добавления временных признаков в DataFrame
    # df - входной DataFrame, start_col - столбец с начальной датой, end_col - столбец с конечной датой
    def create_time_features(self, df: pd.DataFrame, start_col: str, end_col: str) -> pd.DataFrame:
        # Преобразуем столбцы с датами в формат datetime для работы с ними как с датами
        df[start_col] = pd.to_datetime(df[start_col])
        df[end_col] = pd.to_datetime(df[end_col])
        
        # Создаем новый столбец, который содержит диапазон всех дат от начальной до конечной
        df['date_range'] = df.apply(lambda row: pd.date_range(start=row[start_col], end=row[end_col]), axis=1)
        
        # Создаем столбец с общим количеством дней в диапазоне
        df['total_days'] = df['date_range'].apply(len)
        
        # Считаем долю зимних дней в диапазоне и нормализуем на общее количество дней
        df['winter'] = df['date_range'].apply(lambda dates: sum(self.get_season(date) == 'winter' for date in dates)) / df['total_days']
        
        # Считаем долю весенних дней в диапазоне и нормализуем на общее количество дней
        df['spring'] = df['date_range'].apply(lambda dates: sum(self.get_season(date) == 'spring' for date in dates)) / df['total_days']
        
        # Считаем долю летних дней в диапазоне и нормализуем на общее количество дней
        df['summer'] = df['date_range'].apply(lambda dates: sum(self.get_season(date) == 'summer' for date in dates)) / df['total_days']
        
        # Считаем долю осенних дней в диапазоне и нормализуем на общее количество дней
        df['autumn'] = df['date_range'].apply(lambda dates: sum(self.get_season(date) == 'autumn' for date in dates)) / df['total_days']
        
        # Считаем долю праздничных дней в диапазоне на основе списка российских праздников
        df['holidays'] = df['date_range'].apply(lambda dates: sum(date in self.ru_holidays for date in dates)) / df['total_days']
        
        # Считаем долю выходных дней (суббота и воскресенье) и нормализуем на общее количество дней
        df['weekends'] = df['date_range'].apply(lambda dates: sum(date.weekday() >= 5 for date in dates)) / df['total_days']
        
        # Считаем долю рабочих дней (понедельник-пятница) с учетом того, что праздники также считаются нерабочими
        df['workdays'] = df['date_range'].apply(lambda dates: sum(date.weekday() < 5 for date in dates) - sum(date in self.ru_holidays for date in dates)) / df['total_days']
        
        # Функция для подсчета количества "длинных" выходных
        # Длинные выходные определяются как выходные дни, соединенные с праздничными днями
        def count_long_weekends(dates):
            long_weekends = 0
            for i in range(1, len(dates)):  # Начинаем со второго дня диапазона
                # Если предыдущий день был выходным, а текущий — праздничным, увеличиваем счетчик длинных выходных
                if dates[i - 1].weekday() >= 5 and dates[i] in self.ru_holidays:
                    long_weekends += 1
            return long_weekends
        
        # Добавляем столбец с долей длинных выходных дней в общем количестве дней
        df['long_weekends'] = df['date_range'].apply(lambda dates: count_long_weekends(dates)) / df['total_days']
        
        # Нормализуем общее количество дней на 366, чтобы учесть високосные годы
        df['total_days'] = df['total_days'] / 366

        return df  # Возвращаем DataFrame с новыми признаками 

    def reduce_mem_usage(self, df: pd.DataFrame) -> pd.DataFrame:
        """ Проходит по всем столбцам DataFrame и изменяет тип данных
            для уменьшения использования памяти.
        """
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

        for col in df.columns:
            col_type = df[col].dtype.name

            if col_type not in ['object', 'category', 'datetime64[ns, UTC]']:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                elif str(col_type)[:5] == 'float':
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)

        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

        return df

    # Функция для создания дополнительных признаков на основе финансовых и других данных
    def create_advanced_features(self, df: pd.DataFrame) -> pd.DataFrame:
        
        # Финансовые коэффициенты
        
        # Текущая ликвидность (Current Ratio)
        # Показывает отношение текущих активов компании к её краткосрочным обязательствам.
        # Формула: Текущие активы / Краткосрочные обязательства
        df['current_ratio'] = df['agg_Finance__g_contractor__Value__CurrentAssets__last__ALL_TIME'] / df['agg_Finance__g_contractor__Value__ShortLiabilities__last__ALL_TIME']
        
        # Соотношение долга к капиталу (Debt to Equity Ratio)
        # Показывает соотношение долгов компании (краткосрочные и долгосрочные обязательства) к её собственному капиталу.
        # Формула: (Краткосрочные обязательства + Долгосрочные обязательства) / Собственный капитал
        df['debt_to_equity'] = (df['agg_Finance__g_contractor__Value__ShortLiabilities__last__ALL_TIME'] + df['agg_Finance__g_contractor__Value__LongLiabilities__last__ALL_TIME']) / df['agg_Finance__g_contractor__Value__Capital__last__ALL_TIME']
        
        # Рентабельность активов (Return on Assets, ROA)
        # Показывает, насколько эффективно компания использует свои активы для получения прибыли.
        # Формула: Чистая прибыль / Общие активы
        df['return_on_assets'] = df['agg_Finance__g_contractor__Value__NetProfit_y__last__ALL_TIME'] / df['agg_Finance__g_contractor__Value__Balance__last__ALL_TIME']
        
        # Динамика изменения финансовых и судебных показателей
        
        # Изменение сумм исков против компании за последние 12 месяцев по сравнению с предыдущими 12-24 месяцами
        df['claims_change_12_24'] = df['agg_ArbitrationCases__g_contractor__DefendantSum__sum__12M'] - df['agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_24M']
        
        # Изменение сумм исков против компании за предыдущие 12-24 месяца по сравнению с 24-36 месяцами
        df['claims_change_24_36'] = df['agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_24M'] - df['agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_36M']
        
        # Изменение прибыли компании относительно её выручки
        # Показывает, насколько прибыльной была компания относительно общего дохода (выручки).
        # Формула: Чистая прибыль / Выручка
        df['profit_change'] = df['agg_Finance__g_contractor__Value__NetProfit_y__last__ALL_TIME'] / df['agg_Finance__g_contractor__Value__Revenue_y__last__ALL_TIME']
        
        # Суммарные показатели и рейтинги
        
        # Индекс надежности компании
        # Рассчитывается как среднее значение нескольких индексов, которые отражают общее состояние компании,
        # её риск несостоятельности, платежеспособность и уровень добросовестности.
        df['reliability_index'] = (
            df['agg_ConsolidatedIndicator__g_contractor__Index__Overall__mean__ALL_TIME'] +  # Общий индекс компании
            df['agg_ConsolidatedIndicator__g_contractor__Index__FailureScore__mean__ALL_TIME'] +  # Индекс вероятности банкротства
            df['agg_ConsolidatedIndicator__g_contractor__Index__PaymentIndex__mean__ALL_TIME'] +  # Платежный индекс
            df['agg_ConsolidatedIndicator__g_contractor__Index__IndexOfDueDiligence__mean__ALL_TIME']  # Индекс добросовестности
        ) / 4  # Среднее значение этих индексов
        
        # Активность по залогам
        
        # Активность по залогам — разница между количеством активных залогов и прекращённых залогов.
        df['pledger_activity'] = df['agg_spark_extended_report__g_contractor__PledgerActiveCount__last__ALL_TIME'] - df['agg_spark_extended_report__g_contractor__PledgerCeasedCount__last__ALL_TIME']
        
        # Исторические данные по искам
        
        # Общая сумма исков за последние 12 месяцев (ответчик и истец)
        df['total_claims_last_12_months'] = df['agg_ArbitrationCases__g_contractor__DefendantSum__sum__12M'] + df['agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12M']
        
        # Общая сумма исков за последние 24 месяца (ответчик и истец)
        df['total_claims_last_24_months'] = df['agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_24M'] + df['agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12_24M']
        
        # Логарифмическое преобразование
        
        # Логарифм от оценочной суммы обязательств компании
        df['agg_spark_extended_report__g_contractor__EstimatedLiabilitiesSum__last__ALL_TIME'] = np.log1p(df['agg_spark_extended_report__g_contractor__EstimatedLiabilitiesSum__last__ALL_TIME'])
        
        # Логарифм от выручки компании за последний год
        df['agg_Finance__g_contractor__Value__Revenue_y__last__ALL_TIME'] = np.log1p(df['agg_Finance__g_contractor__Value__Revenue_y__last__ALL_TIME'])
        
        # Новые финансовые признаки
        
        # Абсолютное изменение суммы контракта (текущая сумма минус начальная сумма)
        df['contract_sum_change'] = df['contract_current_sum'] - df['contract_init_sum']
        
        # Процентное изменение суммы контракта (относительное изменение текущей суммы к начальной)
        # Если значение бесконечно (например, начальная сумма контракта была 0), заменяем его на NaN
        df['contract_sum_change_percentage'] = (df['contract_sum_change'] / df['contract_init_sum']).replace([np.inf, -np.inf], np.nan)
        
        # Логарифм от текущей суммы контракта (логарифмическое преобразование для нормализации данных)
        df['contract_current_sum'] = np.log1p(df['contract_current_sum'])
        
        # Нормализуем изменение суммы контракта по общему количеству контрактов
        # Делим изменение суммы контракта на количество контрактов, чтобы получить среднее изменение на контракт
        df['contract_change_per_contract'] = df['contract_sum_change'] / df['agg_all_contracts__g_contract__bit_da_guid__isMain__count__ALL_TIME']
        
        return df  # Возвращаем DataFrame с новыми признаками 


    def work_graph(self, df: pd.DataFrame) -> pd.DataFrame:
        # Создаем словари для contractor_id1
        contractor1_group = self.graph.groupby('contractor_id1')['Distance']
        
        contractor1_min = contractor1_group.min().to_dict()
        contractor1_max = contractor1_group.max().to_dict()
        contractor1_mean = contractor1_group.mean().to_dict()
        contractor1_median = contractor1_group.median().to_dict()
        contractor1_sum = contractor1_group.sum().to_dict()
        contractor1_count = contractor1_group.count().to_dict()

        # Создаем словари для contractor_id2
        contractor2_group = self.graph.groupby('contractor_id2')['Distance']
        
        contractor2_min = contractor2_group.min().to_dict()
        contractor2_max = contractor2_group.max().to_dict()
        contractor2_mean = contractor2_group.mean().to_dict()
        contractor2_median = contractor2_group.median().to_dict()
        contractor2_sum = contractor2_group.sum().to_dict()
        contractor2_count = contractor2_group.count().to_dict()

        # Используем словари для ускорения поиска
        df['Distance_to_contractor_min'] = df['contractor_id'].map(contractor1_min).fillna(-1)
        df['Distance_to_contractor_max'] = df['contractor_id'].map(contractor1_max).fillna(-1)
        df['Distance_to_contractor_mean'] = df['contractor_id'].map(contractor1_mean).fillna(-1)
        df['Distance_to_contractor_median'] = df['contractor_id'].map(contractor1_median).fillna(-1)
        df['Distance_to_contractor_sum'] = df['contractor_id'].map(contractor1_sum).fillna(-1)
        df['Distance_to_contractor_count'] = df['contractor_id'].map(contractor1_count).fillna(0)
        
        df['Distance_from_contractor_min'] = df['contractor_id'].map(contractor2_min).fillna(-1)
        df['Distance_from_contractor_max'] = df['contractor_id'].map(contractor2_max).fillna(-1)
        df['Distance_from_contractor_mean'] = df['contractor_id'].map(contractor2_mean).fillna(-1)
        df['Distance_from_contractor_median'] = df['contractor_id'].map(contractor2_median).fillna(-1)
        df['Distance_from_contractor_sum'] = df['contractor_id'].map(contractor2_sum).fillna(-1)
        df['Distance_from_contractor_count'] = df['contractor_id'].map(contractor2_count).fillna(0)

        return df

    @property
    def feature_importances_(self):
        return pd.DataFrame({"feature": self.final_cols, "importance": self.sub_model_rfc.feature_importances_}).sort_values("importance", ascending=False)



In [5]:
# Build the model once: the constructor reads every weight and side-data file from WEIGHTS_PATH.
model = ScoringModel(Path(WEIGHTS_PATH))

In [None]:
# Raw test data; it is passed unmodified to predict_result and preprocess_base_model below.
test_df  = pd.read_csv(TEST_DATAFRAME_PATH)

In [7]:
OUTPUT_PATH = Path(OUTPUT_DIR_PATH)
# Create the output directory up front: to_csv() and open() in the cells below
# do not create missing parent directories and would fail on a fresh checkout.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

In [8]:
# Score every row; the result has the columns contract_id, report_date and score.
submission = model.predict_result(test_df)

Memory usage of dataframe is 0.01 MB
Memory usage after optimization is: 0.00 MB
Decreased by 72.4%
Memory usage of dataframe is 0.01 MB
Memory usage after optimization is: 0.00 MB
Decreased by 72.4%


  0%|          | 0/5 [00:00<?, ?it/s]

In [9]:
# Persist the scores without the DataFrame index.
submission.to_csv(OUTPUT_PATH / "submission.csv", index=False)

In [10]:
# Explain every preprocessed row with LIME and attach a readable interpretation.
test_prf = model.preprocess_base_model(test_df)
expds = list()

# iterrows() has no length, so pass total= explicitly to get a real progress bar.
for _, rw in tqdm(test_prf.iterrows(), total=len(test_prf)):
    expr = model.explain_row(rw)
    # Timestamps are not JSON serializable; store the report date as text.
    expr['report_date'] = str(expr['report_date'])
    expr['interpretation'] = model.generate_explanation_text(expr)
    expds.append(expr)

# ensure_ascii=False writes the Cyrillic text verbatim, so the file encoding is
# pinned to UTF-8 rather than left to the platform's locale default.
with open(OUTPUT_PATH / "explanations.json", 'w', encoding="utf-8") as fd:
    json.dump(expds, fd, ensure_ascii=False)

Memory usage of dataframe is 0.01 MB
Memory usage after optimization is: 0.00 MB
Decreased by 72.4%


0it [00:00, ?it/s]

In [11]:
# Join scores and explanations by row position (index); columns that exist in both frames
# get the "_r" suffix on the explanation side, so the selected ones come from `submission`.
submission_final = submission.join(pd.DataFrame(expds), rsuffix="_r")[['contract_id', 'report_date', 'score', 'interpretation']]

In [13]:
# Persist scores together with their interpretation text, without the DataFrame index.
submission_final.to_csv(OUTPUT_PATH / "submission_final.csv", index=False)