In [1]:
import pandas as pd
import numpy as np

import re

from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from tqdm.notebook import tqdm

import yaml
import joblib

In [2]:
def extract_purchases(string):
    return list(map(int, re.findall(r"'(\d+)'", string)))

In [3]:
config_path = "../config/params.yaml"
config = yaml.load(open(config_path), Loader=yaml.FullLoader)

preproc = config["preprocessing"]
recommender_train = config['train']['recommender']
win_predictor_train = config['train']['win_predictor']
evaluate = config['evaluate']

# Evaluate

In [4]:
df_train = pd.read_csv(preproc['train_data'])
df_train = df_train.set_index('index')
df_train[:5]

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,141936,1,406903.35,10960,40062,0,[ 0.12142407 -0.0336704 -0.00717449 -0.101091...,2,62.0_26,1,0.0,3
1,75199,1,299821.66,7160,40062,0,[ 0.02812369 0.02294252 0.0104262 0.006939...,3,26.2_65,4,0.0,3
2,97840,1,366250.0,8259,40062,0,[ 0.08153069 0.03020425 -0.01425114 -0.042313...,2,62.0_72,5,0.0,3
3,64052,1,85013.0,6350,40062,0,[ 7.34494067e-02 3.06627049e-02 -1.24606798e-...,2,58.2_46,6,0.0,3
4,432364,0,13554.0,9788,5495,1,[ 0.09296399 0.19767287 -0.05634406 -0.004251...,7,drug_77,1,0.0,2


In [5]:
df_evaluate = pd.read_csv(evaluate['evaluate_data'])
df_evaluate = df_evaluate.set_index('index')
df_evaluate[:5]

Unnamed: 0_level_0,purchase,region_code,min_publish_date,forsmallbiz,price,customer,okpd2_code,supplier,vectorized,is_winner
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,596065,77,2021-03-26,0,53874.0,9582,drug,2940,[ 0.12289912 0.24247403 -0.06769952 -0.004744...,1
1,467821,77,2022-09-23,1,7500000.0,9591,38.2,7538,[ 0.05361487 0.01407206 -0.00556184 -0.023678...,1
2,88928,68,2022-08-05,1,281370.35,7831,26.2,11016,[ 0.07224658 0.01950507 0.01632774 0.025729...,0
3,137498,89,2022-03-24,1,1732800.0,10312,32.5,3939,[ 1.52509879e-02 2.62222039e-02 -1.16917311e-...,0
4,593712,77,2022-08-31,1,320725.5,9600,32.5,112854,[ 0.06823339 0.08441228 -0.01269019 -0.001212...,0


In [6]:
df_submission = pd.read_csv(evaluate['recommend_sub_path'])
df_submission = df_submission.set_index('index')
df_submission[:5]

Unnamed: 0_level_0,purchases
index,Unnamed: 1_level_1
1,"['148757', '151442', '1729', '2435']"
10,['20']
100,['162249']
10000,"['73654', '23926']"
100008,['608877']


In [7]:
df_submission = df_submission['purchases'].apply(extract_purchases)

In [8]:
# отсортируем по длине массива
df_submission = df_submission.reindex(df_submission.apply(len).sort_values(
    ascending=False).index)

In [9]:
def extract_words(string):
    return list(map(float, string[1:-1].split()))


def get_month(df_test):
    return df_test['min_publish_date'].apply(lambda x: int(x.split('-')[1]))


def add_reg_code(df_test):
    return df_test['okpd2_code'].astype('str') + '_' \
                 + df_test['region_code'].astype('str')


def add_purchase_size(df_test):
    return df_test.merge(df_test.groupby('purchase')['supplier'] \
                                .size().to_frame('purchase_size'), 
                         on='purchase', how='outer')

    
def add_flag(df_test, df_train):
    return df_test.merge(df_train[['supplier', 'customer', 'flag_won']] \
                                 .groupby(['supplier', 'customer']).tail(1), 
                         on=['supplier', 'customer'], how='left').fillna(0)


def add_unique_okpd2(df_test, df_train):
    return df_test.merge(df_train[['supplier', 'n_unique_okpd2']] \
                                 .groupby('supplier').tail(1), 
                         on=['supplier'], how='left').fillna(1)


def generate_features(df_test, df_train):
    """
    Генерация признаков
    """
    # Выделение месяца публикации закупки
    df_test['month'] = get_month(df_test)
    
    # Выделение инедентификатора, объединяющего регион и ОКПД закупки
    df_test['reg_code'] = add_reg_code(df_test)
    
    # Выделение кол-ва участников в закупке
    df_test = add_purchase_size(df_test)
    
    # Выделение флага работы поставщика с покупателем
    df_test = add_flag(df_test, df_train)
    
    # Выделение кол-ва уникальных ОКПД поставщика
    df_test = add_unique_okpd2(df_test, df_train)
    
    return df_test 

In [10]:
def pipeline_preproc(df_test: pd.DataFrame, df_train: pd.DataFrame) -> tuple:
    """
    Функция выполняет предобработку датасетов.

    Args:
        df_test (pd.DataFrame): тестовый датасет
        df_train (pd.DataFrame): тренировочный датасет

    Returns:
        tuple: кортеж из двух элементов, содержащий измененные тестовый и 
        тренировочный датасеты
    """

    # Выделение вектора токенов из строки
    df_train['vectorized'] = df_train['vectorized'].apply(extract_words)
    df_test['vectorized'] = df_test['vectorized'].apply(extract_words)
    
    # Генерация признаков
    df_test = generate_features(df_test, df_train)
    
    df_test = df_test.drop(columns=preproc['drop_columns'])
    
    # Преобразование типов столбцов
    df_train = df_train.astype(preproc['change_type_columns'])
    df_test = df_test.astype(preproc['change_type_columns'])

    
    return df_test, df_train

In [None]:
df_evaluate, df_train = pipeline_preproc(df_evaluate, df_train)

In [None]:
df_evaluate[:5]

## Recommender

In [None]:
def transform_vector(df_train, df_test):
    """
    Генерация признаков на основе вектора токенов
    """
    for i in range(100):
        df_train[str(i)] = df_train['vectorized'].apply(lambda x: x[i])
        df_test[str(i)] = df_test['vectorized'].apply(lambda x: x[i])
        
    return df_train, df_test


def supplier_data(df_train: pd.DataFrame, df_test: pd.DataFrame, 
                  sup: str) -> tuple:
    """
    Filters train and test DataFrames based on the unique reg_code of a given supplier.
    Removes unnecessary columns and duplicates.
    Drops purchases that exist in both train and test DataFrames.
    
    Parameters:
    -----------
    df_train: pd.DataFrame
    The train DataFrame
    df_test: pd.DataFrame
    The test DataFrame
    sup: str
    The name of the supplier to filter the DataFrames by

    Returns:
    --------
    Tuple[pd.DataFrame, pd.DataFrame]
    A tuple of filtered DataFrames for train and test, respectively.
    """


    unique_reg_okpd = df_train[df_train['supplier'] == sup]['reg_code'].unique()
    
    # фильтруем train и test на основе уникальных reg_code поставщиков 
    df_sup_train = df_train[df_train['reg_code'].isin(unique_reg_okpd)]
    df_sup_test = df_test[df_test['reg_code'].isin(unique_reg_okpd)]

    
    if df_sup_test.empty:
        df_sup_test = df_test
    
    # удаляем ненужные для системы рекомендаций стобцы и дубликаты
    df_sup_train = df_sup_train.drop(columns=recommender_train['drop_columns']) \
                               .drop_duplicates()
    df_sup_test = df_sup_test.drop(columns=recommender_train['drop_columns']) \
                             .drop_duplicates()
    

    df_sup_test = df_sup_test.set_index('purchase')
    df_sup_train = df_sup_train.set_index('purchase')
    
    # удаляем закупки, которые есть и test, и в train
    df_sup_train = df_sup_train.drop(set(df_submission[sup]).intersection(df_sup_train.index))
    df_sup_test = df_sup_test[~df_sup_test.index.isin(df_sup_train.index)]
    
    
    return df_sup_train, df_sup_test   


def compute_recommends(df_train: pd.DataFrame, 
                       df_evaluate: pd.DataFrame, 
                       df_submission: pd.DataFrame, 
                       models: dict, n: int) -> pd.Series:
    """
    Определение рекомендаций для n наиболее активных пользователей
    """
    recommends = pd.Series()
    
    # Преобразование векторов
    df_train, df_evaluate = transform_vector(df_train, df_evaluate)
    
    # Итерация по каждому поставщику из набора df_submission
    for sup in tqdm(df_submission.index[:n]):
        
        # Получение набора данных для оценки для текущего поставщика
        df_sup_evaluate = supplier_data(df_train, df_evaluate, sup)[1]
        
        # Предсказание моделью
        y_pred = models[sup].predict(df_sup_evaluate)
        
        # Получение рекомендуемых товаров
        recommends[str(sup)] = df_sup_evaluate[y_pred==1].index.tolist()
    
    
    return recommends

In [None]:
models = joblib.load(recommender_train['models'])

In [23]:
recommends = compute_recommends(df_train.copy(), df_evaluate.copy(), 
                                df_submission, models, 100)

  recommends = pd.Series()
  df_train[str(i)] = df_train['vectorized'].apply(lambda x: x[i])
  df_test[str(i)] = df_test['vectorized'].apply(lambda x: x[i])
  df_train[str(i)] = df_train['vectorized'].apply(lambda x: x[i])
  df_test[str(i)] = df_test['vectorized'].apply(lambda x: x[i])
  df_train[str(i)] = df_train['vectorized'].apply(lambda x: x[i])
  df_test[str(i)] = df_test['vectorized'].apply(lambda x: x[i])


  0%|          | 0/100 [00:00<?, ?it/s]

In [24]:
recommends[:5]

694     [588862, 101380, 160529, 152929, 152298, 51828...
5495    [430622, 433589, 605138, 595582, 430410, 42893...
2468    [211473, 601367, 604033, 34831, 76616, 101291,...
12      [115672, 587178, 167221, 146021, 20005, 78767,...
2622    [99074, 136902, 410088, 596704, 34842, 158353,...
dtype: object

## Probability of winning

In [25]:
df_evaluate[:5]

Unnamed: 0,purchase,forsmallbiz,price,customer,supplier,vectorized,is_winner,month,reg_code,purchase_size,flag_won,n_unique_okpd2
0,596065,0,53874.0,9582,2940,"[0.12289912, 0.24247403, -0.06769952, -0.00474...",1,3,drug_77,1,1.0,2
1,467821,1,7500000.0,9591,7538,"[0.05361487, 0.01407206, -0.00556184, -0.02367...",1,9,38.2_77,1,0.0,4
2,88928,1,281370.35,7831,11016,"[0.07224658, 0.01950507, 0.01632774, 0.0257298...",0,8,26.2_68,3,0.0,5
3,88928,1,281370.35,7831,574,"[0.07224658, 0.01950507, 0.01632774, 0.0257298...",0,8,26.2_68,3,0.0,8
4,88928,1,281370.35,7831,7749,"[0.07224658, 0.01950507, 0.01632774, 0.0257298...",0,8,26.2_68,3,0.0,8


In [28]:
def remove_columns(df_test):
    df_test = df_test.drop(columns=win_predictor_train['drop_columns'])
    
    return df_test.drop('is_winner', axis=1)

def get_meta_features(x_test: pd.DataFrame, 
                       catboost: CatBoostClassifier, 
                       xgboost: XGBClassifier) -> np.ndarray:
    """
    Функция для получения мета-признаков на тестовых данных.

    Аргументы:
    - x_test: pd.DataFrame, тестовые данные, на которых будет делаться предсказание.
    - catboost: CatBoostClassifier, обученная модель CatBoost.
    - xgboost: XGBClassifier, обученная модель XGBoost.

    Возвращает:
    - predictions_test: np.ndarray, массив значений предсказанных классов для каждого объекта тестовых данных.
    """

    # Предсказание вероятности класса 1 на тестовых данных с помощью CatBoost
    y_score = catboost.predict_proba(x_test)
    predictions_test = y_score[:, 1]
    
    # Предсказание вероятности класса 1 на тестовых данных с помощью XGBoost
    y_score = xgboost.predict_proba(x_test)
    predictions_test = np.vstack((predictions_test, y_score[:, 1]))
    
    # Транспонирование массива и возврат его значения
    return predictions_test.T


def compute_winner(df_evaluate: pd.DataFrame, 
                    models: dict) -> np.ndarray:
    """
    Функция для получения прогноза победителя в торгах на основе мета-признаков.

    Аргументы:
    - df_evaluate: pd.DataFrame, данные для оценки, на основе которых будет делаться прогноз.
    - models: dict, словарь, содержащий обученные модели для прогнозирования.

    Возвращает:
    - np.ndarray, массив, содержащий вероятности того, что поставщик победит в торгах.
    """

    # Удаление столбцов, которые не нужны для предсказания
    df_evaluate = remove_columns(df_evaluate)
    
    # Получение мета-признаков с помощью функции get_meta_features
    meta_features = get_meta_features(df_evaluate, 
                                      models['catboost'], 
                                      models['xgboost'])
    
    # Предсказание вероятности класса 1 на мета-признаках с помощью Naive Bayes
    return models['Naive_bayes'].predict_proba(meta_features)[:, 1]

In [29]:
models = joblib.load(win_predictor_train['models'])

In [30]:
compute_winner(df_evaluate, models)

array([0.99999998, 0.98055527, 0.35806481, ..., 0.99999451, 0.99848522,
       0.99992103])