In [None]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, \
                            recall_score, f1_score, log_loss

from lightgbm import LGBMClassifier

import optuna

from typing import Tuple, Set

import yaml

import joblib

import re

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Only show optuna messages at WARNING level and above
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Switch off pandas' chained-assignment (SettingWithCopy) warning
pd.set_option('mode.chained_assignment', None)

In [None]:
def extract_purchases(string):
    """Parse a stringified list of quoted ids into a list of ints.

    Every run of digits wrapped in single quotes is collected in order of
    appearance, e.g. "['73654', '23926']" -> [73654, 23926].
    """
    quoted_number = re.compile(r"'(\d+)'")
    return [int(found.group(1)) for found in quoted_number.finditer(string)]

In [None]:
def extract_vector(string):
    """Parse a whitespace-separated vector such as "[ 0.1 -0.2 3e-2]" into floats.

    The first and last characters (the enclosing brackets) are cut off and the
    remaining text is split on whitespace.
    """
    inner = string[1:-1]
    return [float(token) for token in inner.split()]

In [None]:
def get_metrics(y_test, y_pred, y_score, name = "Default"):
    """Build a one-row DataFrame of classification metrics.

    Parameters
    ----------
    y_test : true labels
    y_pred : predicted labels
    y_score : predicted class probabilities; column 1 is used for ROC AUC and
        the whole array for log loss
    name : value stored in the 'model' column

    Returns
    -------
    pd.DataFrame with columns model, Accuracy, ROC_AUC, Precision, Recall,
    f1, Logloss.

    If any of the five score-based metrics raises ValueError, all five are
    reported as 0 (Accuracy is always computed).
    """
    row = {'model': name, 'Accuracy': accuracy_score(y_test, y_pred)}

    try:
        scores = {
            'ROC_AUC': roc_auc_score(y_test, y_score[:, 1]),
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'f1': f1_score(y_test, y_pred, zero_division=0),
            'Logloss': log_loss(y_test, y_score),
        }
    except ValueError:
        # one failing metric zeroes the whole group
        scores = dict.fromkeys(
            ['ROC_AUC', 'Precision', 'Recall', 'f1', 'Logloss'], 0)

    row.update(scores)
    return pd.DataFrame([row])

In [None]:
def open_file(file_path):
    """Read the YAML document stored at `file_path` and return the parsed object."""
    with open(file_path, 'r') as stream:
        content = yaml.safe_load(stream)
    return content
    
def save_file(file_path, data):
    """Write `data` as YAML to `file_path`, overwriting any existing file."""
    with open(file_path, 'w') as stream:
        yaml.dump(data, stream)

In [None]:
config_path = "../config/params.yaml"
# Read the project configuration inside a context manager so the file handle
# is closed (a bare open() handed to yaml.load is never closed explicitly).
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Sections used throughout the notebook
preproc = config["preprocessing"]
train = config["train"]['recommender']

In [None]:
# Per-supplier model parameters read from the YAML file at train['params']
recommender_params = open_file(file_path=train['params'])

In [25]:
# Accumulates the averaged baseline / tuned metrics before they are saved
recommender_metrics = dict()

# Baseline

В данном блоке мы строим и обучаем рекомендательную систему. Также подбираем параметры моделей с помощью байесовского оптимизатора.  

Результатом этого блока являются файлы с моделями для каждого поставщика, лучшими параметрами и метриками качества моделей.

In [9]:
# Load the training table and use its 'index' column as the row index
df_train = pd.read_csv(preproc['train_data']).set_index('index')

df_train.head()

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,141936,1,406903.35,10960,40062,0,[ 0.12142407 -0.0336704 -0.00717449 -0.101091...,2,62.0_26,1,0.0,3
1,75199,1,299821.66,7160,40062,0,[ 0.02812369 0.02294252 0.0104262 0.006939...,3,26.2_65,4,0.0,3
2,97840,1,366250.0,8259,40062,0,[ 0.08153069 0.03020425 -0.01425114 -0.042313...,2,62.0_72,5,0.0,3
3,64052,1,85013.0,6350,40062,0,[ 7.34494067e-02 3.06627049e-02 -1.24606798e-...,2,58.2_46,6,0.0,3
4,432364,0,13554.0,9788,5495,1,[ 0.09296399 0.19767287 -0.05634406 -0.004251...,7,drug_77,1,0.0,2


In [10]:
# Load the test table and use its 'index' column as the row index
df_test = pd.read_csv(preproc['test_data']).set_index('index')

df_test.head()

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,596065,0,53874.0,9582,2940,1,[ 0.12289912 0.24247403 -0.06769952 -0.004744...,3,drug_77,1,1.0,2
1,467821,1,7500000.0,9591,7538,1,[ 0.05361487 0.01407206 -0.00556184 -0.023678...,9,38.2_77,1,0.0,4
2,88928,1,281370.35,7831,11016,0,[ 0.07224658 0.01950507 0.01632774 0.025729...,8,26.2_68,3,0.0,5
3,88928,1,281370.35,7831,574,0,[ 0.07224658 0.01950507 0.01632774 0.025729...,8,26.2_68,3,0.0,8
4,88928,1,281370.35,7831,7749,0,[ 0.07224658 0.01950507 0.01632774 0.025729...,8,26.2_68,3,0.0,8


In [11]:
# Load the per-index purchase lists (still stored as strings at this point)
df_submission = pd.read_csv(preproc['recommend_sub_path']).set_index('index')

df_submission.head()

Unnamed: 0_level_0,purchases
index,Unnamed: 1_level_1
1,"['148757', '151442', '1729', '2435']"
10,['20']
100,['162249']
10000,"['73654', '23926']"
100008,['608877']


In [12]:
# Parse every stringified purchase list into a list of ints.
# From here on df_submission is a Series (one list per index label).
purchases_raw = df_submission['purchases']
df_submission = purchases_raw.apply(extract_purchases)

In [13]:
# Feature generation from the token vector: parse the stringified vector
# of every row into a list of floats
for frame in (df_train, df_test):
    frame['vectorized'] = frame['vectorized'].apply(extract_vector)

In [14]:
# Cast columns to the dtypes listed under preproc['change_type_columns']
column_types = preproc['change_type_columns']
df_train = df_train.astype(column_types)
df_test = df_test.astype(column_types)

In [15]:
# Turn the vector components into separate feature columns named '0'..'99'
n_components = 100  # number of leading vector components used as features

for frame in tqdm((df_train, df_test)):
    vectors = [vec[:n_components] for vec in frame['vectorized']]
    if any(len(vec) != n_components for vec in vectors):
        raise ValueError(
            f"every 'vectorized' entry must have at least {n_components} components")

    # Build the whole matrix once instead of running a Python-level .apply
    # over all rows for each of the components.
    components = np.array(vectors, dtype=float).reshape(len(vectors), n_components)

    for i in range(n_components):
        frame[str(i)] = components[:, i]

  0%|          | 0/100 [00:00<?, ?it/s]

In [16]:
df_train.head()

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized,month,reg_code,purchase_size,...,90,91,92,93,94,95,96,97,98,99
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,141936,1,406903.35,10960,40062,0,"[0.12142407, -0.0336704, -0.00717449, -0.10109...",2,62.0_26,1,...,-0.005042,0.01309,0.001511,-0.020377,0.031295,0.017915,0.030327,0.025129,-0.008975,-0.017989
1,75199,1,299821.66,7160,40062,0,"[0.02812369, 0.02294252, 0.0104262, 0.00693974...",3,26.2_65,4,...,-0.050555,0.002244,-0.017551,-0.01928,0.015864,0.0276,0.003926,-0.018871,0.006411,-0.028425
2,97840,1,366250.0,8259,40062,0,"[0.08153069, 0.03020425, -0.01425114, -0.04231...",2,62.0_72,5,...,-0.022614,-0.006095,0.044239,-0.041233,-0.019609,-0.028953,-0.019426,0.002454,0.009306,-0.030897
3,64052,1,85013.0,6350,40062,0,"[0.0734494067, 0.0306627049, -0.0124606798, -0...",2,58.2_46,6,...,-0.087519,-9.6e-05,0.032252,-0.01121,-0.002309,0.032456,-0.004214,0.000729,0.006364,-0.025677
4,432364,0,13554.0,9788,5495,1,"[0.09296399, 0.19767287, -0.05634406, -0.00425...",7,drug_77,1,...,0.00258,-0.022777,0.000719,0.052623,-0.006604,0.019618,0.023522,0.010618,0.010019,-0.005939


In [17]:
df_test.head()

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized,month,reg_code,purchase_size,...,90,91,92,93,94,95,96,97,98,99
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,596065,0,53874.0,9582,2940,1,"[0.12289912, 0.24247403, -0.06769952, -0.00474...",3,drug_77,1,...,-0.009704,0.014609,0.021662,-0.02695,0.032274,0.005955,0.016249,-0.020369,-0.004099,-0.020508
1,467821,1,7500000.0,9591,7538,1,"[0.05361487, 0.01407206, -0.00556184, -0.02367...",9,38.2_77,1,...,0.006707,-0.005902,-0.010647,-0.024653,-0.014679,-0.01712,0.017729,0.019125,0.01177,0.018462
2,88928,1,281370.35,7831,11016,0,"[0.07224658, 0.01950507, 0.01632774, 0.0257298...",8,26.2_68,3,...,0.026546,-0.114093,0.093395,0.047376,-0.060443,0.010879,-0.024136,0.025733,0.009218,0.02415
3,88928,1,281370.35,7831,574,0,"[0.07224658, 0.01950507, 0.01632774, 0.0257298...",8,26.2_68,3,...,0.026546,-0.114093,0.093395,0.047376,-0.060443,0.010879,-0.024136,0.025733,0.009218,0.02415
4,88928,1,281370.35,7831,7749,0,"[0.07224658, 0.01950507, 0.01632774, 0.0257298...",8,26.2_68,3,...,0.026546,-0.114093,0.093395,0.047376,-0.060443,0.010879,-0.024136,0.025733,0.009218,0.02415


In [18]:
# Order the entries by the length of their purchase list, longest first
list_lengths = df_submission.apply(len)
longest_first = list_lengths.sort_values(ascending=False).index
df_submission = df_submission.reindex(longest_first)

In [19]:
def supplier_data(df_train: pd.DataFrame, df_test: pd.DataFrame, 
                  sup: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Build the supplier-specific train and test frames, indexed by purchase.

    Steps:
      1. collect the reg_code values of the supplier's rows in df_train;
      2. keep only train/test rows with one of those reg_code values
         (the whole df_test is used when no test row matches);
      3. drop the columns listed in train['drop_columns'], drop duplicate
         rows and index both frames by 'purchase';
      4. remove from train the purchases listed in df_submission[sup];
      5. remove from test every purchase that is still present in train.

    Besides its arguments the function reads two module-level names:
    `train` (for 'drop_columns') and `df_submission`.

    Parameters:
    -----------
    df_train: pd.DataFrame
    The train DataFrame
    df_test: pd.DataFrame
    The test DataFrame
    sup: str
    Supplier identifier; compared against df_train['supplier'] and used as a
    key into df_submission

    Returns:
    --------
    Tuple[pd.DataFrame, pd.DataFrame]
    The filtered train and test DataFrames, in that order.
    """
    supplier_rows = df_train.loc[df_train['supplier'] == sup]
    supplier_reg_codes = supplier_rows['reg_code'].unique()

    # restrict both frames to the supplier's reg_code values
    train_part = df_train[df_train['reg_code'].isin(supplier_reg_codes)]
    test_part = df_test[df_test['reg_code'].isin(supplier_reg_codes)]

    # nothing matched in test: fall back to the complete test frame
    if test_part.empty:
        test_part = df_test

    # drop the columns the recommender does not use, then duplicate rows
    unused_columns = train['drop_columns']
    train_part = (train_part
                  .drop(columns=unused_columns)
                  .drop_duplicates()
                  .set_index('purchase'))
    test_part = (test_part
                 .drop(columns=unused_columns)
                 .drop_duplicates()
                 .set_index('purchase'))

    # purchases listed for this supplier in df_submission must not stay in train
    listed_in_train = set(df_submission[sup]).intersection(train_part.index)
    train_part = train_part.drop(listed_in_train)

    # test keeps only purchases that are absent from the remaining train part
    still_in_train = test_part.index.isin(train_part.index)
    test_part = test_part[~still_in_train]

    return train_part, test_part

In [20]:
def train_lgbm(df_tr: pd.DataFrame, df_t: pd.DataFrame, sup, **kwargs) -> pd.DataFrame:
    """
    Fits a LightGBM classifier on the training set and evaluates it on the test set.

    Args:
        df_tr (pandas.DataFrame): Training set; must contain a 'target' column.
        df_t (pandas.DataFrame): Test set; must contain 'target' and every
            feature column of df_tr.
        sup: Label stored in the 'model' column of the returned metrics.
        **kwargs: Extra keyword arguments forwarded to LGBMClassifier
            (class_weight='balanced' and n_jobs=-1 are always set).

    Returns:
        pandas.DataFrame: One-row frame of metrics produced by get_metrics.

    Raises:
        KeyError: If 'target' is missing from df_tr.
    """
    # Pick the features by name instead of by position: with columns[:-1] the
    # label would leak into the features whenever 'target' is not the last
    # column. For frames where it is last the selection is unchanged.
    feature_columns = df_tr.columns.drop('target')

    x_train, y_train = df_tr[feature_columns], df_tr['target']
    x_test, y_test = df_t[feature_columns], df_t['target']

    model = LGBMClassifier(class_weight='balanced',
                           n_jobs=-1,
                           **kwargs)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    y_score = model.predict_proba(x_test)

    return get_metrics(y_test, y_pred, y_score, name=sup)

In [21]:
# Baseline: one default-parameter model per supplier (first 100 entries of
# df_submission). Per-supplier metric rows are collected in a list and
# concatenated once, instead of re-copying a growing DataFrame every iteration.
baseline_rows = []

for sup in tqdm(df_submission.index[:100]):

    # supplier-specific train and test sets
    df_sup_train, df_sup_test = supplier_data(df_train, df_test, sup)

    # labels: train -> purchase appears among the supplier's rows in df_train,
    #         test  -> purchase is listed in df_submission[sup]
    supplier_purchases = df_train.loc[df_train['supplier'] == sup, 'purchase'].unique()
    df_sup_train['target'] = df_sup_train.index.isin(supplier_purchases).astype(int)
    df_sup_test['target'] = df_sup_test.index.isin(df_submission[sup]).astype(int)

    baseline_rows.append(
        train_lgbm(df_sup_train, df_sup_test, sup, random_state=train['random_state']))

# an empty list cannot be concatenated, so fall back to an empty frame
base_metrics = (pd.concat(baseline_rows, ignore_index=True)
                if baseline_rows else pd.DataFrame())

  0%|          | 0/100 [00:00<?, ?it/s]

In [22]:
# Use the supplier label as the row index so only numeric metric columns remain
base_metrics = base_metrics.set_index(keys='model')

In [28]:
# Column-wise average of the baseline metrics over all evaluated suppliers
base_metrics.mean(axis=0)

Accuracy     0.914658
ROC_AUC      0.920691
Precision    0.507862
Recall       0.601296
f1           0.511545
Logloss      0.209811
dtype: float64

In [62]:
# Keep the averaged baseline metrics under the 'basic_metrics' key
baseline_summary = base_metrics.mean()
recommender_metrics['basic_metrics'] = baseline_summary.to_dict()

# Tune params

In [24]:
def objective(trial: optuna.Trial, x: pd.DataFrame, y: pd.Series, **kwargs) -> float:
    """
    Optuna objective for tuning a LightGBM binary classifier.

    The hyperparameters suggested by the trial are evaluated with stratified
    K-fold cross-validation (train['N_FOLDS'] folds); the returned value is the
    mean validation ROC AUC over the folds.

    Args:
        trial (optuna.Trial): A trial corresponding to a set of hyperparameters.
        x (pd.DataFrame): The features to be used for training and validation.
        y (pd.Series): The target variable for training and validation.
        **kwargs: Must contain 'learning_rate', which is used as the fixed
            learning rate of every trial. Other keys are not read here; the
            seed comes from the module-level train['random_state'].

    Returns:
        float: The mean of the cross-validation AUC-ROC scores for the given set of hyperparameters.

    Raises:
        KeyError: If 'learning_rate' is missing from kwargs.
    """
    params = {
        # single-choice categoricals: fixed values that are still recorded in
        # study.best_params together with the tuned ones
        'n_estimators': trial.suggest_categorical('n_estimators', [400]),
        'learning_rate': trial.suggest_categorical('learning_rate', [kwargs['learning_rate']]),
        'max_bin': trial.suggest_int('max_bin', 10, 120, step=10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 500, step=20),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 100, 5000, step=100),
        'lambda_l1': trial.suggest_int('lambda_l1', 0, 100),
        'lambda_l2': trial.suggest_int('lambda_l2', 0, 100),
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.3, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.3, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 2, 6),
        'extra_trees': trial.suggest_categorical('extra_trees', [True, False]),
        'objective':'binary',
        'metric': 'auc',
        'random_state': train['random_state'],
    }

    cv = StratifiedKFold(n_splits=train['N_FOLDS'], shuffle=True, random_state=train['random_state'])
    fold_scores = []

    for train_idx, test_idx in cv.split(x, y):
        x_train_, x_val_ = x.iloc[train_idx], x.iloc[test_idx]
        y_train_, y_val_ = y.iloc[train_idx], y.iloc[test_idx]

        # lets optuna stop unpromising trials based on the validation AUC
        pruning = optuna.integration.LightGBMPruningCallback(trial, 'auc')

        model = LGBMClassifier(
            class_weight='balanced',
            n_jobs=-1,
            **params
        )
        # NOTE(review): early_stopping_rounds / verbose as fit() keywords are
        # not accepted by LightGBM >= 4.0 - confirm the pinned version.
        model.fit(x_train_, y_train_,
                  eval_metric='auc',
                  eval_set=[(x_val_, y_val_)],
                  early_stopping_rounds=100,
                  callbacks=[pruning],
                  verbose=-1)

        # Only the class-1 probabilities are needed for ROC AUC; the hard
        # predictions that used to be computed here were never read.
        y_proba = model.predict_proba(x_val_)[:, 1]
        fold_scores.append(roc_auc_score(y_val_, y_proba))

    return np.mean(fold_scores)

In [25]:
def tune_model(df_train: pd.DataFrame, df_test: pd.DataFrame, 
               df_submission: pd.Series, sup, n_trials: int = 50, **kwargs) -> dict:
    """
    Tune LightGBM hyperparameters for one supplier with an Optuna study.

    Args:
        df_train (pd.DataFrame): Full training frame.
        df_test (pd.DataFrame): Full test frame.
        df_submission (pd.Series): Kept for interface compatibility; it is not
            read in this function (supplier_data uses the module-level
            df_submission).
        sup: Supplier identifier.
        n_trials (int): Number of Optuna trials (default 50, the value that
            used to be hard-coded).
        **kwargs: Forwarded to `objective`, which requires 'learning_rate'.
            'random_state', when given, also seeds the Optuna sampler.

    Returns:
        dict: study.best_params - the parameter values of the best trial.
    """
    # only the supplier's training part is needed for cross-validated tuning
    df_sup_train, _ = supplier_data(df_train, df_test, sup)

    # label: 1 when the purchase appears among the supplier's rows in df_train
    supplier_purchases = df_train.loc[df_train['supplier'] == sup, 'purchase'].unique()
    df_sup_train['target'] = df_sup_train.index.isin(supplier_purchases).astype(int)

    # split into features and label
    x_train = df_sup_train.drop(columns='target')
    y_train = df_sup_train['target']

    # TPE is the sampler optuna uses by default; it is created explicitly only
    # to pass the seed (seed=None keeps the previous unseeded behaviour).
    # NOTE(review): with n_jobs=-1 trials run in parallel, so results may still
    # differ between runs - confirm whether sequential tuning is wanted.
    sampler = optuna.samplers.TPESampler(seed=kwargs.get('random_state'))
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(lambda trial: objective(trial, x_train, y_train, **kwargs),
                   n_trials=n_trials, n_jobs=-1)

    return study.best_params

In [130]:
# Re-tune each of the first 100 suppliers, keeping its stored learning rate fixed
for sup in tqdm(df_submission.index[:100]):
    seed = train['random_state']
    stored_learning_rate = recommender_params[sup]['learning_rate']

    recommender_params[sup] = tune_model(
        df_train, df_test, df_submission, sup,
        random_state=seed,
        learning_rate=stored_learning_rate,
    )

  0%|          | 0/100 [00:00<?, ?it/s]




KeyboardInterrupt



In [34]:
# Write the (re)tuned per-supplier parameters back to the params file
save_file(file_path=train['params'], data=recommender_params)

# Best params

In [30]:
# Evaluate the tuned parameters: one model per supplier, test metrics collected
# in a list and concatenated once (no repeated copying of a growing frame).
tuned_rows = []

for sup in tqdm(df_submission.index[:100]):

    # supplier-specific datasets
    df_sup_train, df_sup_test = supplier_data(df_train, df_test, sup)

    # labels: train -> purchase appears among the supplier's rows in df_train,
    #         test  -> purchase is listed in df_submission[sup]
    supplier_purchases = df_train.loc[df_train['supplier'] == sup, 'purchase'].unique()
    df_sup_train['target'] = df_sup_train.index.isin(supplier_purchases).astype(int)
    df_sup_test['target'] = df_sup_test.index.isin(df_submission[sup]).astype(int)

    tuned_rows.append(train_lgbm(df_sup_train, df_sup_test, sup,
                                 **recommender_params[sup]))

# an empty list cannot be concatenated, so fall back to an empty frame
metrics = (pd.concat(tuned_rows, ignore_index=True)
           if tuned_rows else pd.DataFrame())
# rebinding instead of inplace=True keeps the step explicit
metrics = metrics.set_index('model')

  0%|          | 0/100 [00:00<?, ?it/s]





In [33]:
# Column-wise average of the tuned metrics. np.mean(DataFrame) forwards
# axis=None, which reduces over both axes (a single scalar) on pandas >= 2.0;
# DataFrame.mean() always returns the per-column Series.
metrics.mean()

Accuracy     0.916946
ROC_AUC      0.923602
Precision    0.562924
Recall       0.575114
f1           0.508384
Logloss      0.263665
dtype: float64

In [72]:
# Keep the averaged tuned metrics under the 'best_metrics' key.
# DataFrame.mean() is used instead of np.mean(metrics): the latter returns a
# scalar on pandas >= 2.0, which has no .to_dict().
recommender_metrics['best_metrics'] = metrics.mean().to_dict()

In [74]:
# Save the metrics as YAML to the path given by train['metrics']
save_file(train['metrics'], recommender_metrics)

## Save models

In [35]:
# Fitted classifiers, keyed by supplier
models = dict()

In [36]:
# Fit the final model of every supplier with its stored parameters
for sup in tqdm(df_submission.index[:100]):

    # supplier-specific datasets (only the training part is used below)
    df_sup_train, df_sup_test = supplier_data(df_train, df_test, sup)

    # label: 1 when the purchase appears among the supplier's rows in df_train
    supplier_purchases = df_train[df_train['supplier'] == sup]['purchase'].unique()
    df_sup_train['target'] = df_sup_train.index.isin(supplier_purchases).astype(int)

    # 'target' was appended last, so every column before it is a feature
    feature_names = df_sup_train.columns[:-1]
    x_train, y_train = df_sup_train[feature_names], df_sup_train['target']

    supplier_params = recommender_params[sup]
    model = LGBMClassifier(class_weight='balanced', n_jobs=-1, **supplier_params)
    model.fit(x_train, y_train)

    models[sup] = model

# Persist the whole {supplier: model} mapping in a single joblib file
joblib.dump(models, train['models'])

  0%|          | 0/100 [00:00<?, ?it/s]

['/home/sergey/projects/zakupki/models/models/recommender_models.joblib']