In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, \
                            recall_score, f1_score, log_loss, precision_recall_curve


from lightgbm import LGBMClassifier

import optuna

from typing import Tuple, Set

import yaml

import joblib

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Disable pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.options.mode.chained_assignment = None
# Only show optuna log messages of WARNING level and above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [3]:
def extract_purchase(string):
    """Parse a bracketed list of integer ids given as text.

    The first and last characters (the brackets) are discarded; the rest is
    split on commas and/or whitespace and every token is converted to ``int``.
    """
    inner = string[1:-1]
    tokens = inner.replace(',', ' ').split()
    return [int(token) for token in tokens]

In [4]:
def extract_vector(string):
    """Parse a bracketed, whitespace-separated vector given as text.

    The first and last characters (the brackets) are discarded and every
    remaining token is converted to ``float``.
    """
    return [float(token) for token in string[1:-1].split()]

In [5]:
def get_metrics(y_test, y_pred, y_score, name = "Default"):
    """Build a one-row DataFrame with binary-classification metrics.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        y_score: Two-dimensional array of class scores; column 1 is used for
            ROC AUC and the whole array for log loss.
        name: Value stored in the ``model`` column.

    Returns:
        pandas.DataFrame: A single row with the columns ``model``,
        ``Accuracy``, ``ROC_AUC``, ``Precision``, ``Recall``, ``f1`` and
        ``Logloss``. If any score-based metric raises ``ValueError``, all
        metrics except ``Accuracy`` are reported as 0.
    """
    row = {'model': name, 'Accuracy': accuracy_score(y_test, y_pred)}

    try:
        scores = {
            'ROC_AUC': roc_auc_score(y_test, y_score[:, 1]),
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'f1': f1_score(y_test, y_pred, zero_division=0),
            'Logloss': log_loss(y_test, y_score),
        }
    except ValueError:
        # One failing metric zeroes the whole group, including any value
        # that had already been computed.
        scores = dict.fromkeys(
            ('ROC_AUC', 'Precision', 'Recall', 'f1', 'Logloss'), 0)

    row.update(scores)
    return pd.DataFrame([row])

In [6]:
def open_file(file_path):
    """Read ``file_path`` and return its content parsed with ``yaml.safe_load``."""
    with open(file_path, 'r') as stream:
        content = yaml.safe_load(stream)
    return content
    
def save_file(file_path, data):
    """Serialize ``data`` with ``yaml.dump`` into ``file_path`` (opened in 'w' mode)."""
    with open(file_path, 'w') as stream:
        yaml.dump(data, stream=stream)

In [63]:
config_path = "../config/params.yaml"

# Read the project config inside a context manager so the file handle is
# closed deterministically (a bare open(...) passed to yaml.load is never
# closed explicitly).
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Config sections used throughout the notebook.
preproc = config["preprocessing"]
train = config["train"]

In [64]:
# Per-supplier model parameters, read from the YAML file named in the train config.
recommender_params = open_file(train['recommender_params'])

# Baseline

In [9]:
# Training data, indexed by the CSV's 'index' column.
df_train = pd.read_csv(preproc['train_data']).set_index('index')

df_train.head()

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized_tokens,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,248042,0,60706.6,3513,15708,0,[ 1.59887652e-02 2.36658968e-02 -4.08423709e-...,11,21.2_33,6,0.0,3
1,569110,1,42138.0,6570,15708,1,[ 0.07180544 0.01209602 0.00167402 0.044857...,11,20.5_53,3,0.0,3
2,254021,1,65000.1,3377,15708,0,[ 0.02361675 0.01336928 -0.00032915 -0.005782...,5,21.2_33,6,0.0,3
3,447931,1,341584.8,9732,15708,0,[ 3.30867594e-02 4.09576579e-02 -5.67706811e-...,3,21.2_77,5,0.0,3
4,395694,1,133452.0,9589,15708,0,[ 0.10526875 0.11765645 -0.01359718 -0.009087...,9,21.2_77,2,0.0,3


In [10]:
# Test data, indexed by the CSV's 'index' column.
df_test = pd.read_csv(preproc['test_data']).set_index('index')

df_test.head()

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized_tokens,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,63456,0,290000.0,6151,4841,1,[ 1.48227703e-02 2.26276631e-02 1.17342828e-...,7,10.8_45,1,0.0,2
1,41232,0,160044.4,3932,560,1,[ 5.70505795e-02 -1.79046954e-04 2.02757507e-...,8,19.2_34,1,1.0,2
2,120554,0,145530.0,9076,43003,0,[ 8.90472124e-02 2.44880769e-01 -6.71891250e-...,6,drug_92,2,0.0,4
3,120554,0,145530.0,9076,3109,1,[ 8.90472124e-02 2.44880769e-01 -6.71891250e-...,6,drug_92,2,1.0,4
4,594728,1,369175.6,9582,3796,1,[ 1.12817446e-01 1.13674459e-01 -2.37492026e-...,1,10.8_77,1,1.0,5


In [11]:
# Submission data, indexed by the CSV's 'index' column.
df_submission = pd.read_csv(preproc['submission']).set_index('index')

df_submission.head()

Unnamed: 0_level_0,purchases
index,Unnamed: 1_level_1
1,"[148757, 151442, 1729, 2435]"
2,"[20, 84, 5, 40, 141478]"
3,"[157339, 146909, 439, 535081, 145879, 163157, ..."
4,"[145658, 2500, 119, 167738, 147017, 535911, 63..."
5,"[151376, 149822, 2044, 2211, 165407, 145868, 1..."


In [12]:
# Parse every stringified purchase list into a list of ints.
# NOTE: this rebinds df_submission from a DataFrame to the parsed 'purchases'
# Series, so the cell cannot be re-run without reloading the submission file.
df_submission = df_submission['purchases'].apply(extract_purchase)

In [13]:
# Turn the stringified vector of every row into a list of floats.
df_train['vectorized_tokens'] = df_train['vectorized_tokens'].map(extract_vector)
df_test['vectorized_tokens'] = df_test['vectorized_tokens'].map(extract_vector)

In [14]:
# Apply the column -> dtype mapping from the preprocessing config to both frames.
column_types = preproc['change_type_columns']
df_train = df_train.astype(column_types)
df_test = df_test.astype(column_types)

In [15]:
def _expand_token_columns(df, n_features=100):
    """Return ``df`` with the first ``n_features`` vector components as columns.

    Components of ``df['vectorized_tokens']`` are written to columns named
    '0' ... str(n_features - 1), appended after the existing columns. All
    columns are built in one pass instead of inserting them one by one, which
    avoids ``n_features`` separate ``apply`` scans and a fragmented frame.

    Raises:
        IndexError: If any vector has fewer than ``n_features`` components
            (the per-component lookup ``x[i]`` would fail the same way).
    """
    lengths = df['vectorized_tokens'].map(len)
    if (lengths < n_features).any():
        raise IndexError(
            f"every vector must have at least {n_features} components")

    tokens = pd.DataFrame(df['vectorized_tokens'].tolist(), index=df.index)
    # Keep exactly the first n_features components (also covers an empty frame).
    tokens = tokens.reindex(columns=range(n_features))
    tokens.columns = tokens.columns.map(str)

    # Drop stale component columns first so that re-running the cell overwrites
    # them instead of producing duplicated column names.
    base = df.drop(columns=tokens.columns, errors='ignore')
    return pd.concat([base, tokens], axis=1)


df_train = _expand_token_columns(df_train)
df_test = _expand_token_columns(df_test)

  0%|          | 0/100 [00:00<?, ?it/s]

In [16]:
df_train.head()

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized_tokens,month,reg_code,purchase_size,...,90,91,92,93,94,95,96,97,98,99
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,248042,0,60706.6,3513,15708,0,"[0.0159887652, 0.0236658968, -0.00408423709, 0...",11,21.2_33,6,...,0.000632,0.000724,0.00189,-0.011886,0.005517,-0.001968,-0.006715,0.006948,0.016332,0.007574
1,569110,1,42138.0,6570,15708,1,"[0.07180544, 0.01209602, 0.00167402, 0.0448579...",11,20.5_53,3,...,-0.02485,0.015296,-0.018323,0.002229,0.001943,0.025595,-0.020818,-0.03007,0.020462,0.004984
2,254021,1,65000.1,3377,15708,0,"[0.02361675, 0.01336928, -0.00032915, -0.00578...",5,21.2_33,6,...,-0.01714,-0.039204,-0.025173,-0.019795,-0.022008,0.021673,-0.001356,-0.029046,0.039415,0.022089
3,447931,1,341584.8,9732,15708,0,"[0.0330867594, 0.0409576579, -0.00567706811, 0...",3,21.2_77,5,...,0.015603,-0.007513,0.002328,-0.052313,0.048505,0.00519,-0.069433,-0.027423,0.06171,-0.017328
4,395694,1,133452.0,9589,15708,0,"[0.10526875, 0.11765645, -0.01359718, -0.00908...",9,21.2_77,2,...,0.012777,0.013308,0.012679,0.005998,-0.004233,-0.018031,0.002359,0.002426,-0.03764,-0.029648


In [17]:
df_test.head()

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized_tokens,month,reg_code,purchase_size,...,90,91,92,93,94,95,96,97,98,99
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,63456,0,290000.0,6151,4841,1,"[0.0148227703, 0.0226276631, 0.00117342828, 0....",7,10.8_45,1,...,-0.0031,-0.004375,-0.009155,0.005988,0.001251,-0.000401,-0.004618,-0.003855,0.000981,0.009047
1,41232,0,160044.4,3932,560,1,"[0.0570505795, -0.000179046954, 0.0202757507, ...",8,19.2_34,1,...,-0.002678,0.006296,-0.007461,-0.016408,0.011742,-0.001584,0.009195,-0.012561,0.002538,0.007157
2,120554,0,145530.0,9076,43003,0,"[0.0890472124, 0.244880769, -0.067189125, 0.01...",6,drug_92,2,...,-0.003414,0.024904,0.015252,0.004285,-0.002967,-0.009345,-0.000871,0.000271,0.013623,-0.010615
3,120554,0,145530.0,9076,3109,1,"[0.0890472124, 0.244880769, -0.067189125, 0.01...",6,drug_92,2,...,-0.003414,0.024904,0.015252,0.004285,-0.002967,-0.009345,-0.000871,0.000271,0.013623,-0.010615
4,594728,1,369175.6,9582,3796,1,"[0.112817446, 0.113674459, -0.0237492026, 0.00...",1,10.8_77,1,...,0.012659,0.013646,0.006325,0.017674,-0.023067,-0.02115,-0.027321,-0.004798,-0.028115,-0.009645


In [21]:
def supplier_data(df_train, df_test, sup):
    """Build the per-supplier train and test frames for the recommender.

    Both frames are restricted to the ``reg_code`` values seen for ``sup`` in
    ``df_train``; if that leaves the test frame empty, the full ``df_test`` is
    used instead. The columns listed in ``train['drop_columns_recommender']``
    are dropped, duplicates are removed and ``purchase`` becomes the index.

    Reads the notebook-level ``train`` config and ``df_submission``.

    Returns:
        tuple: ``(df_sup_train, df_sup_test)``.
    """
    supplier_rows = df_train[df_train['supplier'] == sup]
    supplier_reg_codes = supplier_rows['reg_code'].unique()

    # Keep only rows whose reg_code the supplier has appeared with in train.
    df_sup_train = df_train[df_train['reg_code'].isin(supplier_reg_codes)]
    df_sup_test = df_test[df_test['reg_code'].isin(supplier_reg_codes)]

    # No matching test rows: fall back to the whole test set.
    if df_sup_test.empty:
        df_sup_test = df_test

    unused_columns = train['drop_columns_recommender']

    def _prepare(frame):
        # Drop columns the recommender does not use, de-duplicate and index
        # the rows by purchase.
        deduplicated = frame.drop(columns=unused_columns).drop_duplicates()
        return deduplicated.set_index('purchase')

    df_sup_train = _prepare(df_sup_train)
    df_sup_test = _prepare(df_sup_test)

    # Remove from train the purchases listed for this supplier in the
    # submission, then remove from test everything still present in train.
    submitted_in_train = set(df_submission[sup]).intersection(df_sup_train.index)
    df_sup_train = df_sup_train.drop(submitted_in_train)

    still_in_train = df_sup_test.index.isin(df_sup_train.index)
    df_sup_test = df_sup_test[~still_in_train]

    return df_sup_train, df_sup_test

In [19]:
def train_lgbm(df_tr: pd.DataFrame, df_t: pd.DataFrame, sup, **kwargs) -> pd.DataFrame:
    """
    Trains a LightGBM classifier on the training set and scores it on the test set.

    Args:
        df_tr (pandas.DataFrame): Training set; must contain a 'target' column.
        df_t (pandas.DataFrame): Test set; must contain a 'target' column and
            every feature column of ``df_tr``.
        sup: Supplier identifier, stored as the model name in the metrics row.
        **kwargs: Extra keyword arguments forwarded to ``LGBMClassifier``.

    Returns:
        pandas.DataFrame: One-row metrics frame produced by ``get_metrics``.

    """

    # Select features by name rather than by position: with ``columns[:-1]``
    # a 'target' column that is not the last one would silently be used as a
    # feature (and a real feature would be dropped instead).
    feature_columns = df_tr.columns.drop('target')

    x_train = df_tr[feature_columns]
    y_train = df_tr['target']

    x_test = df_t[feature_columns]
    y_test = df_t['target']

    model = LGBMClassifier(random_state=train['random_state'],
                           class_weight='balanced',
                           n_jobs=-1,
                           **kwargs)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    y_score = model.predict_proba(x_test)

    return get_metrics(y_test, y_pred, y_score, name=sup)

In [22]:
# Collect one metrics row per supplier and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop copies it on every
# iteration.
baseline_rows = []

for sup in tqdm(df_submission.index[:500]):

    # train / test frames for this supplier
    df_sup_train, df_sup_test = supplier_data(df_train, df_test, sup)

    # labels for the classifier: 1 if the purchase belongs to the supplier
    supplier_purchases = df_train[df_train['supplier'] == sup]['purchase'].unique()
    df_sup_train['target'] = df_sup_train.index.isin(supplier_purchases).astype(int)
    df_sup_test['target'] = df_sup_test.index.isin(df_submission[sup]).astype(int)

    baseline_rows.append(train_lgbm(df_sup_train, df_sup_test, sup))


base_metrics = pd.concat(baseline_rows, ignore_index=True).set_index('model')
base_metrics.mean(axis=0)

  0%|          | 0/500 [00:00<?, ?it/s]

Accuracy     0.948149
ROC_AUC      0.802157
Precision    0.493933
Recall       0.264286
f1           0.318773
Logloss      0.235905
dtype: float64

# Tune params

In [24]:
def objective(trial: optuna.Trial, x: pd.DataFrame, y: pd.Series, **kwargs) -> float:
    """
    Objective function for an Optuna study that tunes the hyperparameters of a
    LightGBM binary classification model.

    Args:
        trial (optuna.Trial): A trial corresponding to a set of hyperparameters.
        x (pd.DataFrame): The features to be used for training and validation.
        y (pd.Series): The target variable for training and validation.
        **kwargs: May contain 'n_estimators' and 'learning_rate'; these are
            held fixed during the search. When absent, the LGBMClassifier
            defaults (100 and 0.1) are used.

    Returns:
        float: The mean of the cross-validation AUC-ROC scores for the given set of hyperparameters.
    """
    # Fall back to the LGBMClassifier defaults so that an empty parameter
    # dict (a supplier without tuned parameters) does not raise KeyError.
    n_estimators = kwargs.get('n_estimators', 100)
    learning_rate = kwargs.get('learning_rate', 0.1)

    params = {
        # Single-choice categoricals: the values are fixed, but they are still
        # recorded as trial parameters and so end up in study.best_params.
        'n_estimators': trial.suggest_categorical('n_estimators', [n_estimators]),
        'learning_rate': trial.suggest_categorical('learning_rate', [learning_rate]),
        # LightGBM requires max_bin > 1, so the range must not start at 0.
        'max_bin': trial.suggest_int('max_bin', 10, 120, step=10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 500, step=20),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 100, 5000, step=100),
        'lambda_l1': trial.suggest_int('lambda_l1', 0, 100),
        'lambda_l2': trial.suggest_int('lambda_l2', 0, 100),
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.3, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.3, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 2, 6),
        'extra_trees': trial.suggest_categorical('extra_trees', [True, False]),
        'objective':'binary',
        'metric': 'auc',
        'random_state': train['random_state'],
    }

    cv_pred = np.empty(train['N_FOLDS'])
    cv = StratifiedKFold(n_splits=train['N_FOLDS'], shuffle=True, random_state=train['random_state'])

    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):
        x_train_, x_val_ = x.iloc[train_idx], x.iloc[test_idx]
        y_train_, y_val_ = y.iloc[train_idx], y.iloc[test_idx]

        # Lets Optuna prune the trial based on the validation AUC.
        pruning = optuna.integration.LightGBMPruningCallback(trial, 'auc')

        model = LGBMClassifier(
            class_weight='balanced',
            n_jobs=-1,
            **params
        )
        # NOTE(review): the `early_stopping_rounds` and `verbose` arguments of
        # fit() were removed in LightGBM 4.0; newer versions need the
        # lightgbm.early_stopping / lightgbm.log_evaluation callbacks instead.
        model.fit(x_train_, y_train_,
                  eval_metric='auc',
                  eval_set=[(x_val_, y_val_)],
                  early_stopping_rounds=100,
                  callbacks=[pruning],
                  verbose=-1)

        y_proba = model.predict_proba(x_val_)[:, 1]

        cv_pred[fold] = roc_auc_score(y_val_, y_proba)

    return (np.mean(cv_pred))

In [25]:
def tune_model(df_train: pd.DataFrame, df_test: pd.DataFrame, 
               df_submission: pd.DataFrame, sup, **kwargs) -> dict:
    """
    Tunes the LightGBM hyperparameters for one supplier with Optuna.

    Args:
        df_train (pd.DataFrame): Full training data.
        df_test (pd.DataFrame): Full test data.
        df_submission: Not used inside this function (``supplier_data`` reads
            the notebook-level ``df_submission``); kept so existing calls
            keep working.
        sup: Supplier identifier.
        **kwargs: Forwarded to ``objective``.

    Returns:
        dict: ``study.best_params`` of the finished study.
    """
    # Training frame for this supplier; the test frame is not needed for tuning.
    df_sup_train, _ = supplier_data(df_train, df_test, sup)

    # Class labels: 1 if the purchase belongs to this supplier in the train data.
    supplier_purchases = df_train[df_train['supplier'] == sup]['purchase'].unique()
    df_sup_train['target'] = df_sup_train.index.isin(supplier_purchases).astype(int)

    x_train = df_sup_train.drop(columns='target')
    y_train = df_sup_train['target']

    func = lambda trial: objective(trial, x_train, y_train, **kwargs)

    study = optuna.create_study(direction="maximize")
    study.optimize(func, n_trials=10, n_jobs=-1)

    return study.best_params

In [None]:
for sup in tqdm(df_submission.index[:500]):

    base_auc = base_metrics.loc[sup, 'ROC_AUC']

    if base_auc in (0, 1):
        # Nothing to tune: the baseline AUC is already perfect, or it is 0
        # (the value get_metrics reports when the metrics could not be
        # computed). Keep the default parameters.
        recommender_params[sup] = {}
    else:
        recommender_params[sup] = tune_model(df_train, df_test, df_submission,
                                             sup, **recommender_params[sup])

In [None]:
# Write the per-supplier parameters to the YAML file named in the train config.
save_file(train['recommender_params'], recommender_params)

In [61]:
# Suppliers whose ROC_AUC in `metrics` is lower than their baseline ROC_AUC.
# NOTE(review): `metrics` is only assigned in the "Best params" section further
# down, so on a fresh top-to-bottom run this cell raises NameError.
untuned_models = metrics[metrics['ROC_AUC'] < base_metrics['ROC_AUC']].index

In [None]:
for sup in tqdm(untuned_models):

    # Re-tune starting from the parameters currently stored for the supplier.
    # The original passed `**params[sup]`, but no notebook-level `params`
    # exists; `recommender_params` is the per-supplier dict used by the other
    # tuning loop.
    recommender_params[sup] = tune_model(df_train, df_test, df_submission, 
                                         sup, **recommender_params[sup])

По некоторым поставщикам недостаточно данных, из-за чего подобрать параметры на валидации затруднительно. Поэтому для тех поставщиков, у которых после подбора параметров метрики на test ухудшились, оставим baseline для того, чтобы оценить качество оптимизации для других.

In [72]:
for sup in tqdm(metrics.index):

    tuned_auc = metrics.loc[sup, 'ROC_AUC']
    baseline_auc = base_metrics.loc[sup, 'ROC_AUC']

    # Tuning made the test ROC_AUC worse: reset to an empty parameter dict.
    if tuned_auc < baseline_auc:
        recommender_params[sup] = {}

  0%|          | 0/500 [00:00<?, ?it/s]

In [79]:
# Write the per-supplier parameters to the YAML file named in the train config.
save_file(train['recommender_params'], recommender_params)

# Best params

In [31]:
# Collect one metrics row per supplier and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop copies it on every
# iteration.
tuned_rows = []

for sup in tqdm(df_submission.index[:500]):

    # train / test frames for this supplier
    df_sup_train, df_sup_test = supplier_data(df_train, df_test, sup)

    # labels for the classifier: 1 if the purchase belongs to the supplier
    supplier_purchases = df_train[df_train['supplier'] == sup]['purchase'].unique()
    df_sup_train['target'] = df_sup_train.index.isin(supplier_purchases).astype(int)
    df_sup_test['target'] = df_sup_test.index.isin(df_submission[sup]).astype(int)

    tuned_rows.append(train_lgbm(df_sup_train, df_sup_test, sup,
                                 **recommender_params[sup]))

metrics = pd.concat(tuned_rows, ignore_index=True).set_index('model')

  0%|          | 0/500 [00:00<?, ?it/s]





In [32]:
# Per-metric mean over suppliers. `np.mean(DataFrame)` is pandas-version
# dependent (on pandas >= 2.0 it reduces over both axes to a single scalar),
# so request the column-wise mean explicitly.
metrics.mean(axis=0)

Accuracy     0.917486
ROC_AUC      0.823923
Precision    0.409688
Recall       0.503448
f1           0.374905
Logloss      0.240113
dtype: float64

In [59]:
# Fitted classifiers keyed by supplier id; filled by the training loop below.
models = {}

In [62]:
for sup in tqdm(df_submission.index[:500]):

    # Only the training frame is needed to fit the final model.
    df_sup_train = supplier_data(df_train, df_test, sup)[0]

    # labels for the classifier: 1 if the purchase belongs to the supplier
    supplier_purchases = df_train[df_train['supplier'] == sup]['purchase'].unique()
    df_sup_train['target'] = df_sup_train.index.isin(supplier_purchases).astype(int)

    # Select features by name instead of relying on 'target' being the last column.
    x_train = df_sup_train.drop(columns='target')
    y_train = df_sup_train['target']

    model = LGBMClassifier(random_state=train['random_state'],
                           class_weight='balanced',
                           n_jobs=-1,
                           **recommender_params[sup])
    model.fit(x_train, y_train)

    models[sup] = model

# Persist all fitted models to the path named in the train config.
joblib.dump(models, train['recommender_models'])

  0%|          | 0/500 [00:00<?, ?it/s]



['../config/recommender_models.joblib']