In [None]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, \
                            recall_score, f1_score, log_loss

from lightgbm import LGBMClassifier

import optuna

from typing import Tuple, Set

import yaml

import joblib

import re

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
# Only show Optuna log messages of level WARNING and above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
def extract_purchases(string):
    """Return every single-quoted run of digits found in ``string`` as a list of ints."""
    quoted_numbers = re.findall(r"'(\d+)'", string)
    return [int(digits) for digits in quoted_numbers]

In [None]:
def extract_vector(string):
    """Parse a whitespace-separated list of numbers into a list of floats.

    The first and the last character of ``string`` are always discarded
    (the code treats them as the enclosing delimiters) before splitting.
    """
    inner = string[1:-1]
    return [float(token) for token in inner.split()]

In [None]:
def _metric_or_zero(metric, *args, **kwargs):
    """Evaluate ``metric(*args, **kwargs)``; return 0 if it raises ``ValueError``."""
    try:
        return metric(*args, **kwargs)
    except ValueError:
        return 0


def get_metrics(y_test, y_pred, y_score, name = "Default"):
    """Build a one-row DataFrame with classification metrics.

    Each metric is evaluated independently, so a metric that cannot be
    computed (``ValueError``, e.g. ROC AUC when ``y_test`` holds a single
    class) falls back to 0 without discarding the metrics that can.

    Parameters
    ----------
    y_test
        True labels.
    y_pred
        Predicted labels.
    y_score
        2-D array of predicted class probabilities. Column 1 is used as the
        score for ROC AUC; the whole array is passed to ``log_loss``.
    name
        Value stored in the ``model`` column.

    Returns
    -------
    pd.DataFrame
        A single row with the columns ``model``, ``Accuracy``, ``ROC_AUC``,
        ``Precision``, ``Recall``, ``f1`` and ``Logloss``.
    """
    # NOTE: the 0 fallback is kept for backward compatibility; for Logloss a
    # value of 0 means "undefined" here, not a perfect score.
    return pd.DataFrame({
        'model': [name],
        'Accuracy': accuracy_score(y_test, y_pred),
        'ROC_AUC': _metric_or_zero(roc_auc_score, y_test, y_score[:, 1]),
        'Precision': _metric_or_zero(precision_score, y_test, y_pred, zero_division=0),
        'Recall': _metric_or_zero(recall_score, y_test, y_pred, zero_division=0),
        'f1': _metric_or_zero(f1_score, y_test, y_pred, zero_division=0),
        'Logloss': _metric_or_zero(log_loss, y_test, y_score),
    })

In [None]:
def open_file(file_path):
    """Read the YAML file at ``file_path`` and return its parsed content."""
    with open(file_path, 'r') as stream:
        content = yaml.safe_load(stream)
    return content
    
def save_file(file_path, data):
    """Serialize ``data`` as YAML into ``file_path``, overwriting the file."""
    with open(file_path, 'w') as stream:
        yaml.dump(data, stream)

In [None]:
# Load the notebook configuration. The file is opened in a ``with`` block so
# the handle is closed as soon as parsing is done.
config_path = "../config/params.yaml"
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Sections used throughout the notebook.
preproc = config["preprocessing"]
train = config["train"]['recommender']

In [None]:
# Load the previously stored hyperparameters (indexed by supplier further below)
# from the YAML file named in the config.
recommender_params = open_file(train['params'])

In [None]:
# Collects the averaged metrics that are written to a YAML file later in the notebook.
recommender_metrics = {}

# Baseline

В данном блоке мы строим и обучаем рекомендательную систему, а также подбираем параметры моделей с помощью байесовского оптимизатора.  

Результатом этого блока являются файлы с моделями для каждого поставщика, лучшими параметрами и метриками качества моделей.

In [None]:
# Read the training set; the CSV's 'index' column becomes the row index.
df_train = pd.read_csv(preproc['train_data']).set_index('index')

df_train.head()

In [None]:
# Read the test set; the CSV's 'index' column becomes the row index.
df_test = pd.read_csv(preproc['test_data']).set_index('index')

df_test.head()

In [None]:
# Read the submission file; the CSV's 'index' column becomes the row index.
df_submission = pd.read_csv(preproc['recommend_sub_path']).set_index('index')

df_submission.head()

In [None]:
# Parse the 'purchases' strings into lists of ints.
# NOTE: this rebinds df_submission from a DataFrame to a Series, so the cell is
# not idempotent — re-run the loading cell before running it again.
df_submission = df_submission['purchases'].apply(extract_purchases)

In [None]:
# Feature generation from the token vector: parse the stored string
# representation of each vector into a list of floats.
for frame in (df_train, df_test):
    frame['vectorized'] = frame['vectorized'].apply(extract_vector)

In [None]:
# Cast columns to the dtypes listed in the config.
column_types = preproc['change_type_columns']
df_train = df_train.astype(column_types)
df_test = df_test.astype(column_types)

In [None]:
# Turn the vector into per-object features: component i becomes column str(i).
# Each frame's vectors are materialised as a 2-D array once, instead of running
# a row-wise ``apply`` with a Python lambda for every component.
N_VECTOR_FEATURES = 100  # number of leading vector components used as features

train_components = pd.DataFrame(df_train['vectorized'].tolist()).to_numpy()
test_components = pd.DataFrame(df_test['vectorized'].tolist()).to_numpy()

for i in tqdm(range(N_VECTOR_FEATURES)):
    # Plain arrays are assigned positionally, so no index alignment takes place.
    df_train[str(i)] = train_components[:, i]
    df_test[str(i)] = test_components[:, i]

In [None]:
# Preview the first five rows of the training frame.
df_train[:5]

In [None]:
# Preview the first five rows of the test frame.
df_test[:5]

In [None]:
# Order the entries by the length of their purchase list, longest first.
purchases_per_supplier = df_submission.apply(len)
longest_first = purchases_per_supplier.sort_values(ascending=False).index
df_submission = df_submission.reindex(longest_first)

In [None]:
def supplier_data(df_train: pd.DataFrame, df_test: pd.DataFrame, 
                  sup: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Build the train and test frames used for one supplier.

    Steps, in order:
      1. Collect the ``reg_code`` values of the supplier's rows in ``df_train``
         and keep only the rows of both frames that have one of them.
         If that leaves the test frame empty, the whole ``df_test`` is used.
      2. Drop the columns listed in the notebook-level ``train['drop_columns']``,
         remove duplicate rows and index both frames by ``purchase``.
      3. Remove from train every purchase listed for the supplier in the
         notebook-level ``df_submission``.
      4. Remove from test every purchase that is still present in train.

    Parameters:
    -----------
    df_train: pd.DataFrame
    The train DataFrame
    df_test: pd.DataFrame
    The test DataFrame
    sup: str
    The supplier to build the frames for

    Returns:
    --------
    Tuple[pd.DataFrame, pd.DataFrame]
    The supplier's train and test frames, both indexed by ``purchase``.
    """
    supplier_reg_codes = df_train.loc[df_train['supplier'] == sup, 'reg_code'].unique()

    # keep only the rows that share a reg_code with the supplier
    train_part = df_train[df_train['reg_code'].isin(supplier_reg_codes)]
    test_part = df_test[df_test['reg_code'].isin(supplier_reg_codes)]

    # nothing matched in test: fall back to the complete test frame
    if test_part.empty:
        test_part = df_test

    # drop the configured columns and duplicate rows, then index by purchase
    columns_to_drop = train['drop_columns']
    train_part = (train_part.drop(columns=columns_to_drop)
                            .drop_duplicates()
                            .set_index('purchase'))
    test_part = (test_part.drop(columns=columns_to_drop)
                          .drop_duplicates()
                          .set_index('purchase'))

    # purchases listed for this supplier in the submission must not stay in train
    listed_in_train = set(df_submission[sup]).intersection(train_part.index)
    train_part = train_part.drop(listed_in_train)

    # the test frame keeps only purchases that are absent from train
    test_part = test_part[~test_part.index.isin(train_part.index)]

    return train_part, test_part

In [None]:
def train_lgbm(df_tr: pd.DataFrame, df_t: pd.DataFrame, sup, **kwargs) -> pd.DataFrame:
    """
    Fits a LightGBM classifier on the training frame and scores it on the test frame.

    Args:
        df_tr (pandas.DataFrame): Training data; must contain a 'target' column,
            every other column is used as a feature.
        df_t (pandas.DataFrame): Test data with the same feature columns and a
            'target' column.
        sup: Label stored in the 'model' column of the returned metrics row.
        **kwargs: Extra keyword arguments forwarded to ``LGBMClassifier``.

    Returns:
        pandas.DataFrame: The one-row metrics frame built by ``get_metrics``.

    """
    # Select features by name rather than by position, so the result does not
    # depend on 'target' being the last column of the frame.
    feature_columns = df_tr.columns.drop('target')

    x_train, y_train = df_tr[feature_columns], df_tr['target']
    x_test, y_test = df_t[feature_columns], df_t['target']

    model = LGBMClassifier(class_weight='balanced',
                           n_jobs=-1,
                           **kwargs)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    y_score = model.predict_proba(x_test)

    return get_metrics(y_test, y_pred, y_score, name=sup)

In [None]:
# One metrics row per supplier is collected in a list and concatenated once,
# instead of growing a DataFrame with pd.concat on every iteration.
baseline_rows = []

for sup in tqdm(df_submission.index[:100]):

    # build the train and test frames for this supplier
    df_sup_train, df_sup_test = supplier_data(df_train, df_test, sup)

    # add the class labels: in train, a purchase is positive when it occurs in
    # the supplier's own rows of df_train; in test, when it is listed for the
    # supplier in df_submission
    supplier_purchases = df_train.loc[df_train['supplier'] == sup, 'purchase'].unique()
    df_sup_train['target'] = df_sup_train.index.isin(supplier_purchases).astype(int)
    df_sup_test['target'] = df_sup_test.index.isin(df_submission[sup]).astype(int)

    metric = train_lgbm(df_sup_train, df_sup_test, sup, random_state=train['random_state'])
    baseline_rows.append(metric)

# An empty DataFrame is kept as the result when the loop did not run at all.
base_metrics = pd.concat(baseline_rows, ignore_index=True) if baseline_rows else pd.DataFrame()

In [None]:
# Index the baseline metrics by the 'model' column.
# NOTE: not idempotent — after this runs, 'model' is no longer a column, so
# running the cell a second time is expected to fail.
base_metrics = base_metrics.set_index('model')

In [None]:
# Per-column mean of the baseline metrics.
base_metrics.mean()

In [None]:
# Store the per-column means of the baseline metrics under 'basic_metrics'.
recommender_metrics['basic_metrics'] = base_metrics.mean().to_dict()

# Tune params

In [None]:
def objective(trial: optuna.Trial, x: pd.DataFrame, y: pd.Series, **kwargs) -> float:
    """
    Optuna objective for tuning a LightGBM binary classifier.

    Samples one set of hyperparameters from ``trial``, evaluates it with
    stratified K-fold cross-validation and returns the mean validation ROC AUC.
    The number of folds and the random seed are read from the notebook-level
    ``train`` config (``train['N_FOLDS']`` and ``train['random_state']``).

    Args:
        trial (optuna.Trial): A trial corresponding to a set of hyperparameters.
        x (pd.DataFrame): The features to be used for training and validation.
        y (pd.Series): The target variable for training and validation.
        **kwargs: Must contain ``learning_rate``, which is used as a fixed
            (single-choice) value. Other entries are not read here.

    Returns:
        float: The mean of the cross-validation AUC-ROC scores for the given set of hyperparameters.

    Raises:
        KeyError: If ``learning_rate`` is missing from ``kwargs``.
    """
    # Imported locally: the notebook's import cell only brings in LGBMClassifier.
    from lightgbm import early_stopping

    params = {
        # single-choice categoricals so the values end up in study.best_params
        'n_estimators': trial.suggest_categorical('n_estimators', [400]),
        'learning_rate': trial.suggest_categorical('learning_rate', [kwargs['learning_rate']]),
        'max_bin': trial.suggest_int('max_bin', 10, 120, step=10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 500, step=20),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 100, 5000, step=100),
        'lambda_l1': trial.suggest_int('lambda_l1', 0, 100),
        'lambda_l2': trial.suggest_int('lambda_l2', 0, 100),
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.3, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.3, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 2, 6),
        'extra_trees': trial.suggest_categorical('extra_trees', [True, False]),
        'objective':'binary',
        'metric': 'auc',
        'random_state': train['random_state'],
    }

    n_folds = train['N_FOLDS']
    cv_pred = np.empty(n_folds)
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=train['random_state'])

    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):
        x_train_, x_val_ = x.iloc[train_idx], x.iloc[test_idx]
        y_train_, y_val_ = y.iloc[train_idx], y.iloc[test_idx]

        # NOTE(review): a new pruning callback is attached for every fold, so the
        # same boosting steps are reported to the trial once per fold — confirm
        # this is the intended pruning behaviour.
        pruning = optuna.integration.LightGBMPruningCallback(trial, 'auc')

        model = LGBMClassifier(
            class_weight='balanced',
            n_jobs=-1,
            verbose=-1,  # keep LightGBM quiet; fit() no longer takes `verbose`
            **params
        )
        # Early stopping is requested through the callback API: the
        # `early_stopping_rounds` / `verbose` arguments of fit() were removed in
        # LightGBM 4.0, while the callback works on 3.x as well.
        model.fit(x_train_, y_train_,
                  eval_metric='auc',
                  eval_set=[(x_val_, y_val_)],
                  callbacks=[pruning,
                             early_stopping(stopping_rounds=100, verbose=False)])

        y_proba = model.predict_proba(x_val_)[:, 1]
        cv_pred[fold] = roc_auc_score(y_val_, y_proba)

    return np.mean(cv_pred)

In [None]:
def tune_model(df_train: pd.DataFrame, df_test: pd.DataFrame, 
               df_submission: pd.DataFrame, sup, **kwargs) -> dict:
    """
    Tune the LightGBM hyperparameters for one supplier with Optuna.

    Args:
        df_train (pd.DataFrame): The full train frame.
        df_test (pd.DataFrame): The full test frame (only passed on to
            ``supplier_data``).
        df_submission: Kept for interface compatibility; not read by this
            function (``supplier_data`` uses the notebook-level object).
        sup: The supplier to tune the model for.
        **kwargs: Forwarded to ``objective``. If ``random_state`` is present it
            also seeds the Optuna sampler.

    Returns:
        dict: ``study.best_params`` of the finished study.
    """
    # Only the train frame is needed for cross-validated tuning.
    df_sup_train, _ = supplier_data(df_train, df_test, sup)

    # A purchase is a positive example when it occurs in the supplier's own
    # rows of the train frame.
    supplier_purchases = df_train.loc[df_train['supplier'] == sup, 'purchase'].unique()
    df_sup_train['target'] = df_sup_train.index.isin(supplier_purchases).astype(int)

    x_train = df_sup_train.drop(columns='target')
    y_train = df_sup_train['target']

    func = lambda trial: objective(trial, x_train, y_train, **kwargs)

    # TPE is Optuna's default sampler; it is created explicitly only to pass the
    # seed. With n_jobs=-1 trials run in parallel, so the trial order (and hence
    # the exact result) can still vary between runs.
    sampler = optuna.samplers.TPESampler(seed=kwargs.get('random_state'))
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(func, n_trials=50, n_jobs=-1)

    return study.best_params

In [None]:
for sup in tqdm(df_submission.index[:100]):

    # The learning rate is taken from the parameters already stored for this
    # supplier; the stored entry is then replaced by the tuned parameters.
    tuning_kwargs = {
        'random_state': train['random_state'],
        'learning_rate': recommender_params[sup]['learning_rate'],
    }
    recommender_params[sup] = tune_model(df_train, df_test, df_submission,
                                         sup, **tuning_kwargs)

In [None]:
# Write the tuned parameters back to the YAML file they were loaded from.
save_file(train['params'], recommender_params)

# Best params

In [None]:
# One metrics row per supplier is collected in a list and concatenated once,
# instead of growing a DataFrame with pd.concat on every iteration.
tuned_rows = []

for sup in tqdm(df_submission.index[:100]):

    # build the train and test frames for this supplier
    df_sup_train, df_sup_test = supplier_data(df_train, df_test, sup)

    # add the class labels: in train, a purchase is positive when it occurs in
    # the supplier's own rows of df_train; in test, when it is listed for the
    # supplier in df_submission
    supplier_purchases = df_train.loc[df_train['supplier'] == sup, 'purchase'].unique()
    df_sup_train['target'] = df_sup_train.index.isin(supplier_purchases).astype(int)
    df_sup_test['target'] = df_sup_test.index.isin(df_submission[sup]).astype(int)

    # train with the parameters stored for this supplier
    tuned_rows.append(train_lgbm(df_sup_train, df_sup_test, sup,
                                 **recommender_params[sup]))

# An empty DataFrame is used as the starting point when the loop did not run.
metrics = pd.concat(tuned_rows, ignore_index=True) if tuned_rows else pd.DataFrame()
metrics = metrics.set_index('model')

In [None]:
# Per-column mean of the tuned-model metrics. DataFrame.mean() is used instead
# of np.mean(DataFrame), which reduces over both axes on pandas >= 2.0.
metrics.mean()

In [None]:
# Store the per-column means of the tuned-model metrics under 'best_metrics'.
# DataFrame.mean() returns a Series on every pandas version; np.mean(DataFrame)
# returns a scalar on pandas >= 2.0, which has no .to_dict().
recommender_metrics['best_metrics'] = metrics.mean().to_dict()

In [None]:
# Save the metrics as YAML, using the same helper as for the parameters.
save_file(train['metrics'], recommender_metrics)

## Save models

In [None]:
# Maps each supplier to its fitted model.
models = {}

In [None]:
for sup in tqdm(df_submission.index[:100]):

    # build the frames for this supplier (only the train frame is used below)
    df_sup_train, df_sup_test = supplier_data(df_train, df_test, sup)

    # add the class labels: a purchase is positive when it occurs in the
    # supplier's own rows of df_train
    supplier_rows = df_train[df_train['supplier'] == sup]
    df_sup_train['target'] = df_sup_train.index.isin(
        supplier_rows['purchase'].unique()).astype(int)

    # every column except the last one ('target', just added) is a feature
    feature_columns = df_sup_train.columns[:-1]
    x_train, y_train = df_sup_train[feature_columns], df_sup_train['target']

    # fit with the parameters stored for this supplier and keep the model
    model = LGBMClassifier(class_weight='balanced', n_jobs=-1,
                           **recommender_params[sup])
    model.fit(x_train, y_train)
    models[sup] = model

# persist all fitted models in a single file
joblib.dump(models, train['models'])