In [1]:
import pandas as pd
import numpy as np

import yaml

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, \
                            recall_score, f1_score, log_loss, precision_recall_curve

import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier

import optuna

from typing import Tuple, Set

import warnings
warnings.filterwarnings('ignore')

# Seed passed as `random_state` to the baseline LightGBM model and to the
# StratifiedKFold splitter used during tuning.
RAND=42

# Number of stratified cross-validation folds scored per tuning trial.
N_FOLDS = 4

In [2]:
# Turn off pandas' chained-assignment warning and limit Optuna's logger to
# WARNING level and above.
pd.set_option('mode.chained_assignment', None)
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [3]:
def extract_words(string):
    """Parse a delimited, whitespace-separated vector string into floats.

    The first and last characters of `string` (the enclosing brackets in the
    `vectorized_tokens` column) are dropped, the remainder is split on
    whitespace and every token is converted with `float`.

    Args:
        string: Text such as "[ 0.1 0.2 0.3 ]".

    Returns:
        list of float, in the order the numbers appear in `string`.
    """
    inner = string[1:-1]
    return [float(token) for token in inner.split()]

In [4]:
def get_metrics(y_test, y_pred, y_score, name = "Default"):
    """Collect classification metrics into a one-row DataFrame.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        y_score: 2-D array of predicted class probabilities; column 1 is used
            as the score for ROC AUC, the full array is used for log loss.
        name: Value stored in the 'model' column.

    Returns:
        pandas.DataFrame with a single row and the columns 'model',
        'Accuracy', 'ROC_AUC', 'Precision', 'Recall', 'f1' and 'Logloss'.
    """
    positive_score = y_score[:, 1]

    return pd.DataFrame({
        'model': [name],
        'Accuracy': accuracy_score(y_test, y_pred),
        'ROC_AUC': roc_auc_score(y_test, positive_score),
        # zero_division=0 reports 0 instead of warning when a denominator is empty
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
        'Logloss': log_loss(y_test, y_score),
    })

In [5]:
config_path = "../config/params.yaml"

# Read the project configuration inside a context manager so the file handle
# is closed as soon as parsing finishes.
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Sections used by the rest of the notebook.
preproc = config["preprocessing"]
train = config["train"]

In [6]:
config_model = "../config/model_params/lightgbm.yaml"

# Parse the LightGBM parameter file named by `config_model` (not the general
# `config_path`), closing the handle once it has been read.
with open(config_model) as model_config_file:
    model_params = yaml.load(model_config_file, Loader=yaml.FullLoader)

# Baseline

In [7]:
# Load the training split; the stored 'index' column becomes the row index.
df_train = pd.read_csv(preproc['train_data']).set_index('index')

df_train.head()

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized_tokens,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3,1,290000.0,2,1,1,[ 0.18815488 0.1963165 0.08348706 0.132998...,2,58.2_2,1,0.0,4
1,2834,1,105000.0,218,1,1,[ 1.88778124e-01 1.99460707e-01 8.44090815e-...,2,62.0_2,1,0.0,4
2,154949,1,98967.5,11235,1,1,[ 0.17555872 0.0838882 0.01939559 0.047119...,2,63.9_2,1,0.0,4
3,147950,0,77460.03,11061,1,1,[ 2.09548737e-01 1.98348963e-01 3.00821184e-...,3,62.0_2,1,0.0,4
4,165990,0,138000.0,11558,1,1,[ 1.55703006e-01 1.47389050e-01 4.36386056e-...,3,62.0_2,1,0.0,4


In [8]:
# Load the test split; the stored 'index' column becomes the row index.
df_test = pd.read_csv(preproc['test_data']).set_index('index')

df_test.head()

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized_tokens,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1064,1,181720.0,70,1,1,[ 1.77900028e-01 8.02768195e-02 1.25833983e-...,11,58.2_2,1,0.0,4.0
1,1729,1,167448.0,105,1,1,[ 2.70711475e-01 8.42926477e-02 4.42017172e-...,12,62.0_2,1,0.0,4.0
2,2430,1,200248.16,201,1,1,[ 2.59523023e-01 1.20019309e-01 7.98608701e-...,12,62.0_2,1,0.0,4.0
3,156898,0,190740.0,11266,1,1,[ 0.20131386 0.07292083 -0.01833528 -0.017387...,12,62.0_2,1,0.0,4.0
4,412017,1,2886156.0,9651,100009,0,[ 5.40626203e-02 7.06050537e-02 -3.14055514e-...,11,27.4_77,1,0.0,1.0


In [9]:
# Turn the stringified vectors in 'vectorized_tokens' into lists of floats,
# for the train split first and then the test split.
for frame in (df_train, df_test):
    frame['vectorized_tokens'] = frame['vectorized_tokens'].apply(extract_words)

In [10]:
# Apply the column -> dtype mapping from the preprocessing config to both splits.
column_types = preproc['change_type_columns']
df_train = df_train.astype(column_types)
df_test = df_test.astype(column_types)

In [11]:
# Copy components 0..99 of every token vector into scalar columns named
# '0'..'99' in both splits.
for i in tqdm(range(100)):
    column = str(i)
    for frame in (df_train, df_test):
        frame[column] = frame['vectorized_tokens'].apply(lambda x: x[i])

  0%|          | 0/100 [00:00<?, ?it/s]

In [23]:
def find_closest_vectors(target_vectors: np.ndarray, vector_space: np.ndarray,
                         k: int = 5) -> np.ndarray:
    """
    Find the k vectors in vector_space with the smallest total distance to target_vectors.

    Both inputs are min-max scaled with a scaler fitted on target_vectors only.
    For every candidate row the Euclidean distances to all target rows are
    summed, and the k candidates with the smallest sums are selected.

    Args:
        target_vectors (numpy.ndarray): A 2D array-like of shape (n, m)
            where n is the number of target vectors and m is the dimensionality of the vectors.
        vector_space (numpy.ndarray): A 2D array-like of shape (p, m)
            where p is the number of candidate vectors and m is the
            dimensionality of the vectors.
        k (int): The number of closest vectors to return. Default is 5.

    Returns:
        numpy.ndarray: A 1D array of positional row indices into vector_space.
        It has length k when k < p (in no particular order, as produced by
        np.argpartition); when k >= p it contains every row index 0..p-1.
    """
    mms = MinMaxScaler()

    # Normalise both sets with the ranges learned from the target vectors.
    target_vectors = mms.fit_transform(target_vectors)
    vector_space = mms.transform(vector_space)

    n_candidates = vector_space.shape[0]

    # Not more candidates than requested: every row is selected. Returning
    # plain row positions keeps the result usable with `.iloc`, and skips the
    # distance computation that would not change the outcome.
    if k >= n_candidates:
        return np.arange(n_candidates)

    # Total distance from each candidate to all target vectors.
    sum_distances = np.array([
        np.sum(np.linalg.norm(target_vectors - vector, axis=1))
        for vector in vector_space
    ])

    # Pick the k candidates with the smallest total distance.
    return np.argpartition(sum_distances, k)[:k]

In [25]:
def baseline_lgbm(df_tr: pd.DataFrame, df_t: pd.DataFrame, params = None) -> pd.DataFrame:
    """
    Trains a LightGBM classifier on the training set and returns the score on the test set.

    Every column of df_tr except 'target' is used as a feature; the same
    columns are read from df_t.

    Args:
        df_tr (pandas.DataFrame): Training set containing a 'target' column.
        df_t (pandas.DataFrame): Test set containing the same feature columns
            and a 'target' column.
        params (dict, optional): Extra keyword arguments for LGBMClassifier.
            They override the baseline settings (random_state=RAND,
            class_weight='balanced', n_jobs=-1). With the default None the
            baseline settings are used unchanged.

    Returns:
        pandas.DataFrame: One-row frame of metrics produced by get_metrics.

    """
    # Select features by name so the result does not depend on 'target'
    # happening to be the last column.
    feature_columns = [column for column in df_tr.columns if column != 'target']

    x_train, y_train = df_tr[feature_columns], df_tr['target']
    x_test, y_test = df_t[feature_columns], df_t['target']

    model_kwargs = {'random_state': RAND, 'class_weight': 'balanced', 'n_jobs': -1}
    if params:
        model_kwargs.update(params)

    model = LGBMClassifier(**model_kwargs)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    y_score = model.predict_proba(x_test)

    return get_metrics(y_test, y_pred, y_score)

In [13]:
def filter_data(df_train, df_sup, sup):
    """Restrict train and test rows to the reg_codes seen for one supplier.

    NOTE(review): `df_sup` is never read, and the test rows come from the
    notebook-level `df_test` rather than from an argument. Confirm whether
    `df_sup` was meant to be the test frame.

    Args:
        df_train: Training frame with 'supplier' and 'reg_code' columns.
        df_sup: Unused.
        sup: Supplier value matched against df_train['supplier'].

    Returns:
        (train_rows, test_rows): rows whose 'reg_code' is among the supplier's
        training reg_codes. If no test row matches, the whole `df_test` is
        returned instead.
    """
    supplier_mask = df_train['supplier'] == sup
    unique_reg_okpd = df_train.loc[supplier_mask, 'reg_code'].unique()

    # keep only rows that share a reg_code with the supplier's training rows
    train_rows = df_train[df_train['reg_code'].isin(unique_reg_okpd)]
    test_rows = df_test[df_test['reg_code'].isin(unique_reg_okpd)]

    return train_rows, (df_test if test_rows.empty else test_rows)

In [26]:
def edit_data(df_train: pd.DataFrame, df_test: pd.DataFrame, 
              df: pd.DataFrame, sup: int) -> Tuple[pd.DataFrame, pd.DataFrame, Set]:
    """
    Build the per-supplier train/test frames and the set of the supplier's purchases.

    Train and test rows are restricted to the reg_code values that appear for
    `sup` in df_train. If the filtered test rows do not contain both purchases
    the supplier took part in and purchases it did not, the full df_test is
    used instead. The columns listed in the notebook-level `columns_to_drop`
    are removed, duplicate rows are dropped and 'purchase' becomes the index.

    Args:

        df_train: pandas.DataFrame - the training dataset
        df_test: pandas.DataFrame - the test dataset
        df: pandas.DataFrame - the original dataset, used to collect the
            supplier's purchases
        sup: value compared against the 'supplier' column

    Returns:

        Tuple[pandas.DataFrame, pandas.DataFrame, set] - the prepared training
        frame, the prepared test frame, and the set of 'purchase' values of
        rows in df whose 'supplier' equals sup.
    """
    # reg_codes observed for this supplier in the training data
    supplier_reg_codes = df_train.loc[df_train['supplier'] == sup, 'reg_code'].unique()

    train_part = df_train[df_train['reg_code'].isin(supplier_reg_codes)]
    test_part = df_test[df_test['reg_code'].isin(supplier_reg_codes)]

    # every purchase the supplier appears in, taken from the full dataset
    supplier_purchases = set(df.loc[df['supplier'] == sup, 'purchase'])

    # the membership mask must take both values (True and False) on the
    # filtered test rows; otherwise fall back to the complete test set
    has_both_classes = test_part['purchase'].isin(supplier_purchases).nunique() >= 2
    if not has_both_classes:
        test_part = df_test.copy()

    def _finalise(frame):
        # drop the excluded columns and duplicates, then index by purchase
        return frame.drop(columns=columns_to_drop).drop_duplicates().set_index('purchase')

    return _finalise(train_part), _finalise(test_part), supplier_purchases

In [27]:
def rec_algorithm(df_train: pd.DataFrame, df_test: pd.DataFrame, 
                  df_submission: pd.DataFrame, df: pd.DataFrame,
                  n_suppliers: int = 500) -> pd.Series:
    """
    Evaluate the baseline LightGBM classifier per supplier and average the metrics.

    For each of the first `n_suppliers` entries of df_submission.index the data
    is prepared with edit_data, a binary 'target' (purchase belongs to the
    supplier's purchase set) is added to both frames, and baseline_lgbm is
    trained and scored.

    Args:
    - df_train (pd.DataFrame): Training data passed to edit_data.
    - df_test (pd.DataFrame): Test data passed to edit_data.
    - df_submission (pd.DataFrame): Frame whose index holds the suppliers to evaluate.
    - df (pd.DataFrame): Full dataset passed to edit_data to collect each
      supplier's purchases.
    - n_suppliers (int): How many suppliers from the start of
      df_submission.index to evaluate. Default is 500.

    Returns:
    - pd.Series: Column-wise mean of the numeric metrics over all evaluated
      suppliers (empty if no supplier was evaluated).
    """
    per_supplier = []

    for sup in tqdm(df_submission.index[:n_suppliers]):

        # prepare the data for training
        df_sup_train, df_sup_test, df_sup_true = edit_data(df_train, df_test, df, sup)

        # add the labels for the classifier
        df_sup_train['target'] = df_sup_train.index.isin(df_sup_true).astype(int)
        df_sup_test['target'] = df_sup_test.index.isin(df_sup_true).astype(int)

        per_supplier.append(baseline_lgbm(df_sup_train, df_sup_test))

    if not per_supplier:
        return pd.Series(dtype=float)

    # Concatenate once instead of re-copying the growing frame on every
    # iteration; numeric_only skips the string 'model' column.
    return pd.concat(per_supplier).mean(axis=0, numeric_only=True)

In [28]:
def alg_nearest(df_train: pd.DataFrame, df_test: pd.DataFrame, 
                df_submission_less_4: pd.DataFrame, df: pd.DataFrame) -> float:
    """
    Average recall of the nearest-vector recommendation over the first 500
    entries of df_submission_less_4.index.

    Args:
    - df_train: training data DataFrame
    - df_test: testing data DataFrame
    - df_submission_less_4: frame indexed by supplier whose 'purchases' entry
      is intersected with the predicted purchases (so it must support `&`
      with a set and `len`)
    - df: DataFrame passed to edit_data to collect each supplier's purchases

    Returns:
    - float: mean over suppliers of
      |predicted purchases ∩ purchases| / |purchases|
    """

    def _supplier_recall(sup):
        # prepare the data for this supplier
        df_sup_train, df_sup_test, df_sup_true = edit_data(df_train, df_test, df, sup)

        # training rows of purchases the supplier took part in
        vectors = df_sup_train[df_sup_train.index.isin(df_sup_true)]

        # test rows closest to those training rows
        idx = find_closest_vectors(vectors, df_sup_test)
        y_pred = set(df_sup_test.iloc[idx].index)

        purchases = df_submission_less_4.loc[sup]['purchases']
        return len(y_pred & purchases) / len(purchases)

    recall_less_4 = [_supplier_recall(sup)
                     for sup in tqdm(df_submission_less_4.index[:500])]

    return np.mean(recall_less_4)

In [None]:
# Per-supplier baseline evaluation for the first 500 suppliers of
# df_submission (same steps as rec_algorithm, kept as a standalone cell).
supplier_metrics = []

for sup in tqdm(df_submission.index[:500]):

    # prepare the data for training
    df_sup_train, df_sup_test, df_sup_true = edit_data(df_train, df_test, df, sup)

    # add the labels for the classifier
    df_sup_train['target'] = df_sup_train.index.isin(df_sup_true).astype(int)
    df_sup_test['target'] = df_sup_test.index.isin(df_sup_true).astype(int)

    supplier_metrics.append(baseline_lgbm(df_sup_train, df_sup_test))

# Concatenate once after the loop instead of copying the frame every iteration.
metrics = pd.concat(supplier_metrics) if supplier_metrics else pd.DataFrame()

# Displayed as the cell result: `return` is only valid inside a function.
# numeric_only skips the string 'model' column.
metrics.mean(axis=0, numeric_only=True)

In [87]:
# Mean baseline metrics across the evaluated suppliers.
rec_algorithm(df_train=df_train, df_test=df_test,
              df_submission=df_submission, df=df)

  0%|          | 0/500 [00:00<?, ?it/s]

  return metrics.mean(axis=0)


Accuracy     0.956422
ROC_AUC      0.841957
Precision    0.241158
Recall       0.266748
f1           0.219859
Logloss      0.196172
dtype: float64

In [103]:
# Mean recall of the nearest-vector approach across the evaluated suppliers.
alg_nearest(df_train=df_train, df_test=df_test,
            df_submission_less_4=df_submission_less_4, df=df)

  0%|          | 0/500 [00:00<?, ?it/s]

0.3411196863246043

# Tune

In [28]:
def objective(trial: optuna.Trial, x: pd.DataFrame, y: pd.Series, learning_rate=None) -> float:
    """
    Optuna objective: mean cross-validated ROC AUC of a LightGBM binary classifier.

    Args:
        trial (optuna.Trial): A trial corresponding to a set of hyperparameters.
        x (pd.DataFrame): The features to be used for training and validation.
        y (pd.Series): The target variable for training and validation.
        learning_rate (float, optional): When given, the learning rate is fixed
            to this value (registered as a single-choice categorical so it still
            appears in the trial's parameters). When None, it is sampled from
            [0.001, 0.1].

    Returns:
        float: The mean ROC AUC over N_FOLDS stratified folds.
    """
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [400]),
        'learning_rate': (
            trial.suggest_float('learning_rate', 0.001, 0.1)
            if learning_rate is None
            else trial.suggest_categorical('learning_rate', [learning_rate])
        ),
        # Further search dimensions, currently disabled:
#         'max_bin': trial.suggest_int('max_bin', 200, 800),
#         'num_leaves': trial.suggest_int('num_leaves', 20, 150),
#         'max_depth': trial.suggest_int('max_depth', 4, 12),
#         'min_child_samples': trial.suggest_int('min_child_samples', 200, 20000, step=200),
#         'lambda_l1': trial.suggest_int('lambda_l1', 0, 50),
#         'lambda_l2': trial.suggest_int('lambda_l2', 0, 50),
#         'min_split_gain': trial.suggest_float('min_split_gain', 0.001, 0.1),
#         'objective': 'binary',
#         'metric': 'auc',
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.3, 1.0),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.3, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 2, 6),
#         'random_state': RAND,
    }

    cv_pred = np.empty(N_FOLDS)
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):
        x_train_, x_val_ = x.iloc[train_idx], x.iloc[test_idx]
        y_train_, y_val_ = y.iloc[train_idx], y.iloc[test_idx]

        # lets Optuna stop unpromising trials based on the validation AUC
        pruning = optuna.integration.LightGBMPruningCallback(trial, 'auc')

        model = LGBMClassifier(
            class_weight='balanced',
            n_jobs=-1,
            **params
        )
        # NOTE(review): `early_stopping_rounds` and `verbose` are fit() keywords
        # that LightGBM 4.x no longer accepts; on a newer LightGBM they have to
        # be expressed as callbacks instead. Confirm the pinned version.
        model.fit(x_train_, y_train_,
                  eval_metric='auc',
                  eval_set=[(x_val_, y_val_)],
                  early_stopping_rounds=100,
                  callbacks=[pruning],
                  verbose=0)

        # only the positive-class probability is needed for ROC AUC
        y_proba = model.predict_proba(x_val_)[:, 1]

        cv_pred[fold] = roc_auc_score(y_val_, y_proba)
    return np.mean(cv_pred)

In [33]:
def tune_lgbm(df_tr: pd.DataFrame, df_t: pd.DataFrame, learning_rate=None) -> dict:
    """
    Run an Optuna study on the training frame and return the best parameters.

    Every column of df_tr except 'target' is used as a feature. The study
    maximises the value returned by `objective` over 50 trials.

    Args:
        df_tr (pandas.DataFrame): Training set containing a 'target' column.
        df_t (pandas.DataFrame): Test set; accepted for signature
            compatibility but not used during tuning.
        learning_rate: Forwarded unchanged to `objective`.

    Returns:
        dict: `study.best_params` of the finished study.
    """
    # Select features by name so the result does not depend on 'target'
    # happening to be the last column.
    feature_columns = [column for column in df_tr.columns if column != 'target']
    x_train = df_tr[feature_columns]
    y_train = df_tr['target']

    def func(trial):
        return objective(trial, x_train, y_train, learning_rate)

    study = optuna.create_study(direction="maximize")
    study.optimize(func, n_trials=50, n_jobs=-1)

    return study.best_params

In [32]:
def open_file(file_path):
    """Load a YAML document from `file_path`.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        The parsed document. An empty document (which the parser reports as
        None) is returned as an empty dict so callers can use `key in data`
        and item assignment directly. A missing file still raises.
    """
    with open(file_path, 'r') as file:
        loaded = yaml.safe_load(file)

    return {} if loaded is None else loaded
    
def save_file(file_path, data):
    """Write `data` to `file_path` as YAML, replacing the file's contents.

    Args:
        file_path: Destination path.
        data: Object to serialise with yaml.dump.
    """
    # Serialise before opening the file: opening with 'w' truncates it, so a
    # failure inside yaml.dump would otherwise wipe the previously saved data.
    serialized = yaml.dump(data)

    with open(file_path, 'w') as file:
        file.write(serialized)

In [31]:
def tune_algorithm(df_train: pd.DataFrame, df_test: pd.DataFrame, 
                   df_submission: pd.DataFrame, df: pd.DataFrame, sup, learning_rate=None):
    """
    Tune LightGBM hyperparameters for a single supplier.

    Args:
        df_train (pandas.DataFrame): Training data passed to edit_data.
        df_test (pandas.DataFrame): Test data passed to edit_data.
        df_submission (pandas.DataFrame): Not used by this function.
        df (pandas.DataFrame): Full dataset passed to edit_data.
        sup: Supplier to tune for.
        learning_rate: Forwarded to tune_lgbm.

    Returns:
        The value returned by tune_lgbm for this supplier's data.
    """
    # prepare the data for training
    df_sup_train, df_sup_test, df_sup_true = edit_data(df_train, df_test, df, sup)

    # add the labels for the classifier
    df_sup_train['target'] = df_sup_train.index.isin(df_sup_true).astype(int)
    df_sup_test['target'] = df_sup_test.index.isin(df_sup_true).astype(int)

    # tune_lgbm takes (train, test, learning_rate); the supplier id is not
    # one of its parameters.
    params = tune_lgbm(df_sup_train, df_sup_test, learning_rate)

    return params

In [30]:
# YAML file the tuned per-supplier parameters are loaded from and saved to.
file_path = "../config/model_params.yaml"

In [34]:
# Previously tuned parameters keyed by supplier; start empty if the file has
# no content yet.
data = open_file(file_path) or {}

for sup in tqdm(df_submission.index[:500]):

    if sup in data:
        # Stored entries are parameter dicts; tune_algorithm only understands
        # `learning_rate`, so forward just that key instead of the whole dict.
        params = tune_algorithm(df_train, df_test, df_submission, df, sup,
                                learning_rate=data[sup].get('learning_rate'))
        data[sup].update(params)

    # no entry for this supplier yet: store the tuned parameters
    else:
        params = tune_algorithm(df_train, df_test, df_submission, df, sup)
        data[sup] = params

save_file(file_path, data)

  0%|          | 0/500 [00:00<?, ?it/s]





In [29]:
def classifier_lgbm(df_tr: pd.DataFrame, df_t: pd.DataFrame, params = None) -> pd.DataFrame:
    """
    Trains a LightGBM classifier with the given parameters and returns the score on the test set.

    Every column of df_tr except 'target' is used as a feature; the same
    columns are read from df_t.

    Args:
        df_tr (pandas.DataFrame): Training set containing a 'target' column.
        df_t (pandas.DataFrame): Test set containing the same feature columns
            and a 'target' column.
        params (dict, optional): Keyword arguments for LGBMClassifier, passed
            in addition to class_weight='balanced' and n_jobs=-1. None means
            no extra arguments.

    Returns:
        pandas.DataFrame: One-row frame of metrics produced by get_metrics.

    """
    # Select features by name so the result does not depend on 'target'
    # happening to be the last column.
    feature_columns = [column for column in df_tr.columns if column != 'target']

    x_train, y_train = df_tr[feature_columns], df_tr['target']
    x_test, y_test = df_t[feature_columns], df_t['target']

    # `params or {}`: unpacking the default None would raise a TypeError.
    model = LGBMClassifier(class_weight='balanced', n_jobs=-1, **(params or {}))
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    y_score = model.predict_proba(x_test)

    return get_metrics(y_test, y_pred, y_score)

In [33]:
# Reload the saved parameters from `file_path`; the evaluation loop below
# looks them up per supplier.
data = open_file(file_path)

In [35]:
# Score the tuned classifier for the first 500 suppliers of df_submission.
supplier_metrics = []

for sup in tqdm(df_submission.index[:500]):

    # prepare the data for training
    df_sup_train, df_sup_test, df_sup_true = edit_data(df_train, df_test, df, sup)

    # add the labels for the classifier
    df_sup_train['target'] = df_sup_train.index.isin(df_sup_true).astype(int)
    df_sup_test['target'] = df_sup_test.index.isin(df_sup_true).astype(int)

    # parameters stored for this supplier
    params = data[sup]

    supplier_metrics.append(classifier_lgbm(df_sup_train, df_sup_test, params=params))

# Concatenate once after the loop instead of copying the frame every iteration.
metrics = pd.concat(supplier_metrics) if supplier_metrics else pd.DataFrame()

  0%|          | 0/500 [00:00<?, ?it/s]





In [36]:
# Average each metric over suppliers; numeric_only skips the string 'model'
# column, which cannot be averaged.
metrics.mean(axis=0, numeric_only=True)

Accuracy     0.730974
ROC_AUC      0.743162
Precision    0.090660
Recall       0.638708
f1           0.139746
Logloss      0.436611
dtype: float64