In [115]:
import pandas as pd
import numpy as np

import yaml

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, \
                            recall_score, f1_score, log_loss, precision_recall_curve

import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier

from typing import Tuple, Set

RAND=42

In [2]:
pd.options.mode.chained_assignment = None

In [3]:
def extract_words(string):
    return list(map(float, string[1:-1].split()))

In [71]:
def get_metrics(y_test, y_pred, y_score, name = "Default"):
    """Метрики для задачи классификации"""
    df_metrics = pd.DataFrame()

    df_metrics['model'] = [name]
    df_metrics['Accuracy'] = accuracy_score(y_test, y_pred)
    df_metrics['ROC_AUC'] = roc_auc_score(y_test, y_score[:, 1])
    df_metrics['Precision'] = precision_score(y_test, y_pred, zero_division=0)
    df_metrics['Recall'] = recall_score(y_test, y_pred, zero_division=0)
    df_metrics['f1'] = f1_score(y_test, y_pred, zero_division=0)
    df_metrics['Logloss'] = log_loss(y_test, y_score)

    return df_metrics

In [5]:
config_path = "../config/params.yaml"
config = yaml.load(open(config_path), Loader=yaml.FullLoader)

baseline = config["baseline"]

# Baseline

In [6]:
df = pd.read_csv("eda.csv")
df[:5]

Unnamed: 0.1,Unnamed: 0,purchase,min_publish_date,price,customer,supplier,is_winner,tokens,month,flag_won,vectorized_tokens,forsmallbiz,reg_code
0,0,3,2021-02-02,290000.0,2,1,1,"['оказание', 'лицензионный', 'заказчик', 'обес...",2,0,[ 0.09995163 -0.04592259 0.01378636 -0.108661...,1,58.2_2
1,1,2834,2021-02-02,105000.0,218,1,1,"['оказание', 'лицензионный', 'заказчик', 'проч...",2,0,[ 0.10110906 -0.0440736 0.01088561 -0.092925...,1,62.0_2
2,2,154949,2021-02-16,98967.5,11235,1,1,"['оказание', 'справочный', 'услуга', 'справочн...",2,0,[ 9.61258310e-02 -7.53206381e-02 1.30420232e-...,1,63.9_2
3,3,147950,2021-03-10,77460.03,11061,1,1,"['центральный', 'нужда', 'оказание', 'область'...",3,0,[ 1.81690110e-01 -7.98087106e-02 2.36332247e-...,0,62.0_2
4,4,165990,2021-03-26,138000.0,11558,1,1,"['использование', 'год', 'услуга', 'поддержка'...",3,0,[ 8.36613937e-02 -3.61296042e-02 1.18028555e-...,0,62.0_2


In [7]:
df = df.drop(columns=['Unnamed: 0'])

In [9]:
df['vectorized_tokens'] = df['vectorized_tokens'].apply(extract_words)

In [10]:
df = df.astype(baseline['change_type_columns'])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958129 entries, 0 to 958128
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   purchase           958129 non-null  int64         
 1   min_publish_date   958129 non-null  datetime64[ns]
 2   price              958129 non-null  float64       
 3   customer           958129 non-null  category      
 4   supplier           958129 non-null  category      
 5   is_winner          958129 non-null  int64         
 6   tokens             958129 non-null  object        
 7   month              958129 non-null  int64         
 8   flag_won           958129 non-null  int64         
 9   vectorized_tokens  958129 non-null  object        
 10  forsmallbiz        958129 non-null  int64         
 11  reg_code           958129 non-null  category      
dtypes: category(3), datetime64[ns](1), float64(1), int64(5), object(2)
memory usage: 76.5+ MB


In [12]:
for i in tqdm(range(100)):
    df[str(i)] = df['vectorized_tokens'].apply(lambda x: x[i])

  0%|          | 0/100 [00:00<?, ?it/s]

  df[str(i)] = df['vectorized_tokens'].apply(lambda x: x[i])
  df[str(i)] = df['vectorized_tokens'].apply(lambda x: x[i])
  df[str(i)] = df['vectorized_tokens'].apply(lambda x: x[i])
  df[str(i)] = df['vectorized_tokens'].apply(lambda x: x[i])
  df[str(i)] = df['vectorized_tokens'].apply(lambda x: x[i])
  df[str(i)] = df['vectorized_tokens'].apply(lambda x: x[i])
  df[str(i)] = df['vectorized_tokens'].apply(lambda x: x[i])
  df[str(i)] = df['vectorized_tokens'].apply(lambda x: x[i])
  df[str(i)] = df['vectorized_tokens'].apply(lambda x: x[i])


In [13]:
df = df.drop(columns=['tokens', 'vectorized_tokens'])

In [14]:
df[:5]

Unnamed: 0,purchase,min_publish_date,price,customer,supplier,is_winner,month,flag_won,forsmallbiz,reg_code,...,90,91,92,93,94,95,96,97,98,99
0,3,2021-02-02,290000.0,2,1,1,2,0,1,58.2_2,...,0.017627,0.016926,-0.049357,-0.026286,0.02097,0.003175,-0.028066,0.008443,0.001411,-0.034929
1,2834,2021-02-02,105000.0,218,1,1,2,0,1,62.0_2,...,0.046094,0.007618,-0.027536,-0.053111,0.011831,0.011928,0.014621,0.018501,-0.002532,-0.042839
2,154949,2021-02-16,98967.5,11235,1,1,2,0,1,63.9_2,...,0.001504,-0.014368,-0.003083,-0.02338,-0.011952,-0.015312,0.001604,-0.010423,0.008548,0.007586
3,147950,2021-03-10,77460.03,11061,1,1,3,0,0,62.0_2,...,0.04739,0.011939,-0.026689,-0.026909,0.010685,0.032261,0.011898,-0.028451,0.02132,-0.018372
4,165990,2021-03-26,138000.0,11558,1,1,3,0,0,62.0_2,...,0.029461,0.00906,0.002067,-0.056179,-0.007963,0.008658,0.010949,-0.01563,0.029524,-0.03878


In [16]:
# удаляем колонки, которые не известны при рекомендациях
df = df.drop(columns=['flag_won', 'is_winner'])

In [18]:
# определяем train выборку
df_train = df[df['min_publish_date'] < '2022-11-01']
df_train[:5]

Unnamed: 0,purchase,min_publish_date,price,customer,supplier,month,forsmallbiz,reg_code,0,1,...,90,91,92,93,94,95,96,97,98,99
0,3,2021-02-02,290000.0,2,1,2,1,58.2_2,0.099952,-0.045923,...,0.017627,0.016926,-0.049357,-0.026286,0.02097,0.003175,-0.028066,0.008443,0.001411,-0.034929
1,2834,2021-02-02,105000.0,218,1,2,1,62.0_2,0.101109,-0.044074,...,0.046094,0.007618,-0.027536,-0.053111,0.011831,0.011928,0.014621,0.018501,-0.002532,-0.042839
2,154949,2021-02-16,98967.5,11235,1,2,1,63.9_2,0.096126,-0.075321,...,0.001504,-0.014368,-0.003083,-0.02338,-0.011952,-0.015312,0.001604,-0.010423,0.008548,0.007586
3,147950,2021-03-10,77460.03,11061,1,3,0,62.0_2,0.18169,-0.079809,...,0.04739,0.011939,-0.026689,-0.026909,0.010685,0.032261,0.011898,-0.028451,0.02132,-0.018372
4,165990,2021-03-26,138000.0,11558,1,3,0,62.0_2,0.083661,-0.03613,...,0.029461,0.00906,0.002067,-0.056179,-0.007963,0.008658,0.010949,-0.01563,0.029524,-0.03878


In [19]:
# определяем test выборку
df_test = df[df['min_publish_date'] >= '2022-11-01']
df_test[:5]

Unnamed: 0,purchase,min_publish_date,price,customer,supplier,month,forsmallbiz,reg_code,0,1,...,90,91,92,93,94,95,96,97,98,99
17,1064,2022-11-15,181720.0,70,1,11,1,58.2_2,0.118956,-0.058592,...,0.003299,0.007779,-0.044355,-0.016624,0.022317,0.028108,-0.003703,-0.005069,0.000613,-0.026491
18,1729,2022-12-06,167448.0,105,1,12,1,62.0_2,0.102237,-0.078737,...,0.029983,0.000158,0.012667,-0.072771,-0.004045,-0.000979,-0.015035,-0.00266,0.014919,-0.056247
19,2430,2022-12-08,200248.16,201,1,12,1,62.0_2,0.133928,-0.085101,...,0.014602,0.00846,-0.003175,-0.079399,-0.011833,0.022215,0.00758,-0.013096,0.027716,-0.062906
20,156898,2022-12-12,190740.0,11266,1,12,0,62.0_2,0.069836,-0.065891,...,0.020553,-0.020859,0.035928,-0.060225,-0.030322,0.010242,-0.003675,-0.002666,0.031446,-0.026175
76,412017,2022-11-07,2886156.0,9651,100009,11,1,27.4_77,0.102239,-0.057708,...,0.017813,-0.030042,-0.058707,-0.019106,-0.023805,0.015103,0.020688,0.023299,0.016912,-0.038975


In [20]:
# определяем поставщиков, которые есть в test, но отсутствуют в train
sup_to_remove = set(df_test['supplier'].unique()) - set(df_train['supplier'].unique())

In [33]:
df_submission = df_test.groupby('supplier')['purchase'].apply(set).to_frame(name='purchases')
# удаляем поставщиков, которые появились на платформе в последние 2 месяца
df_submission = df_submission.drop(sup_to_remove, axis=0)
df_submission[:5]

Unnamed: 0_level_0,purchases
supplier,Unnamed: 1_level_1
1,"{1064, 1729, 156898, 2430}"
2,"{141478, 536503}"
3,{149878}
4,{}
5,{154789}


In [37]:
# удаляем поставщиков, которых нет на test
df_submission = df_submission.loc[df_submission['purchases'] != set()]
df_submission[:5]

Unnamed: 0_level_0,purchases
supplier,Unnamed: 1_level_1
1,"{1064, 1729, 156898, 2430}"
2,"{141478, 536503}"
3,{149878}
5,{154789}
6,"{41633, 539818, 481931, 40178, 443698, 535989,..."


In [50]:
def extract_less_4(row):
    return df_train[df_train['supplier'] == row.name].shape[0] < 4

# выделим поставщиков df_submission, содержащих менее 4 участий на train
df_submission_less_4 = df_submission[df_submission.apply(extract_less_4, axis=1)]
df_submission_less_4[:5]

Unnamed: 0_level_0,purchases
supplier,Unnamed: 1_level_1
160,{415}
178,"{153344, 145730, 147043, 141508, 149879, 16401..."
244,{408}
263,{151070}
290,{3052}


In [124]:
def find_closest_vectors(target_vectors: np.ndarray, vector_space: np.ndarray, 
                         k: int =5) -> np.ndarray:
    """
    Find the k vectors in the given vector_space that are closest to the given target_vectors.

    Args:
        target_vectors (numpy.ndarray): A 2D numpy array of shape (n, m) 
            where n is the number of target vectors and m is the dimensionality of the vectors.
        vector_space (numpy.ndarray): A 2D numpy array of shape (p, m) 
            where p is the number of vectors in the vector space and m is the 
            dimensionality of the vectors.
        k (int): The number of closest vectors to return. Default is 5.

    Returns:
        numpy.ndarray: A 1D numpy array of length k containing the indices of 
        the closest vectors in the vector space.
    """
    sum_distances = []
    
    
    mms = MinMaxScaler()
    
    
    #Нормализуем векторы
    target_vectors = mms.fit_transform(target_vectors)
    vector_space = mms.transform(vector_space)
    
    
    for vector in vector_space:
        # создаем массив расстояний между вектором в пространстве и целевыми векторами
        distances = np.linalg.norm(target_vectors - vector, axis=1)
        # сохраняем сумму расстояний между вектором в пространстве и целевыми векторами
        sum_distances.append(np.sum(distances))
    
    
    # выбираем набор векторов с минимальной суммой расстояний до каждого вектора в target_vectors
    if k >= vector_space.shape[0]:
        return np.nonzero(vector_space)
    
    
    return np.argpartition(sum_distances, k)[:k]

In [25]:
# определяем столбцы для удаления перед обучением
columns_to_drop = ['supplier', 'min_publish_date', 'customer', 'reg_code']

In [83]:
def baseline_lgbm(df_tr: pd.DataFrame, df_t: pd.DataFrame) -> pd.DataFrame:
    """
    Trains a LightGBM classifier on the training set and returns the score on the test set.

    Args:
        df_tr (pandas.DataFrame): A pandas DataFrame containing the training set.
        df_t (pandas.DataFrame): A pandas DataFrame containing the test set.

    Returns:
        pandas.DataFrame: The score on the test set.

    """

    x_train = df_tr[df_tr.columns[:-1]]
    y_train = df_tr['target']
    
    x_test = df_t[df_tr.columns[:-1]]
    y_test = df_t['target']

    model = LGBMClassifier(random_state=RAND, class_weight='balanced', n_jobs=-1)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    y_score = model.predict_proba(x_test)

    return get_metrics(y_test, y_pred, y_score)

In [121]:
def edit_data(df_train: pd.DataFrame, df_test: pd.DataFrame, 
              df: pd.DataFrame, sup: int) -> Tuple[pd.DataFrame, pd.DataFrame, Set]:
    """
    The function filters the training and test datasets of a supplier with a given 
    reg_code and returns the filtered dataframes as well as the set of participations 
    of this supplier in the original dataset.

    Args:

        df_train: pandas.DataFrame - the training dataset
        df_test: pandas.DataFrame - the test dataset
        df: pandas.DataFrame - the original dataset
        sup: str - the registration code of the supplier

    Returns:

        Tuple[pandas.DataFrame, pandas.DataFrame, set] - a tuple of the filtered 
        dataframes of the training and test datasets, as well as the set of participations 
        of the supplier.
    """
    unique_reg_okpd = df_train[df_train['supplier'] == sup]['reg_code'].unique()
    
    # фильтруем train и test на основе уникальных reg_code поставщиков 
    df_sup_train = df_train[df_train['reg_code'].isin(unique_reg_okpd)]
    df_sup_test = df_test[df_test['reg_code'].isin(unique_reg_okpd)]
    
    # выделяем все участия поставщика
    df_sup_true = set(df[df['supplier'] == sup]['purchase'])
    
    # проверяем, остались ли участия на test после фильтрации
    if df_sup_test['purchase'].isin(df_sup_true).nunique() < 2:
        df_sup_test = df_test.copy()
        
    # удаляем выделенные столбцы и дубликаты
    df_sup_train = df_sup_train.drop(columns=columns_to_drop).drop_duplicates()
    df_sup_test = df_sup_test.drop(columns=columns_to_drop).drop_duplicates()
    
    df_sup_test = df_sup_test.set_index('purchase')
    df_sup_train = df_sup_train.set_index('purchase')
    
    return df_sup_train, df_sup_test, df_sup_true

In [122]:
def rec_algorithm(df_train: pd.DataFrame, df_test: pd.DataFrame, 
                  df_submission: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """
    Recommends items for the first 500 suppliers in df_submission based on 
    training data in df_train and df.
    Uses the LightGBM classifier and the baseline_lgbm function to make predictions.

    Args:
    - df_train (pd.DataFrame): Training data with columns 'purchase', 'supplier', 'reg_code', and 'target'.
    - df_test (pd.DataFrame): Test data with columns 'purchase', 'supplier', 'reg_code'.
    - df_submission (pd.DataFrame): Dataframe with the first 500 suppliers for which to recommend items.
    - df (pd.DataFrame): Dataframe with columns 'purchase', 'supplier', 'reg_code'.

    Returns:
    - metrics (pd.DataFrame): Mean of the ROC AUC score for each supplier in 
    df_submission after running the baseline_lgbm function.
    """
    metrics = pd.DataFrame()

    for sup in tqdm(df_submission.index[:500]):

        # подготавливаем данные к обучению
        df_sup_train, df_sup_test, df_sup_true = edit_data(df_train, df_test, df, sup)

        # добавляем метки для обучения алгоритма классификации 
        df_sup_train['target'] = df_sup_train.index.isin(df_sup_true).astype(int)
        df_sup_test['target'] = df_sup_test.index.isin(df_sup_true).astype(int)

        metrics = pd.concat([metrics, baseline_lgbm(df_sup_train, df_sup_test)])

    return metrics.mean(axis=0)

In [123]:
def alg_nearest(df_train: pd.DataFrame, df_test: pd.DataFrame, 
                df_submission_less_4: pd.DataFrame, df: pd.DataFrame) -> float:
    """
    Computes the average recall score for the top 500 rows of df_submission_less_4 
    using the nearest neighbor algorithm.

    Args:
    - df_train: training data DataFrame
    - df_test: testing data DataFrame
    - df_submission_less_4: submission data DataFrame with purchase information
    - df: DataFrame containing information about the suppliers

    Returns:
    - float: the average recall score for the top 500 rows of df_submission_less_4
    """

    recall_less_4 = []

    for sup in tqdm(df_submission_less_4.index[:500]):
        # подготавливаем данные к обучению
        df_sup_train, df_sup_test, df_sup_true = edit_data(df_train, df_test, df, sup)

        # выделяем векторы участия поставщика на train
        vectors = df_sup_train[df_sup_train.index.isin(df_sup_true)]

        # определяем векторы наиболее близки для поставщика на test
        idx = find_closest_vectors(vectors, df_sup_test)
        y_pred = set(df_sup_test.iloc[idx].index)

        recall = len(y_pred & df_submission_less_4.loc[sup]['purchases']) \
                 / len(df_submission_less_4.loc[sup]['purchases'])

        recall_less_4.append(recall)

    return (np.mean(recall_less_4))

In [87]:
rec_algorithm(df_train, df_test, df_submission, df)

  0%|          | 0/500 [00:00<?, ?it/s]

  return metrics.mean(axis=0)


Accuracy     0.956422
ROC_AUC      0.841957
Precision    0.241158
Recall       0.266748
f1           0.219859
Logloss      0.196172
dtype: float64

In [103]:
alg_nearest(df_train, df_test, df_submission_less_4, df)

  0%|          | 0/500 [00:00<?, ?it/s]

0.3411196863246043