In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

import yaml
import joblib

In [2]:
def extract_purchase(string):
    return list(map(int, string[1:-1].replace(',', ' ').split()))

In [3]:
config_path = "../config/params.yaml"
config = yaml.load(open(config_path), Loader=yaml.FullLoader)

preproc = config["preprocessing"]
train = config['train']
evaluate = config['evaluate']

# Evaluate

In [4]:
df_train = pd.read_csv(preproc['train_data'])
df_train = df_train.set_index('index')
df_train[:5]

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized_tokens,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,248042,0,60706.6,3513,15708,0,[ 1.59887652e-02 2.36658968e-02 -4.08423709e-...,11,21.2_33,6,0.0,3
1,569110,1,42138.0,6570,15708,1,[ 0.07180544 0.01209602 0.00167402 0.044857...,11,20.5_53,3,0.0,3
2,254021,1,65000.1,3377,15708,0,[ 0.02361675 0.01336928 -0.00032915 -0.005782...,5,21.2_33,6,0.0,3
3,447931,1,341584.8,9732,15708,0,[ 3.30867594e-02 4.09576579e-02 -5.67706811e-...,3,21.2_77,5,0.0,3
4,395694,1,133452.0,9589,15708,0,[ 0.10526875 0.11765645 -0.01359718 -0.009087...,9,21.2_77,2,0.0,3


In [5]:
df_evaluate = pd.read_csv(evaluate['evaluate_data'])
df_evaluate = df_evaluate.set_index('index')
df_evaluate[:5]

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized_tokens,region_code,okpd2_code,min_publish_date
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4879721,63456,0,290000.0,6151,4841,1,[ 1.48227703e-02 2.26276631e-02 1.17342828e-...,45,10.8,2022-07-28
3853006,41232,0,160044.4,3932,560,1,[ 5.70505795e-02 -1.79046954e-04 2.02757507e-...,34,19.2,2021-08-23
4368109,120554,0,145530.0,9076,43003,0,[ 8.90472124e-02 2.44880769e-01 -6.71891250e-...,92,drug,2021-06-03
3229373,594728,1,369175.6,9582,3796,1,[ 1.12817446e-01 1.13674459e-01 -2.37492026e-...,77,10.8,2021-01-27
122872,494970,1,1166720.0,9820,94223,0,[ 0.01410401 0.00283409 0.0025544 0.010392...,77,47.7,2021-09-13


In [6]:
df_submission = pd.read_csv(evaluate['submission'])
df_submission = df_submission.set_index('index')
df_submission[:5]

Unnamed: 0_level_0,purchases
index,Unnamed: 1_level_1
1,"[148757, 151442, 1729, 2435]"
2,"[20, 84, 5, 40, 141478]"
3,"[157339, 146909, 439, 535081, 145879, 163157, ..."
4,"[145658, 2500, 119, 167738, 147017, 535911, 63..."
5,"[151376, 149822, 2044, 2211, 165407, 145868, 1..."


In [7]:
df_submission = df_submission['purchases'].apply(extract_purchase)

In [8]:
def extract_words(string):
    return list(map(float, string[1:-1].split()))


def get_month(df_test):
    return df_test['min_publish_date'].apply(lambda x: int(x.split('-')[1]))


def add_reg_code(df_test):
    return df_test['okpd2_code'].astype('str') + '_' \
                 + df_test['region_code'].astype('str')


def add_purchase_size(df_test):
    return df_test.merge(df_test.groupby('purchase')['supplier'] \
                                .size().to_frame('purchase_size'), 
                         on='purchase', how='outer')

    
def add_flag(df_test, df_train):
    return df_test.merge(df_train[['supplier', 'customer', 'flag_won']] \
                                 .groupby(['supplier', 'customer']).tail(1), 
                         on=['supplier', 'customer'], how='left').fillna(0)


def add_unique_okpd2(df_test, df_train):
    return df_test.merge(df_train[['supplier', 'n_unique_okpd2']] \
                                 .groupby('supplier').tail(1), 
                         on=['supplier'], how='left').fillna(1)


def generate_features(df_test, df_train):
    df_test['month'] = get_month(df_test)
    df_test['reg_code'] = add_reg_code(df_test)
    df_test = add_purchase_size(df_test)
    df_test = add_flag(df_test, df_train)
    df_test = add_unique_okpd2(df_test, df_train)
    
    return df_test 

In [9]:
def pipeline_preproc(df_test, df_train):
    
    df_train['vectorized_tokens'] = df_train['vectorized_tokens'].apply(extract_words)
    df_test['vectorized_tokens'] = df_test['vectorized_tokens'].apply(extract_words)

    df_test = generate_features(df_test, df_train)

    df_test = df_test.drop(columns=preproc['drop_columns'])

    df_train = df_train.astype(preproc['change_type_columns'])
    df_test = df_test.astype(preproc['change_type_columns'])

    return df_test, df_train

In [10]:
df_evaluate, df_train = pipeline_preproc(df_evaluate, df_train)

In [11]:
df_evaluate[:5]

Unnamed: 0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized_tokens,month,reg_code,purchase_size,flag_won,n_unique_okpd2
0,63456,0,290000.0,6151,4841,1,"[0.0148227703, 0.0226276631, 0.00117342828, 0....",7,10.8_45,1,0.0,2
1,41232,0,160044.4,3932,560,1,"[0.0570505795, -0.000179046954, 0.0202757507, ...",8,19.2_34,1,1.0,2
2,120554,0,145530.0,9076,43003,0,"[0.0890472124, 0.244880769, -0.067189125, 0.01...",6,drug_92,2,0.0,4
3,120554,0,145530.0,9076,3109,1,"[0.0890472124, 0.244880769, -0.067189125, 0.01...",6,drug_92,2,1.0,4
4,594728,1,369175.6,9582,3796,1,"[0.112817446, 0.113674459, -0.0237492026, 0.00...",1,10.8_77,1,1.0,5


## Recommender

In [13]:
def transform_vector(df_train, df_test):
    for i in range(100):
        df_train[str(i)] = df_train['vectorized_tokens'].apply(lambda x: x[i])
        df_test[str(i)] = df_test['vectorized_tokens'].apply(lambda x: x[i])
        
    return df_train, df_test

def supplier_data(df_train, df_test, df_submission, sup):
    unique_reg_okpd = df_train[df_train['supplier'] == sup]['reg_code'].unique()
    
    # фильтруем train и test на основе уникальных reg_code поставщиков 
    df_sup_train = df_train[df_train['reg_code'].isin(unique_reg_okpd)]
    df_sup_test = df_test[df_test['reg_code'].isin(unique_reg_okpd)]

    
    if df_sup_test.empty:
        df_sup_test = df_test
    
    # удаляем ненужные для системы рекомендаций стобцы и дубликаты
    df_sup_train = df_sup_train.drop(columns=train['drop_columns_recommender']) \
                               .drop_duplicates()
    df_sup_test = df_sup_test.drop(columns=train['drop_columns_recommender']) \
                             .drop_duplicates()
    

    df_sup_test = df_sup_test.set_index('purchase')
    df_sup_train = df_sup_train.set_index('purchase')
    
    # удаляем закупки, которые есть и test, и в train
    df_sup_train = df_sup_train.drop(set(df_submission[sup]).intersection(df_sup_train.index))
    df_sup_test = df_sup_test[~df_sup_test.index.isin(df_sup_train.index)]
    
    
    return df_sup_train, df_sup_test  

def get_supplier_recommends(df_train, df_evaluate, model, df_submission, sup):
    
    df_sup_evaluate = supplier_data(df_train, df_evaluate, df_submission, sup)[1]
    
    y_pred = model.predict(df_sup_evaluate)
    
    return df_sup_evaluate[y_pred==1].index.tolist()

def compute_recommendations(df_train, df_evaluate, df_submission, models, index):
    
    recommendations = pd.Series()
    
    df_train, df_evaluate = transform_vector(df_train, df_evaluate)
    
    for sup in tqdm(df_submission.index[:index]):
        recommendations[str(sup)] = get_supplier_recommends(df_train, 
                                                            df_evaluate, 
                                                            models[sup], 
                                                            df_submission, 
                                                            sup)
    return recommendations

In [14]:
models = joblib.load(train['recommender_models'])

In [15]:
recommendations = compute_recommendations(df_train.copy(), 
                                          df_evaluate.copy(), 
                                          df_submission, 
                                          models, 500)

  recommendations = pd.Series()
  df_train[str(i)] = df_train['vectorized_tokens'].apply(lambda x: x[i])
  df_test[str(i)] = df_test['vectorized_tokens'].apply(lambda x: x[i])
  df_train[str(i)] = df_train['vectorized_tokens'].apply(lambda x: x[i])
  df_test[str(i)] = df_test['vectorized_tokens'].apply(lambda x: x[i])
  df_train[str(i)] = df_train['vectorized_tokens'].apply(lambda x: x[i])
  df_test[str(i)] = df_test['vectorized_tokens'].apply(lambda x: x[i])


  0%|          | 0/500 [00:00<?, ?it/s]

## Probability of winning

In [20]:
df_evaluate[:5]

Unnamed: 0,purchase,forsmallbiz,price,customer,supplier,vectorized_tokens,month,reg_code,purchase_size,flag_won,n_unique_okpd2
0,63456,0,290000.0,6151,4841,"[0.0148227703, 0.0226276631, 0.00117342828, 0....",7,10.8_45,1,0.0,2
1,41232,0,160044.4,3932,560,"[0.0570505795, -0.000179046954, 0.0202757507, ...",8,19.2_34,1,1.0,2
2,120554,0,145530.0,9076,43003,"[0.0890472124, 0.244880769, -0.067189125, 0.01...",6,drug_92,2,0.0,4
3,120554,0,145530.0,9076,3109,"[0.0890472124, 0.244880769, -0.067189125, 0.01...",6,drug_92,2,1.0,4
4,594728,1,369175.6,9582,3796,"[0.112817446, 0.113674459, -0.0237492026, 0.00...",1,10.8_77,1,1.0,5


In [29]:
def remove_columns(df_test):
    return df_test.drop(columns=train['drop_columns_winner'])

def get_meta_features(x_test, catboost, xgboost):
    
    y_score = catboost.predict_proba(x_test)
    predictions_test = y_score[:,1]
    
    y_score = xgboost.predict_proba(x_test)
    predictions_test = np.vstack((predictions_test, y_score[:, 1]))
    
    return predictions_test.T

def get_prob(meta_features, model):
    return model.predict_proba(meta_features)[:, 1]

def compute_winner(df_evaluate, models):
    
    df_evaluate = remove_columns(df_evaluate)
    
    meta_features = get_meta_features(df_evaluate, 
                                      models['catboost'], 
                                      models['xgboost'])
    
    return get_prob(meta_features, models['Naive_bayes'])

In [24]:
models = joblib.load(train['ensemble_models'])

In [30]:
compute_winner(df_evaluate, models)

array([0.99998801, 0.99999997, 0.06658789, ..., 0.99978228, 0.99559285,
       0.99999997])