In [8]:
import pandas as pd
import numpy as np

import yaml

In [9]:
config_path = "../config/params.yaml"

# Read the parameters inside a context manager so the file handle is closed
# as soon as parsing finishes instead of lingering for the kernel's lifetime.
# NOTE(review): FullLoader can build more than plain data types; if this file
# could ever come from an untrusted source, prefer yaml.safe_load.
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Per-stage parameter sections used by the rest of the notebook.
preproc = config["preprocessing"]
train = config['train']
evaluate = config['evaluate']

# Evaluate

In [46]:
# Load the training table from the path given in the preprocessing config
# and use its 'index' column as the row index.
df_train = pd.read_csv(preproc['train_data']).set_index('index')

# Preview the first five rows.
df_train.head()

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized_tokens,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3,1,290000.0,2,1,1,[ 0.18815488 0.1963165 0.08348706 0.132998...,2,58.2_2,1,0.0,4
1,2834,1,105000.0,218,1,1,[ 1.88778124e-01 1.99460707e-01 8.44090815e-...,2,62.0_2,1,0.0,4
2,154949,1,98967.5,11235,1,1,[ 0.17555872 0.0838882 0.01939559 0.047119...,2,63.9_2,1,0.0,4
3,147950,0,77460.03,11061,1,1,[ 2.09548737e-01 1.98348963e-01 3.00821184e-...,3,62.0_2,1,0.0,4
4,165990,0,138000.0,11558,1,1,[ 1.55703006e-01 1.47389050e-01 4.36386056e-...,3,62.0_2,1,0.0,4


In [47]:
# Load the evaluation table from the path given in the evaluate config
# and use its 'index' column as the row index.
df_test = pd.read_csv(evaluate['test_evaluate']).set_index('index')

# Preview the first five rows.
df_test.head()

Unnamed: 0_level_0,purchase,region_code,min_publish_date,forsmallbiz,price,customer,okpd2_code,supplier,vectorized_tokens
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3730,1064,2,2022-11-15,1,181720.0,70,58.2,1,[ 1.77900028e-01 8.02768195e-02 1.25833983e-...
5492,1729,2,2022-12-06,1,167448.0,105,62.0,1,[ 2.70711475e-01 8.42926477e-02 4.42017172e-...
7221,2430,2,2022-12-08,1,200248.16,201,62.0,1,[ 2.59523023e-01 1.20019309e-01 7.98608701e-...
462973,156898,2,2022-12-12,0,190740.0,11266,62.0,1,[ 0.20131386 0.07292083 -0.01833528 -0.017387...
638676,412017,77,2022-11-07,1,2886156.0,9651,27.4,100009,[ 5.40626203e-02 7.06050537e-02 -3.14055514e-...


In [48]:
def extract_words(string):
    """Parse a bracketed, whitespace-separated list of numbers into floats.

    The first and last characters of ``string`` are discarded (they are
    expected to be the enclosing brackets) and the remainder is split on
    whitespace.

    Parameters
    ----------
    string : str
        Text such as ``"[ 0.1 0.2 -3e-1]"``.

    Returns
    -------
    list of float
        One float per whitespace-separated token.

    Raises
    ------
    ValueError
        If a token cannot be converted with ``float``.
    """
    inner = string[1:-1]
    return [float(token) for token in inner.split()]


def get_month(df_test):
    """Return the month number taken from the ``min_publish_date`` column.

    Each value is split on ``'-'`` and the second field is converted to
    ``int``, so dash-separated date strings like ``'2022-11-15'`` yield ``11``.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Frame with a ``min_publish_date`` column of dash-separated strings.

    Returns
    -------
    pandas.Series
        Integer month for every row, aligned with ``df_test``'s index.
    """
    def month_of(date_text):
        # The month is the second dash-separated component.
        fields = date_text.split('-')
        return int(fields[1])

    return df_test['min_publish_date'].apply(month_of)


def add_reg_code(df_test):
    """Build the ``'<okpd2_code>_<region_code>'`` string key for every row.

    Both columns are converted to strings and joined with an underscore.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Frame with ``okpd2_code`` and ``region_code`` columns.

    Returns
    -------
    pandas.Series
        String Series aligned with ``df_test``'s index.
    """
    okpd2_text = df_test['okpd2_code'].astype('str')
    region_text = df_test['region_code'].astype('str')
    return okpd2_text + '_' + region_text


def add_purchase_size(df_test):
    """Attach a ``purchase_size`` column: the number of rows per ``purchase``.

    The per-purchase row counts are computed from ``df_test`` itself and
    joined back on ``purchase`` with an outer merge. The result is a new
    frame; the merge does not carry over ``df_test``'s original index.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Frame with ``purchase`` and ``supplier`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df_test``'s columns plus ``purchase_size``.
    """
    rows_per_purchase = df_test.groupby('purchase')['supplier'].size()
    size_table = rows_per_purchase.to_frame('purchase_size')
    return df_test.merge(size_table, on='purchase', how='outer')

    
def add_flag(df_test, df_train):
    """Attach the latest training ``flag_won`` for each (supplier, customer).

    For every ``(supplier, customer)`` pair in ``df_train`` the last row (in
    ``df_train``'s row order) is taken and left-joined onto ``df_test``.
    Pairs that do not occur in ``df_train`` get ``flag_won = 0``.

    Only ``flag_won`` is filled: missing values in any other column of
    ``df_test`` are left as they are rather than being silently turned
    into 0.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Frame with ``supplier`` and ``customer`` columns.
    df_train : pandas.DataFrame
        Frame with ``supplier``, ``customer`` and ``flag_won`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with ``df_test``'s columns plus ``flag_won``.
    """
    pair_keys = ['supplier', 'customer']
    # One row per pair: the last one seen in the training data.
    latest_flags = df_train[pair_keys + ['flag_won']].groupby(pair_keys).tail(1)
    merged = df_test.merge(latest_flags, on=pair_keys, how='left')
    # Restrict the default to the joined column so unrelated NaNs survive.
    return merged.fillna({'flag_won': 0})


def add_unique_okpd2(df_test, df_train):
    """Attach each supplier's latest training ``n_unique_okpd2`` value.

    For every ``supplier`` in ``df_train`` the last row (in ``df_train``'s
    row order) is taken and left-joined onto ``df_test``. Suppliers that do
    not occur in ``df_train`` get ``n_unique_okpd2 = 1``.

    Only ``n_unique_okpd2`` is filled: missing values in any other column
    of ``df_test`` are left as they are rather than being silently turned
    into 1.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Frame with a ``supplier`` column.
    df_train : pandas.DataFrame
        Frame with ``supplier`` and ``n_unique_okpd2`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with ``df_test``'s columns plus ``n_unique_okpd2``.
    """
    # One row per supplier: the last one seen in the training data.
    latest_counts = df_train[['supplier', 'n_unique_okpd2']].groupby('supplier').tail(1)
    merged = df_test.merge(latest_counts, on=['supplier'], how='left')
    # Restrict the default to the joined column so unrelated NaNs survive.
    return merged.fillna({'n_unique_okpd2': 1})


def generate_features(df_test, df_train):
    """Derive the engineered feature columns for the evaluation frame.

    Adds, in order: ``month`` and ``reg_code`` (computed from ``df_test``
    alone), then ``purchase_size``, ``flag_won`` and ``n_unique_okpd2`` via
    the corresponding helper functions.

    The caller's ``df_test`` is not modified; all new columns are added to
    a copy.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Evaluation frame; must provide the columns the helpers read.
    df_train : pandas.DataFrame
        Training frame used as the lookup source for ``flag_won`` and
        ``n_unique_okpd2``.

    Returns
    -------
    pandas.DataFrame
        New frame with the added feature columns.
    """
    # assign() returns a new frame, so the input is left untouched and the
    # calling cell can be re-run against the same raw data.
    df_test = df_test.assign(
        month=get_month(df_test),
        reg_code=add_reg_code(df_test),
    )
    df_test = add_purchase_size(df_test)
    df_test = add_flag(df_test, df_train)
    df_test = add_unique_okpd2(df_test, df_train)

    return df_test

In [49]:
def pipeline_preproc(df_test, df_train):
    """Run the full preprocessing pipeline on the evaluation and training frames.

    Steps:
      1. Parse the ``vectorized_tokens`` strings of both frames into lists
         of floats.
      2. Generate the engineered features for ``df_test``.
      3. Drop the columns listed under ``preproc['drop_columns']`` from
         ``df_test``.
      4. Cast both frames with the mapping in
         ``preproc['change_type_columns']``.

    Neither input frame is modified; parsing is done on copies.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Raw evaluation frame with ``vectorized_tokens`` stored as strings.
    df_train : pandas.DataFrame
        Training frame with ``vectorized_tokens`` stored as strings.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_test, df_train)`` after preprocessing.
    """
    # assign() builds new frames, so the caller's data keeps its raw string
    # column and an earlier display of it does not go stale.
    df_train = df_train.assign(
        vectorized_tokens=df_train['vectorized_tokens'].apply(extract_words)
    )
    df_test = df_test.assign(
        vectorized_tokens=df_test['vectorized_tokens'].apply(extract_words)
    )

    df_test = generate_features(df_test, df_train)

    df_test = df_test.drop(columns=preproc['drop_columns'])

    df_train = df_train.astype(preproc['change_type_columns'])
    df_test = df_test.astype(preproc['change_type_columns'])

    return df_test, df_train

In [50]:
# Run preprocessing and rebind both names to the processed frames.
# NOTE(review): because the raw frames are replaced under the same names,
# re-running this cell without reloading the CSVs feeds already-processed
# data back into the pipeline.
df_test, df_train = pipeline_preproc(df_test, df_train)

In [51]:
# Preview the first five rows of the processed evaluation frame.
df_test.head()

Unnamed: 0,purchase,forsmallbiz,price,customer,supplier,vectorized_tokens,month,reg_code,purchase_size,flag_won,n_unique_okpd2
0,1064,1,181720.0,70,1,"[0.177900028, 0.0802768195, 0.125833983, 0.188...",11,58.2_2,1,0.0,4.0
1,1729,1,167448.0,105,1,"[0.270711475, 0.0842926477, 0.0442017172, 0.07...",12,62.0_2,1,0.0,4.0
2,2430,1,200248.16,201,1,"[0.259523023, 0.120019309, 0.0798608701, 0.101...",12,62.0_2,1,0.0,4.0
3,156898,0,190740.0,11266,1,"[0.20131386, 0.07292083, -0.01833528, -0.01738...",12,62.0_2,1,0.0,4.0
4,412017,1,2886156.0,9651,100009,"[0.0540626203, 0.0706050537, -0.0314055514, -0...",11,27.4_77,1,0.0,1.0
