## Load data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [2]:
train = pd.read_csv('onetwotrip_challenge_train.csv', index_col=False)
train.shape

(196056, 43)

In [3]:
test = pd.read_csv('onetwotrip_challenge_test.csv', index_col=False)
test.shape

(455011, 37)

In [4]:
target = train.goal1
train = train[test.columns]

In [5]:
full = pd.concat([train, test])
full.shape

(651067, 37)

## Add my features

In [6]:
def months_delation(raw):
    delation = raw.field3 - raw.field2
    if delation < 0:
        return 12 + delation
    return delation

In [7]:
def new_year(raw):
    delation = raw.field3 - raw.field2
    if delation < 0:
        return 1
    return 0

In [8]:
train['months_delation'] = train.apply(months_delation, axis=1)
train['new_year'] = train.apply(new_year, axis=1)

In [9]:
test['months_delation'] = test.apply(months_delation, axis=1)
test['new_year'] = test.apply(new_year, axis=1)

## Add EDA features

In [11]:
unique14 = np.unique(full['field14'])
new_unique14 = np.round(unique14 / 0.14006639 - 0.27860731).astype(int)
dict14 = {unique14[i]: new_unique14[i] for i in range(len(unique14))}
train['field14_new'] = train['field14'].apply(lambda x: dict14[x])
test['field14_new'] = test['field14'].apply(lambda x: dict14[x])

unique1 = np.unique(full['field1'])
new_unique1 = np.round(unique1  / 0.077571 + 0.0765905).astype(int)
dict1 = {unique1[i]: new_unique1[i] for i in range(len(unique1))}
train['field1_new'] = train['field1'].apply(lambda x: dict1[x])
test['field1_new'] = test['field1'].apply(lambda x: dict1[x])

In [12]:
original_columns = train.columns

## Add user statistics

In [32]:
from tqdm import tqdm

In [18]:
train_users_info = train.drop('orderid', axis=1).groupby('userid').agg(['sum', 'mean', 'max', 'min'])

In [33]:
train_users_info_dict = {}

for userid in tqdm(list(train.userid.unique())):
    train_users_info_dict[userid] = {}
    for (n1, n2), value in zip(train_users_info.loc[userid].index, train_users_info.loc[userid]):
        train_users_info_dict[userid][f'user_{n1}_{n2}'] = value

100%|██████████| 113902/113902 [03:13<00:00, 588.61it/s]


In [36]:
test_users_info = test.drop('orderid', axis=1).groupby('userid').agg(['sum', 'mean', 'max', 'min'])

In [37]:
test_users_info_dict = {}

for userid in tqdm(list(test.userid.unique())):
    test_users_info_dict[userid] = {}
    for (n1, n2), value in zip(test_users_info.loc[userid].index, test_users_info.loc[userid]):
        test_users_info_dict[userid][f'user_{n1}_{n2}'] = value

100%|██████████| 264547/264547 [07:02<00:00, 626.63it/s]


In [44]:
new_columns = list(train_users_info_dict['10d654494cbe97bbb25d51ead2600679aff9e097924add09d8066010a0c9adaf'].keys())

In [50]:
for c in tqdm(new_columns):
    train[c] = train.userid.apply(lambda x: train_users_info_dict[x][c])

100%|██████████| 156/156 [00:52<00:00,  3.08it/s]


In [51]:
for c in tqdm(new_columns):
    test[c] = test.userid.apply(lambda x: test_users_info_dict[x][c])

100%|██████████| 156/156 [02:02<00:00,  1.34it/s]


In [52]:
train.shape, test.shape

((196056, 197), (455011, 197))

In [54]:
train.to_csv('train_with_users.csv', index=False)
test.to_csv('test_with_users.csv', index=False)

## New feauture: количество заказов

In [55]:
train_userid = train.userid.value_counts()
test_userid = test.userid.value_counts()

In [56]:
train['userid_orders'] = train.userid.apply(lambda x: train_userid[x])
test['userid_orders'] = test.userid.apply(lambda x: test_userid[x])

In [59]:
original_columns = list(original_columns)
original_columns += ['userid_orders']

## Add statistics features

In [61]:
train.head()

Unnamed: 0,orderid,userid,field0,field1,field2,field3,field4,field5,field6,field7,...,user_new_year_min,user_field14_new_sum,user_field14_new_mean,user_field14_new_max,user_field14_new_min,user_field1_new_sum,user_field1_new_mean,user_field1_new_max,user_field1_new_min,userid_orders
0,0,10d654494cbe97bbb25d51ead2600679aff9e097924add...,0,-0.626508,11,12,1,1,0,1,...,0.0,-5.0,-5.0,-5.0,-5.0,-8.0,-8.0,-8.0,-8.0,1
1,1,4aafc0391f72bbcf60537aece62923baf9ce644b64ac36...,144,-0.393794,5,7,2,0,0,2,...,0.0,-3.0,-1.5,-1.0,-2.0,-10.0,-5.0,-5.0,-5.0,2
2,2,bac8ffef46348f587c8d17137ab01fb24aef21547c647d...,134,-0.548937,2,3,2,0,0,1,...,0.0,-10.0,-5.0,-5.0,-5.0,-15.0,-7.5,-7.0,-8.0,2
3,3,0392247b4b87674aba2c32bf2292b105771a6a376871be...,0,-0.238651,10,11,1,1,3,2,...,0.0,-10.0,-3.333333,-3.0,-4.0,-14.0,-4.666667,-3.0,-6.0,3
4,4,d1aeefef311bbeb4bd84876c8d49421f276674527d5578...,0,-0.704079,8,11,1,1,0,1,...,0.0,-6.0,-6.0,-6.0,-6.0,-9.0,-9.0,-9.0,-9.0,1


In [62]:
description = train[original_columns].describe()

In [63]:
def calculate_smth(original, desc, name):
    features_names = [
        'max',
        'min',
        'mean_std',
        '25',
        '50',
        '75'
    ]
    features = pd.concat([
        desc['max'] - original,
        original - desc['min'],
        original - desc['mean'],
        (original - desc['mean']) / desc['std'],
        (original > desc['25%']).astype(np.float32),
        (original > desc['50%']).astype(np.float32),
        (original > desc['75%']).astype(np.float32),
    ], axis=1, keys=[f'{name}_{features_name}' for features_name in features_names])
    return features

In [64]:
for column in tqdm(description.columns):
    if column == 'orderid':
        continue
    features = calculate_smth(train[column], description[column], column)
    train = pd.concat([train, features], axis=1)

100%|██████████| 41/41 [00:47<00:00,  1.44s/it]


In [65]:
for column in tqdm(description.columns):
    if column == 'orderid':
        continue
    features = calculate_smth(test[column], description[column], column)
    test = pd.concat([test, features], axis=1)

100%|██████████| 41/41 [01:55<00:00,  3.35s/it]


In [66]:
train.shape

(196056, 438)

In [67]:
test.shape

(455011, 438)

In [68]:
train.to_csv('train_fullest.csv', index=False)
test.to_csv('test_fullest.csv', index=False)

## Lgb cv

In [69]:
import lightgbm as lgb
import numpy as np

In [70]:
parameters = {
    'n_estimators': 800,
    'num_leaves':6,
    'learning_rate': 0.1,
    'objective': 'binary',
    'metric': 'auc'
}

In [71]:
w = lgb.cv(parameters, 
           lgb.Dataset(train.drop(['userid', 'orderid'], axis=1), label=target),
           stratified=False,
           num_boost_round=800,
           nfold=4,
           verbose_eval=100)



[100]	cv_agg's auc: 0.693788 + 0.00905307
[200]	cv_agg's auc: 0.698901 + 0.00955371
[300]	cv_agg's auc: 0.700666 + 0.010541
[400]	cv_agg's auc: 0.701058 + 0.0115304
[500]	cv_agg's auc: 0.700743 + 0.0110684
[600]	cv_agg's auc: 0.700132 + 0.0111438
[700]	cv_agg's auc: 0.698685 + 0.0113808
[800]	cv_agg's auc: 0.697757 + 0.010284


In [72]:
# parameters = {
#     'n_estimators': 1000,
#     'num_leaves':6,
#     'learning_rate': 0.1,
#     'objective': 'binary',
#     'metric': 'auc'
# }

best_n_estimators = np.argmax(w['auc-mean'])
best_n_estimators, w['auc-mean'][best_n_estimators]

(333, 0.7015319570036302)

## Predict

In [81]:
classifier = LGBMClassifier(n_estimators=best_n_estimators, num_leaves=6)
classifier.fit(train.drop(['orderid', 'userid'], axis=1), target)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=333, n_jobs=-1, num_leaves=6, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [82]:
prediction = classifier.predict_proba(test.drop(['orderid', 'userid'], axis=1))
test['proba'] = prediction[:, 1]
baseline = test[['orderid', 'proba']]
baseline.to_csv('before_feature_selection_user_features.csv', index=False)

## Feature selection

In [73]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

In [74]:
classifier = LGBMClassifier(n_estimators=best_n_estimators, num_leaves=6)

embeded_lgb_selector = SelectFromModel(classifier, max_features=150)
embeded_lgb_selector.fit(train.drop(['userid', 'orderid'], axis=1), target)

SelectFromModel(estimator=LGBMClassifier(boosting_type='gbdt',
                                         class_weight=None,
                                         colsample_bytree=1.0,
                                         importance_type='split',
                                         learning_rate=0.1, max_depth=-1,
                                         min_child_samples=20,
                                         min_child_weight=0.001,
                                         min_split_gain=0.0, n_estimators=333,
                                         n_jobs=-1, num_leaves=6,
                                         objective=None, random_state=None,
                                         reg_alpha=0.0, reg_lambda=0.0,
                                         silent=True, subsample=1.0,
                                         subsample_for_bin=200000,
                                         subsample_freq=0),
                max_features=150, norm_order=1, prefit=Fa

In [75]:
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = train.drop(['userid', 'orderid'], axis=1).loc[:,embeded_lgb_support].columns.tolist()

In [76]:
print(str(len(embeded_lgb_feature)), 'selected features')

122 selected features


In [78]:
embeded_lgb_feature

['field0',
 'field1',
 'field2',
 'field3',
 'field4',
 'field6',
 'field10',
 'field11',
 'field12',
 'field13',
 'field14',
 'field16',
 'field17',
 'field18',
 'field21',
 'field22',
 'field23',
 'field27',
 'field28',
 'indicator_goal22',
 'indicator_goal24',
 'user_field0_sum',
 'user_field0_mean',
 'user_field0_max',
 'user_field1_sum',
 'user_field1_mean',
 'user_field1_max',
 'user_field1_min',
 'user_field2_sum',
 'user_field2_mean',
 'user_field2_max',
 'user_field3_sum',
 'user_field3_mean',
 'user_field3_max',
 'user_field3_min',
 'user_field5_mean',
 'user_field6_sum',
 'user_field6_mean',
 'user_field6_max',
 'user_field6_min',
 'user_field7_mean',
 'user_field8_mean',
 'user_field9_sum',
 'user_field9_mean',
 'user_field10_sum',
 'user_field10_mean',
 'user_field11_sum',
 'user_field11_mean',
 'user_field11_max',
 'user_field11_min',
 'user_field12_sum',
 'user_field12_mean',
 'user_field12_max',
 'user_field12_min',
 'user_field13_sum',
 'user_field13_mean',
 'user_fiel

## Lgb cv on selected features

In [83]:
parameters = {
    'n_estimators': 800,
    'num_leaves':6,
    'learning_rate': 0.1,
    'objective': 'binary',
    'metric': 'auc'
}

In [84]:
w_selected = lgb.cv(parameters, 
           lgb.Dataset(train[embeded_lgb_feature], label=target),
           stratified=False,
           num_boost_round=800,
           nfold=4,
           verbose_eval=100)

[100]	cv_agg's auc: 0.694517 + 0.00966461
[200]	cv_agg's auc: 0.698491 + 0.00937874
[300]	cv_agg's auc: 0.700318 + 0.00933857
[400]	cv_agg's auc: 0.70023 + 0.0107522
[500]	cv_agg's auc: 0.700115 + 0.0107216
[600]	cv_agg's auc: 0.698535 + 0.0108983
[700]	cv_agg's auc: 0.697864 + 0.0103237
[800]	cv_agg's auc: 0.695648 + 0.0101573


In [85]:
best_n_estimators_selected = np.argmax(w_selected['auc-mean'])
best_n_estimators_selected, w_selected['auc-mean'][best_n_estimators_selected]

(362, 0.7005269315091297)

## Predict

In [86]:
classifier = LGBMClassifier(n_estimators=best_n_estimators_selected, num_leaves=6)
classifier.fit(train[embeded_lgb_feature], target)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=362, n_jobs=-1, num_leaves=6, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [87]:
prediction = classifier.predict_proba(test[embeded_lgb_feature])
test['proba'] = prediction[:, 1]
baseline = test[['orderid', 'proba']]
baseline.to_csv('after_feature_selection_user_features.csv', index=False)

In [91]:
test['proba_lgb'] = test['proba']

## RF

In [88]:
from sklearn.ensemble import RandomForestClassifier

In [89]:
rf_classifier = RandomForestClassifier(1000)
rf_classifier.fit(train[embeded_lgb_feature], target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [90]:
prediction = rf_classifier.predict_proba(test[embeded_lgb_feature])
test['proba_rf'] = prediction[:, 1]

## LogReg

In [100]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [104]:
clf = LogisticRegressionCV(cv=3, random_state=0, scoring='roc_auc', verbose=2).fit(train[embeded_lgb_feature], target)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   50.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.5min finished


In [106]:
clf = LogisticRegression().fit(train[embeded_lgb_feature], target)



In [107]:
prediction = clf.predict_proba(test[embeded_lgb_feature])
test['proba_logreg'] = prediction[:, 1]

In [110]:
from sklearn.metrics.pairwise import cosine_similarity

In [112]:
cosine_similarity([test.proba_lgb], [test.proba_rf])

array([[0.78129844]])

In [113]:
cosine_similarity([test.proba_lgb], [test.proba_logreg])

array([[0.77777042]])

In [114]:
cosine_similarity([test.proba_rf], [test.proba_logreg])

array([[0.73088277]])

## Random blend

In [118]:
lgb_coeff = 0.9
rf_coeff = 0.07
logreg_coeff = 0.03

assert lgb_coeff + rf_coeff + logreg_coeff

In [119]:
test['proba'] = lgb_coeff * test['proba_lgb'] + rf_coeff * test['proba_rf'] + logreg_coeff * test['proba_logreg']
baseline = test[['orderid', 'proba']]
baseline.to_csv('after_feature_selection_user_features_rf_logreg2.csv', index=False)