In [1]:
import gc
import json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm_notebook
from glob import glob

from utils import loadpkl, to_json, scalingPredictions, getBestMultiple, read_pickles

%matplotlib inline



In [2]:
# search for the best per-class blending weights between 2 sets of predictions
def getBestWeights(act, pred_lgbm, pred_xgb, n_classes=12):
    """Greedy per-class search for LightGBM/XGBoost blending weights.

    Every class column starts as an equal (0.5 / 0.5) blend of the two models.
    Classes are then visited in order; for each one the LightGBM weight ``w``
    (XGBoost receives ``1 - w``) is scanned over ``np.arange(0.0, 1.1, 0.1)``.
    A weight is adopted only if it pushes the weighted F1 score strictly above
    the best score seen so far; otherwise the class keeps the 0.5 blend.  The
    adopted weight is applied to the running blend before the next class is
    searched.

    Parameters
    ----------
    act : array-like
        True labels, passed to ``f1_score`` as ``y_true``.
    pred_lgbm : pandas.DataFrame
        Must contain the columns ``pred_lgbm_plans0`` ...
        ``pred_lgbm_plans{n_classes-1}``.
    pred_xgb : pandas.DataFrame
        Must contain the columns ``pred_xgb_plans0`` ...
        ``pred_xgb_plans{n_classes-1}``.
    n_classes : int, default 12
        Number of class columns to blend.

    Returns
    -------
    list of float
        The LightGBM weight chosen for each class, in class order.

    Side effects
    ------------
    Prints the score of every (class, weight) pair and saves one figure per
    class to ``../imp/multiple{i}.png``.
    """
    search_range = np.arange(0.0, 1.1, 0.1)
    default_w = 0.5
    best_weights = []

    # base prediction: equal blend of both models for every class
    cols_pred = ['pred_{}'.format(i) for i in range(n_classes)]
    _pred = pd.DataFrame({
        'pred_{}'.format(i): (default_w*pred_lgbm['pred_lgbm_plans{}'.format(i)]
                              + (1.0-default_w)*pred_xgb['pred_xgb_plans{}'.format(i)])
        for i in range(n_classes)
    })

    # base score
    best_f1 = f1_score(act, np.argmax(_pred[cols_pred].values,axis=1),average='weighted')

    # get best weights for each class
    for i in range(n_classes):
        col = 'pred_{}'.format(i)
        lgbm_i = pred_lgbm['pred_lgbm_plans{}'.format(i)]
        xgb_i = pred_xgb['pred_xgb_plans{}'.format(i)]

        # Reset per class: if nothing beats the current best score, the class
        # keeps the base blend instead of inheriting the previous class's
        # weight (or crashing on an unbound name for the first class).
        best_w = default_w
        f1s = []

        # Explicit copy so the trial column never writes through to `_pred`
        # and pandas does not emit SettingWithCopyWarning.
        tmp_pred = _pred[cols_pred].copy()
        for _w in search_range:
            tmp_pred[col] = _w*lgbm_i + (1.0-_w)*xgb_i

            # calc f1 score
            _f1 = f1_score(act, np.argmax(tmp_pred.values,axis=1),average='weighted')
            f1s.append(_f1)
            print('class: {}, w: {}, f1 score: {}'.format(i,_w,_f1))
            if _f1 > best_f1:
                best_f1 = _f1
                best_w = _w

        del tmp_pred

        # save weights & predicted values
        best_weights.append(best_w)
        _pred[col] = best_w*lgbm_i + (1.0-best_w)*xgb_i

        # plot score against weight for this class
        plt.figure()
        plt.plot(search_range, f1s)
        plt.savefig('../imp/multiple{}.png'.format(i))

    return best_weights

In [3]:
# Load the three saved LightGBM prediction frames in one pass.
pred_lgbm1, pred_lgbm2, pred_lgbm3 = map(
    loadpkl,
    (
        '../features/lgbm_pred_1.pkl',
        '../features/lgbm_pred_2.pkl',
        '../features/lgbm_pred_3.pkl',
    ),
)

In [7]:
# Read the three LightGBM submission files that were written earlier.
sub1, sub2, sub3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../output/submission_lgbm_1.csv',
        '../output/submission_lgbm_2.csv',
        '../output/submission_lgbm_3.csv',
    )
)

In [4]:
# Load the plans feature table through the project helper `read_pickles`.
# NOTE(review): presumably this concatenates the pickle shards stored under
# ../features/plans (the recorded run shows 5 progress steps, ~1 minute) —
# confirm against utils.read_pickles.
plans = read_pickles('../features/plans')

100%|██████████| 5/5 [01:01<00:00, 10.58s/it]


In [5]:
# Trim `plans` in place down to the seven per-plan transport-mode columns
# plus the 'sid' and 'plan_num_plans' columns.
cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(7)]
cols_keep = set(cols_transport_mode) | {'sid', 'plan_num_plans'}
cols_drop = [c for c in plans.columns if c not in cols_keep]
plans.drop(columns=cols_drop, inplace=True)

In [7]:
# Write the submission frame to CSV without the index column.
# NOTE(review): `sub` is not assigned in any cell visible here — confirm where
# it is built, otherwise this cell fails on a fresh kernel (Restart & Run All).
sub.to_csv('../output/submission_split.csv',index=False)

In [3]:
# Reload the first LightGBM prediction frame from disk.
# NOTE(review): the same file is already loaded into `pred_lgbm1` by an earlier
# cell; this reload discards any columns added to the frame in between —
# confirm whether the cell is still needed.
pred_lgbm1 = loadpkl('../features/lgbm_pred_1.pkl')

In [24]:
# Rebind `plans` to just the transport-mode columns followed by the two
# key columns, in exactly that order.
cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(7)]
cols_keep = cols_transport_mode + ['sid', 'plan_num_plans']
plans = plans.loc[:, cols_keep]

In [9]:
# Absolute count of each recommended mode in the first submission file.
sub1.recommend_mode.value_counts()

2     34083
7     22351
1     17719
5     10906
10     3403
9      1855
3       708
0       454
11      410
6       338
8       264
4        80
Name: recommend_mode, dtype: int64

In [19]:
# For every prediction frame, store the index of the highest-scoring class
# column (0..11) as the recommended mode.
cols_pred_lgbm = ['pred_lgbm_plans{}'.format(i) for i in range(12)]
for _frame in (pred_lgbm1, pred_lgbm2, pred_lgbm3):
    _frame['recommend_mode'] = _frame[cols_pred_lgbm].values.argmax(axis=1)

In [21]:
# Rows without a `click_mode` label form the submission part of each frame ...
sub_pred1, sub_pred2, sub_pred3 = (
    frame[frame['click_mode'].isnull()]
    for frame in (pred_lgbm1, pred_lgbm2, pred_lgbm3)
)

# ... and rows that do carry a label form the out-of-fold part.
oof_pred1, oof_pred2, oof_pred3 = (
    frame[frame['click_mode'].notnull()]
    for frame in (pred_lgbm1, pred_lgbm2, pred_lgbm3)
)

In [None]:
# FIXME: unfinished cell — the statement has no colon or body (SyntaxError) and
# `range()` is called without its required argument. Complete or delete this
# cell so the notebook survives Restart & Run All.
for i in range()

In [29]:
# Share of each predicted mode among the unlabeled rows of the first model.
sub_pred1.recommend_mode.value_counts(normalize=True)

2     0.368182
7     0.241447
1     0.191410
5     0.117812
10    0.036761
9     0.020039
3     0.007648
0     0.004904
11    0.004429
6     0.003651
8     0.002852
4     0.000864
Name: recommend_mode, dtype: float64

In [25]:
# Share of each actual clicked mode among the labeled rows of the first frame,
# for comparison with the predicted-mode shares of the unlabeled rows.
oof_pred1.click_mode.value_counts(normalize=True)

2.0     0.277955
7.0     0.159268
1.0     0.143302
9.0     0.099508
5.0     0.096690
0.0     0.076810
3.0     0.050149
10.0    0.030306
4.0     0.025671
6.0     0.024158
11.0    0.012400
8.0     0.003782
Name: click_mode, dtype: float64

In [24]:
# Share of each predicted mode among the unlabeled rows of the second model.
sub_pred2.recommend_mode.value_counts(normalize=True)

2     0.251117
1     0.178749
7     0.163378
9     0.152836
5     0.125324
0     0.076117
11    0.025545
10    0.019688
6     0.004334
8     0.001464
3     0.001355
4     0.000092
Name: recommend_mode, dtype: float64

In [27]:
# Share of each actual clicked mode among the labeled rows of the second frame,
# for comparison with the predicted-mode shares of the unlabeled rows.
oof_pred2.click_mode.value_counts(normalize=True)

2.0     0.209000
0.0     0.192722
1.0     0.153104
7.0     0.136463
9.0     0.097632
5.0     0.096585
3.0     0.035083
10.0    0.023180
4.0     0.018289
6.0     0.017722
11.0    0.017035
8.0     0.003185
Name: click_mode, dtype: float64