In [1]:
import gc
import json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm_notebook
from glob import glob

from utils import loadpkl, to_json, scalingPredictions, getBestMultiple, read_pickles

%matplotlib inline



In [2]:
# search a best weight for 2 predictions
def getBestWeights(act, pred_lgbm, pred_xgb):
    """Greedy per-class search for the LightGBM/XGBoost blending weight.

    Starting from an equal (0.5 / 0.5) blend of every class column, the classes
    0..11 are visited in order. For each class, every weight ``w`` in
    0.0, 0.1, ..., 1.0 is tried as
    ``w * pred_lgbm['pred_lgbm_plans{i}'] + (1 - w) * pred_xgb['pred_xgb_plans{i}']``
    while the other class columns keep their current blend. A weight is kept
    only if it strictly improves the best weighted F1 seen so far (across all
    classes visited up to that point); otherwise the class stays at 0.5.

    Side effects: prints one line per (class, weight) pair and saves one
    F1-vs-weight figure per class to ``../imp/multiple{i}.png``.

    Parameters
    ----------
    act : array-like
        True labels, passed as ``y_true`` to ``f1_score``. They are compared
        with the argmax column position (0..11) of the blended scores.
    pred_lgbm : DataFrame-like
        Must provide columns ``pred_lgbm_plans0`` .. ``pred_lgbm_plans11``.
    pred_xgb : DataFrame-like
        Must provide columns ``pred_xgb_plans0`` .. ``pred_xgb_plans11``.
        Columns are combined with pandas arithmetic, so both inputs are
        expected to share the same index -- TODO confirm with callers.

    Returns
    -------
    list of float
        Twelve weights, one per class; each is the weight on the LightGBM
        column (XGBoost gets ``1 - w``).
    """
    n_classes = 12
    base_weight = 0.5
    search_range = np.arange(0.0, 1.1, 0.1)
    best_weights = []

    cols_pred = ['pred_{}'.format(i) for i in range(n_classes)]
    cols_lgbm = ['pred_lgbm_plans{}'.format(i) for i in range(n_classes)]
    cols_xgb = ['pred_xgb_plans{}'.format(i) for i in range(n_classes)]

    # base prediction: equal blend of both models for every class
    _pred = pd.DataFrame()
    for col, col_lgbm, col_xgb in zip(cols_pred, cols_lgbm, cols_xgb):
        _pred[col] = base_weight*pred_lgbm[col_lgbm] + (1.0-base_weight)*pred_xgb[col_xgb]

    # base score
    best_f1 = f1_score(act, np.argmax(_pred[cols_pred].values, axis=1), average='weighted')

    # get best weights for each classes
    for i in range(n_classes):
        # Default to the weight already baked into _pred for this class. Without
        # this, a class where no candidate strictly beats best_f1 would either
        # raise UnboundLocalError (class 0) or silently reuse the weight chosen
        # for the previous class.
        best_w = base_weight
        f1s = []
        for _w in search_range:
            # explicit copy: the candidate column must not leak into _pred
            tmp_pred = _pred[cols_pred].copy()
            tmp_pred[cols_pred[i]] = _w*pred_lgbm[cols_lgbm[i]] + (1.0-_w)*pred_xgb[cols_xgb[i]]

            # calc f1 score
            _f1 = f1_score(act, np.argmax(tmp_pred.values, axis=1), average='weighted')
            f1s.append(_f1)
            print('class: {}, w: {}, f1 score: {}'.format(i,_w,_f1))
            if _f1 > best_f1:
                best_f1 = _f1
                best_w = _w

        # save weights & predicted values
        best_weights.append(best_w)
        _pred[cols_pred[i]] = best_w*pred_lgbm[cols_lgbm[i]] + (1.0-best_w)*pred_xgb[cols_xgb[i]]

        # plot weighted F1 against the candidate weights for this class
        plt.figure()
        plt.plot(search_range, f1s)
        plt.savefig('../imp/multiple{}.png'.format(i))

    return best_weights

In [3]:
# load predictions: one pickle per LightGBM run, read through the project helper loadpkl
pred_lgbm1, pred_lgbm2, pred_lgbm3 = [
    loadpkl(_path)
    for _path in (
        '../features/lgbm_pred_1.pkl',
        '../features/lgbm_pred_2.pkl',
        '../features/lgbm_pred_3.pkl',
    )
]

In [7]:
# Read the three LightGBM submission CSVs into sub1..sub3.
sub1, sub2, sub3 = [
    pd.read_csv(_path)
    for _path in (
        '../output/submission_lgbm_1.csv',
        '../output/submission_lgbm_2.csv',
        '../output/submission_lgbm_3.csv',
    )
]

In [4]:
# Load the plans features through the project helper read_pickles (defined in utils, not shown here).
# The cells below use the result as a DataFrame with 'sid', 'plan_num_plans' and
# 'plan_{i}_transport_mode' columns.
plans = read_pickles('../features/plans')

100%|██████████| 5/5 [01:01<00:00, 10.58s/it]


In [5]:
# Keep only the plan_{i}_transport_mode columns (i = 0..6) plus 'sid' and 'plan_num_plans'.
cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0,7)]
# Build the keep-list once instead of re-concatenating it for every column.
cols_keep = cols_transport_mode+['sid','plan_num_plans']
cols_drop = [c for c in plans.columns if c not in cols_keep]
# Rebind rather than drop in place: the frame handed back by the loading cell is
# left untouched, so re-running cells never acts on a half-mutated object.
plans = plans.drop(cols_drop,axis=1)

In [7]:
# Write `sub` to the split-submission CSV without the index column.
# NOTE(review): `sub` is not assigned in any cell visible here; this line appears to rely on
# kernel state from a missing or out-of-order cell -- confirm where `sub` is built before re-running.
sub.to_csv('../output/submission_split.csv',index=False)

In [3]:
# Rebinds pred_lgbm1 to a fresh load of the first LightGBM prediction pickle
# (same path as the earlier "load predictions" cell).
pred_lgbm1 = loadpkl('../features/lgbm_pred_1.pkl')

In [24]:
# Restrict plans to the seven plan_{i}_transport_mode columns followed by 'sid' and 'plan_num_plans'.
cols_transport_mode = list(map('plan_{}_transport_mode'.format, range(0,7)))
plans = plans.loc[:, cols_transport_mode+['sid','plan_num_plans']]

In [9]:
# Count how many rows of the first submission fall into each recommend_mode value.
sub1['recommend_mode'].value_counts()

2     34083
7     22351
1     17719
5     10906
10     3403
9      1855
3       708
0       454
11      410
6       338
8       264
4        80
Name: recommend_mode, dtype: int64

In [19]:
# For each LightGBM prediction frame, recommend_mode is the position (0..11) of the
# largest value among the pred_lgbm_plans{i} columns.
cols_pred_lgbm = list(map('pred_lgbm_plans{}'.format, range(0,12)))
for _frame in (pred_lgbm1, pred_lgbm2, pred_lgbm3):
    _frame['recommend_mode'] = _frame[cols_pred_lgbm].values.argmax(axis=1)

In [21]:
# Split every prediction frame on click_mode: rows where it is null go to sub_pred*,
# rows where it is present go to oof_pred*.
_no_click1 = pred_lgbm1['click_mode'].isnull()
_no_click2 = pred_lgbm2['click_mode'].isnull()
_no_click3 = pred_lgbm3['click_mode'].isnull()

sub_pred1, oof_pred1 = pred_lgbm1[_no_click1], pred_lgbm1[~_no_click1]
sub_pred2, oof_pred2 = pred_lgbm2[_no_click2], pred_lgbm2[~_no_click2]
sub_pred3, oof_pred3 = pred_lgbm3[_no_click3], pred_lgbm3[~_no_click3]

In [30]:
# Model 1: per class, share of rows predicted as that class in sub_pred1 (rows with
# null click_mode) divided by the share of that click_mode in oof_pred1 (rows with labels).
# The normalized counts do not depend on i, so they are computed once.
share_sub = sub_pred1.recommend_mode.value_counts(normalize=True)
share_oof = oof_pred1.click_mode.value_counts(normalize=True)
for i in range(0,12):
    # A class that is never predicted is absent from value_counts; report it as a 0 share
    # instead of raising KeyError.
    ratio_sub = share_sub.get(i, 0.0)
    # A class missing from the labels makes the ratio undefined, so this lookup still fails loudly.
    ratio_oof = share_oof[i]
    print(i,ratio_sub/ratio_oof)

0 0.06385008716786347
1 1.3357098436412087
2 1.324610205454177
3 0.15250837034463877
4 0.03366409796883624
5 1.2184538397117477
6 0.151138973299714
7 1.515983671022611
8 0.754130179151316
9 0.20137668918733698
10 1.2129836492411088
11 0.35718415131386794


In [31]:
# Model 2: per class, share of rows predicted as that class in sub_pred2 (rows with
# null click_mode) divided by the share of that click_mode in oof_pred2 (rows with labels).
# The normalized counts do not depend on i, so they are computed once.
share_sub = sub_pred2.recommend_mode.value_counts(normalize=True)
share_oof = oof_pred2.click_mode.value_counts(normalize=True)
for i in range(0,12):
    # A class that is never predicted is absent from value_counts; report it as a 0 share
    # instead of raising KeyError.
    ratio_sub = share_sub.get(i, 0.0)
    # A class missing from the labels makes the ratio undefined, so this lookup still fails loudly.
    ratio_oof = share_oof[i]
    print(i,ratio_sub/ratio_oof)

0 0.39495558109836043
1 1.1675035895938435
2 1.201514511202471
3 0.03863715239808232
4 0.005032581153442807
5 1.2975580785164558
6 0.24456553021332572
7 1.1972322259876302
8 0.4597033229080566
9 1.5654235947680832
10 0.8493563392368658
11 1.4995392467993163


In [32]:
# Model 3: per class, share of rows predicted as that class in sub_pred3 (rows with
# null click_mode) divided by the share of that click_mode in oof_pred3 (rows with labels).
# The normalized counts do not depend on i, so they are computed once.
share_sub = sub_pred3.recommend_mode.value_counts(normalize=True)
share_oof = oof_pred3.click_mode.value_counts(normalize=True)
for i in range(0,12):
    # A class that is never predicted is absent from value_counts; report it as a 0 share
    # instead of raising KeyError.
    ratio_sub = share_sub.get(i, 0.0)
    # A class missing from the labels makes the ratio undefined, so this lookup still fails loudly.
    ratio_oof = share_oof[i]
    print(i,ratio_sub/ratio_oof)

0 0.2931026446931892
1 1.2296230513618707
2 1.299727112585318
3 0.10804290399598389
4 0.01814112368752401
5 1.2656809074453792
6 0.19139596882264776
7 1.3722277994121381
8 0.542836781214525
9 0.9652952595379859
10 1.0560355179624799
11 1.060541335270807
