In [1]:
import gc
import json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm_notebook
from glob import glob

from utils import loadpkl, to_json, scalingPredictions, getBestMultiple, read_pickles

%matplotlib inline



In [2]:
# search a best weight for 2 predictions
def getBestWeights(act, pred_lgbm, pred_xgb, n_classes=12):
    """Greedy per-class search for the LightGBM / XGBoost blending weight.

    Starting from a 50/50 blend of every class column, the classes are
    visited in order. For each class every weight in ``np.arange(0.0, 1.1, 0.1)``
    is tried for that class column only (the other columns keep their current
    blend) and the weight is kept if it strictly improves the best weighted
    F1 score seen so far.

    Parameters
    ----------
    act : array-like
        True class labels, passed as ``y_true`` to ``f1_score``.
    pred_lgbm : DataFrame
        Must contain the columns ``pred_lgbm_plans0`` .. ``pred_lgbm_plans{n_classes-1}``.
    pred_xgb : DataFrame
        Must contain the columns ``pred_xgb_plans0`` .. ``pred_xgb_plans{n_classes-1}``.
    n_classes : int, default 12
        Number of class columns to blend.

    Returns
    -------
    list
        One LightGBM weight per class (the XGBoost weight is ``1 - w``).
        A class for which no candidate improved the score keeps the
        initial weight 0.5.

    Side effects
    ------------
    Prints the score of every candidate and saves one figure per class to
    ``../imp/multiple{class}.png``.
    """
    search_range = np.arange(0.0, 1.1, 0.1)
    base_weight = 0.5
    best_weights = []

    def _blend(cls, w):
        # Weighted average of the two models' columns for a single class.
        return (w * pred_lgbm['pred_lgbm_plans{}'.format(cls)]
                + (1.0 - w) * pred_xgb['pred_xgb_plans{}'.format(cls)])

    def _score(frame):
        # Weighted F1 of the arg-max class over the blended columns.
        return f1_score(act, np.argmax(frame.values, axis=1), average='weighted')

    # base prediction
    cols_pred = ['pred_{}'.format(i) for i in range(n_classes)]
    _pred = pd.DataFrame()
    for i, col in enumerate(cols_pred):
        _pred[col] = _blend(i, base_weight)

    # base score
    best_f1 = _score(_pred[cols_pred])

    # get best weights for each classes
    for i, col in enumerate(cols_pred):
        # Reset per class: _pred[col] currently holds the base blend, so that
        # is the weight to keep when nothing beats best_f1. Without this the
        # name is unbound for class 0 or stale from the previous class.
        best_w = base_weight
        f1s = []
        for _w in search_range:
            # Explicit copy so the column assignment below never writes into
            # (or warns about writing into) a slice of _pred.
            tmp_pred = _pred[cols_pred].copy()
            tmp_pred[col] = _blend(i, _w)

            # calc f1 score
            _f1 = _score(tmp_pred)
            f1s.append(_f1)
            print('class: {}, w: {}, f1 score: {}'.format(i,_w,_f1))
            if _f1 > best_f1:
                best_f1 = _f1
                best_w = _w

        # save weights & predicted values
        best_weights.append(best_w)
        _pred[col] = _blend(i, best_w)

        # plot f1 score against the candidate weight for this class
        # (figure is left open so the notebook still renders it inline)
        fig, ax = plt.subplots()
        ax.plot(search_range, f1s)
        ax.set(xlabel='lgbm weight', ylabel='weighted f1', title='class {}'.format(i))
        fig.savefig('../imp/multiple{}.png'.format(i))

    return best_weights

In [3]:
# Read the three pickled LightGBM prediction frames with the project helper.
pred_lgbm1, pred_lgbm2, pred_lgbm3 = (
    loadpkl(pkl_path)
    for pkl_path in (
        '../features/lgbm_pred_1.pkl',
        '../features/lgbm_pred_2.pkl',
        '../features/lgbm_pred_3.pkl',
    )
)

In [7]:
# Read the three LightGBM submission CSVs.
sub1, sub2, sub3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../output/submission_lgbm_1.csv',
        '../output/submission_lgbm_2.csv',
        '../output/submission_lgbm_3.csv',
    )
)

In [3]:
# Load the plans feature table from '../features/plans' via the project
# helper read_pickles (imported from utils).
plans = read_pickles('../features/plans')

100%|██████████| 5/5 [00:49<00:00,  8.79s/it]


In [5]:
# Keep only the seven plan_{i}_transport_mode columns plus 'sid' and
# 'plan_num_plans'; every other column of `plans` is dropped in place.
cols_transport_mode = list(map('plan_{}_transport_mode'.format, range(7)))
cols_keep = cols_transport_mode + ['sid', 'plan_num_plans']
cols_drop = [name for name in plans.columns if name not in cols_keep]
plans.drop(columns=cols_drop, inplace=True)

In [7]:
# Write the submission frame to CSV without the index column.
# NOTE(review): `sub` is not assigned in any cell visible in this export;
# confirm which frame is meant here, otherwise this fails on a fresh kernel.
sub.to_csv('../output/submission_split.csv',index=False)

In [3]:
# Reload the first LightGBM prediction frame from its pickle; this rebinds
# pred_lgbm1, discarding any columns added to the previous object.
pred_lgbm1 = loadpkl('../features/lgbm_pred_1.pkl')

In [24]:
# Restrict `plans` to the seven transport-mode columns followed by the
# 'sid' and 'plan_num_plans' columns (in that order).
cols_transport_mode = list(map('plan_{}_transport_mode'.format, range(7)))
plans = plans.loc[:, cols_transport_mode + ['sid', 'plan_num_plans']]

In [9]:
# How often each recommend_mode value occurs in the first submission.
sub1['recommend_mode'].value_counts()

2     34083
7     22351
1     17719
5     10906
10     3403
9      1855
3       708
0       454
11      410
6       338
8       264
4        80
Name: recommend_mode, dtype: int64

In [4]:
# For each prediction frame, recommend_mode is the position (0-11) of the
# largest value among the twelve pred_lgbm_plans{i} columns of that row.
cols_pred_lgbm = list(map('pred_lgbm_plans{}'.format, range(12)))
for frame in (pred_lgbm1, pred_lgbm2, pred_lgbm3):
    frame['recommend_mode'] = frame[cols_pred_lgbm].values.argmax(axis=1)

In [5]:
# Rows whose click_mode is missing go to sub_pred*; rows that have a
# click_mode value go to oof_pred*.
sub_pred1, sub_pred2, sub_pred3 = (
    frame.loc[frame['click_mode'].isnull()]
    for frame in (pred_lgbm1, pred_lgbm2, pred_lgbm3)
)

oof_pred1, oof_pred2, oof_pred3 = (
    frame.loc[frame['click_mode'].notnull()]
    for frame in (pred_lgbm1, pred_lgbm2, pred_lgbm3)
)

In [6]:
# Per class: share of predicted recommend_mode in the sub rows divided by
# the share of click_mode in the oof rows (model 1).
share_sub = sub_pred1['recommend_mode'].value_counts(normalize=True)
share_oof = oof_pred1['click_mode'].value_counts(normalize=True)
for mode in range(12):
    print(mode, share_sub[mode] / share_oof[mode])

0 0.063990726126383
1 1.3349560145065842
2 1.324571341204978
3 0.15703192370231872
4 0.03071848939656307
5 1.2176717769134733
6 0.14979750312249762
7 1.5163906282963855
8 0.782695716240381
9 0.20235371894619733
10 1.2069240776756962
11 0.3554417896001417


In [7]:
# Per class: share of predicted recommend_mode in the sub rows divided by
# the share of click_mode in the oof rows (model 2).
share_sub = sub_pred2['recommend_mode'].value_counts(normalize=True)
share_oof = oof_pred2['click_mode'].value_counts(normalize=True)
for mode in range(12):
    print(mode, share_sub[mode] / share_oof[mode])

0 0.39855911118862797
1 1.166683828612525
2 1.2021950971910904
3 0.03672914487225109
4 0.006405103286199936
5 1.2964318764186649
6 0.24220485907227046
7 1.1991329839004448
8 0.43080768546812165
9 1.5628525497750336
10 0.8367224795372099
11 1.4911893721856286


In [8]:
# Per class: share of predicted recommend_mode in the sub rows divided by
# the share of click_mode in the oof rows (model 3).
share_sub = sub_pred3['recommend_mode'].value_counts(normalize=True)
share_oof = oof_pred3['click_mode'].value_counts(normalize=True)
for mode in range(12):
    print(mode, share_sub[mode] / share_oof[mode])

0 0.0498416539348544
1 1.305849477561449
2 1.223004398543981
3 0.12606549756775765
4 0.046083822668905754
5 1.2152064106324363
6 0.42038478534676715
7 1.2840782294225923
8 1.17269616769814
9 1.0083590371648832
10 0.8934677657144404
11 1.0487901553343681


In [6]:
# Display the plans frame as the cell output.
plans

Unnamed: 0,sid,plan_0_distance,plan_1_distance,plan_2_distance,plan_3_distance,plan_4_distance,plan_5_distance,plan_6_distance,plan_7_distance,plan_0_price,...,plan_6_transport_mode_target_2,plan_6_transport_mode_target_3,plan_6_transport_mode_target_4,plan_6_transport_mode_target_5,plan_6_transport_mode_target_6,plan_6_transport_mode_target_7,plan_6_transport_mode_target_8,plan_6_transport_mode_target_9,plan_6_transport_mode_target_10,plan_6_transport_mode_target_11
0,149233,2100,2577.0,2577.0,1720.0,1645.0,1729.0,,,300,...,,,,,,,,,,
1,337156,8887,1.0,8603.0,,,,,,400,...,,,,,,,,,,
2,398930,8471,8471.0,8458.0,21395.0,17719.0,,,,2300,...,,,,,,,,,,
3,196610,32405,33500.0,33500.0,28931.0,31790.0,,,,2500,...,,,,,,,,,,
4,302267,38511,42427.0,42427.0,39193.0,39199.0,36593.0,,,700,...,,,,,,,,,,
5,232804,37442,35708.0,35708.0,38028.0,,,,,1000,...,,,,,,,,,,
6,59988,11379,13505.0,12865.0,,,,,,400,...,,,,,,,,,,
7,21640,34492,32931.0,32931.0,34136.0,,,,,700,...,,,,,,,,,,
8,406712,25318,26903.0,26903.0,25588.0,,,,,600,...,,,,,,,,,,
9,478890,27621,27621.0,27597.0,27701.0,,,,,8900,...,,,,,,,,,,


In [17]:
# Number of rows in the third prediction frame with a missing recommend_mode.
pred_lgbm3['recommend_mode'].isnull().sum()

0

In [18]:
# Stack the three prediction frames row-wise, keeping each frame's own
# index labels. pd.concat replaces the chained DataFrame.append calls:
# append was deprecated in pandas 1.4 and removed in pandas 2.0, and a
# single concat avoids building an intermediate two-frame copy.
pred = pd.concat([pred_lgbm1, pred_lgbm2, pred_lgbm3])

In [21]:
# How many rows each sid has in the stacked prediction frame.
pred['sid'].value_counts()

2047       2
125591     2
80525      2
74382      2
76431      2
119440     2
121489     2
115346     2
117395     2
127636     2
129685     2
123542     2
103064     2
29348      2
105113     2
98970      2
101019     2
111260     2
113309     2
107166     2
109215     2
21152      2
23201      2
17058      2
78476      2
68235      2
66186      2
72329      2
455284     2
457333     2
          ..
140561     2
146706     2
144659     2
134420     2
1004735    2
1000637    2
2185598    2
1002684    2
2183551    2
1027232    2
1025185    2
1031330    2
1029283    2
1019044    2
1016997    2
1023142    2
1021095    2
1043624    2
1041577    2
1047722    2
1045675    2
1035436    2
1033389    2
1039534    2
1037487    2
1010872    2
1008825    2
1014970    2
1012923    2
0          2
Name: sid, Length: 1659740, dtype: int64

In [22]:
# How many rows each sid has in the plans frame.
plans['sid'].value_counts()

2047       1
2254566    1
2242284    1
2232043    1
2229994    1
2236137    1
2234088    1
2256615    1
2260709    1
1538116    1
2258660    1
2248419    1
2246370    1
2252513    1
2250464    1
2303694    1
2244333    1
2238190    1
2240239    1
2283248    1
2285297    1
2279154    1
2281203    1
2291444    1
2293493    1
2287350    1
2289399    1
2266872    1
2268921    1
2262778    1
          ..
111971     1
101732     1
99685      1
105830     1
103783     1
126312     1
124265     1
130410     1
128363     1
118124     1
116077     1
122222     1
120175     1
75121      1
437632     1
81266      1
79219      1
68980      1
66933      1
73078      1
71031      1
93560      1
91513      1
97658      1
95611      1
85372      1
83325      1
89470      1
87423      1
0          1
Name: sid, Length: 2235055, dtype: int64