#### time window

In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd

data = pd.read_csv('../input/train_sample.csv')
data['datetime'] = data['time'].apply(lambda x: str(x).split(' ')[0])

In [2]:
data.shape

(993712, 34)

In [3]:
# rate 1
posi_f = [
    'voice_connection', 'wifi_connection',
    'voice_convert_1', 'convert_rate',
    'rrc_connection', 'erab_connection',
    'esrvcc_convert'
]

# rate 0
navg_f = [
    'voice_disconnection', 'wifi_disconnection',
    'wifi_disconnection_1',
    'erab_trash', 'prb_pull', 'prb_push'
]

# count 
count_f = [
    'voice_pull_delay','voice_count', 'data_count', 
    'rrc_max', 'csgb_rrc', 'rrc_2g', 'rrc_3g', 'rrc_num',
    'voice_push_miss', 'voice_pull_miss'
]

# drop
drop_f = [
    'video_connection', 'video_disconnection', 'voice_convert_2', 'voice_convert_2', 'pdcch_cce'
]

In [4]:
num_f = posi_f + navg_f + count_f

cat_f = [
    'ENODEB_ID', 'CID', 'hour'
]

In [5]:
def format_(dataframe, feature, fillna='0.0', astype=np.float32, normalize=True):
    dataframe[feature] = dataframe[feature].fillna(fillna)
    dataframe[feature] = dataframe[feature].astype(np.float32)
    if normalize:
        dataframe.loc[dataframe[feature]>1, feature] = 1
        dataframe.loc[dataframe[feature]<0, feature] = 0
    return dataframe

In [6]:
from sklearn.preprocessing import MinMaxScaler
def normalized_feature(dataframe, feature):
    mms = MinMaxScaler()
    return mms.fit_transform(dataframe[feature].values.reshape(-1, 1))

In [7]:
def creat_feature(dataframe, datetime):
    tmp = dataframe[dataframe.loc[:,('datetime')]==datetime]
    tmp = tmp.drop(['datetime'], axis=1)
    
    for feature in posi_f:
        tmp = format_(tmp, feature, fillna=tmp[feature].mean())
    for feature in navg_f:
        tmp = format_(tmp, feature, fillna=tmp[feature].mean())
    for feature in count_f:
        tmp = format_(tmp, feature, fillna=tmp[feature].mean(), normalize=False)
        tmp[feature] = normalized_feature(tmp, feature)
    tmp = tmp.drop(drop_f, axis=1)

    tmp.mr_low = tmp.mr_low.astype(np.float32)
    tmp.mr_high = tmp.mr_high.astype(np.float32)
    
    # tmp['MCC'] = tmp['cgi'].apply(lambda x: str(x).split('-')[0])
    # tmp['MNC'] = tmp['cgi'].apply(lambda x: str(x).split('-')[1])
    tmp['ENODEB_ID'] = tmp['cgi'].apply(lambda x: str(x).split('-')[2])
    tmp['CID'] = tmp['cgi'].apply(lambda x: str(x).split('-')[3])
    tmp['ENODEB_ID'] = tmp['ENODEB_ID'].astype(np.int32)
    tmp['CID'] = tmp['CID'].astype(np.int32)

    # tmp['month'] = pd.to_datetime(tmp['time']).dt.month
    # tmp['day'] = pd.to_datetime(tmp['time']).dt.day
    tmp['hour'] = pd.to_datetime(tmp['time']).dt.hour

    tmp = tmp.drop(['city', 'region', 'cgi', 'time'], axis=1)

    return tmp

In [8]:
is_valid = True

In [9]:
if(is_valid):
    train = creat_feature(data, '2018-05-04')
    train = pd.concat([train, creat_feature(data, '2018-05-03')])
    train = pd.concat([train, creat_feature(data, '2018-05-02')])
    train = pd.concat([train, creat_feature(data, '2018-05-01')])
    train = pd.concat([train, creat_feature(data, '2018-04-30')])
    test = creat_feature(data, '2018-05-05')
else:
    train = creat_feature(data, '2018-05-05')
    train = pd.concat([train, creat_feature(data, '2018-05-04')])
    train = pd.concat([train, creat_feature(data, '2018-05-03')])
    train = pd.concat([train, creat_feature(data, '2018-05-02')])
    train = pd.concat([train, creat_feature(data, '2018-05-01')])
    test = creat_feature(data, '2018-05-06')

In [10]:
train.shape

(709513, 28)

In [11]:
test.shape

(142238, 28)

In [12]:
train.columns

Index(['voice_connection', 'wifi_connection', 'voice_disconnection',
       'wifi_disconnection', 'esrvcc_convert', 'voice_convert_1',
       'convert_rate', 'voice_push_miss', 'voice_pull_miss',
       'voice_pull_delay', 'voice_count', 'data_count', 'rrc_connection',
       'erab_connection', 'erab_trash', 'wifi_disconnection_1', 'prb_push',
       'prb_pull', 'rrc_max', 'csgb_rrc', 'rrc_2g', 'rrc_3g', 'rrc_num',
       'mr_low', 'mr_high', 'ENODEB_ID', 'CID', 'hour'],
      dtype='object')

#### stacking

In [13]:
train_y1 = train['mr_low']
train_y2 = train['mr_high']
train_X = train.drop(['mr_low', 'mr_high'], axis=1)

test_y1 = test['mr_low']
test_y2 = test['mr_high']
test_X = test.drop(['mr_low', 'mr_high'], axis=1)

In [14]:
param = {
#     'application': 'binary',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'min_data_in_leaf': 100,
    'learning_rate': 0.1,
    'zero_as_missing': True,
#     'lambda_l1': 1,
    'lambda_l2': 1,
    'metric':{'mse'}
}

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X = train_X
y = train_y1
X_pred = test_X

predictors = [i for i in X.columns]
stacking_num = 2
bagging_num = 3
bagging_test_size = 0.33
num_boost_round = 500
early_stopping_rounds = 100

stacking_model=[]
bagging_model=[]

l2_error = []
#     X = X.values
#     y = y.values
layer_train = np.zeros((X.shape[0],2))

leng = X.shape[0]

for i in range(2):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=cat_f)
    lgb_eval = lgb.Dataset(X_test, y_test, categorical_feature=cat_f)

    gbm=lgb.train(param,
                  lgb_train,
                  num_boost_round=1000,
                  valid_sets=lgb_eval,
                  verbose_eval=50,
                  early_stopping_rounds=100)
    stacking_model.append(gbm)
X = np.hstack((X,layer_train[:,1].reshape((-1,1))))

predictors.append('lgb_result')



Training until validation scores don't improve for 100 rounds.
[50]	valid_0's l2: 0.00247395
[100]	valid_0's l2: 0.00221246
[150]	valid_0's l2: 0.00209973
[200]	valid_0's l2: 0.00203727
[250]	valid_0's l2: 0.0019903
[300]	valid_0's l2: 0.00195721
[350]	valid_0's l2: 0.00192989
[400]	valid_0's l2: 0.00190916
[450]	valid_0's l2: 0.00189284
[500]	valid_0's l2: 0.00187536
[550]	valid_0's l2: 0.00186614
[600]	valid_0's l2: 0.00185409
[650]	valid_0's l2: 0.0018436
[700]	valid_0's l2: 0.00183476
[750]	valid_0's l2: 0.00182689
[800]	valid_0's l2: 0.00182098
[850]	valid_0's l2: 0.00181453
[900]	valid_0's l2: 0.00181007
[950]	valid_0's l2: 0.00180497
[1000]	valid_0's l2: 0.00179954
Did not meet early stopping. Best iteration is:
[999]	valid_0's l2: 0.00179953
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's l2: 0.00236631
[100]	valid_0's l2: 0.00212283
[150]	valid_0's l2: 0.0020122
[200]	valid_0's l2: 0.00194976
[250]	valid_0's l2: 0.00190434
[300]	valid_0's l2: 0.00

In [20]:
for bn in range(bagging_num):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=bagging_test_size, random_state=bn)

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test)

    gbm = lgb.train(param,
                    lgb_train,
                    num_boost_round=1000,
                    valid_sets=lgb_eval,
                    verbose_eval=50,
                    early_stopping_rounds=100)
    bagging_model.append(gbm)
    l2_error.append(mean_squared_error(gbm.predict(X_test,num_iteration=gbm.best_iteration),y_test))

    feat_imp = pd.Series(gbm.feature_importance(), predictors).sort_values(ascending=False)

test_pred = np.zeros((X_pred.shape[0],stacking_num))
for sn,gbm in enumerate(stacking_model):
    pred = gbm.predict(X_pred,num_iteration=gbm.best_iteration)
    test_pred[:,sn] = pred

    X_pred = np.hstack((X_pred,test_pred.mean(axis=1).reshape((-1,1))))

for bn,gbm in enumerate(bagging_model):
    pred = gbm.predict(X_pred,num_iteration=gbm.best_iteration)
    if bn==0:
        pred_out = pred
    else:
        pred_out += pred
pre =  pred_out/bagging_num
feat_imp

Training until validation scores don't improve for 100 rounds.
[50]	valid_0's l2: 0.00247073
[100]	valid_0's l2: 0.0022387
[150]	valid_0's l2: 0.00212993
[200]	valid_0's l2: 0.00206864
[250]	valid_0's l2: 0.002033
[300]	valid_0's l2: 0.00199866
[350]	valid_0's l2: 0.00197352
[400]	valid_0's l2: 0.00195466
[450]	valid_0's l2: 0.00194066
[500]	valid_0's l2: 0.00192701
[550]	valid_0's l2: 0.00191546
[600]	valid_0's l2: 0.00190673
[650]	valid_0's l2: 0.00190036
[700]	valid_0's l2: 0.0018906
[750]	valid_0's l2: 0.001881
[800]	valid_0's l2: 0.00187474
[850]	valid_0's l2: 0.00186605
[900]	valid_0's l2: 0.00186182
[950]	valid_0's l2: 0.00185751
[1000]	valid_0's l2: 0.00185428
Did not meet early stopping. Best iteration is:
[994]	valid_0's l2: 0.00185392
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's l2: 0.00238717
[100]	valid_0's l2: 0.00215039
[150]	valid_0's l2: 0.00204213
[200]	valid_0's l2: 0.00197869
[250]	valid_0's l2: 0.00193465
[300]	valid_0's l2: 0.00190

ENODEB_ID               11126
CID                      4554
hour                     3270
data_count               1911
prb_push                 1564
rrc_max                  1318
rrc_num                   760
rrc_2g                    689
prb_pull                  650
rrc_3g                    556
csgb_rrc                  449
esrvcc_convert            407
voice_pull_delay          387
voice_disconnection       345
voice_count               321
voice_push_miss           305
convert_rate              261
voice_pull_miss           249
voice_connection          173
voice_convert_1           136
wifi_disconnection        131
wifi_disconnection_1      124
wifi_connection           107
rrc_connection             97
erab_trash                 61
erab_connection            49
lgb_result                  0
dtype: int64

In [23]:
from sklearn import metrics
valid_auc = metrics.mean_squared_error(test_y1, pre)
print(valid_auc)

0.0048865344900011


In [25]:
for i in range(len(pre)):
    if i>200: break
    print('%d: pre is %f, real is %f' % (i, pre[i], test_y1.tolist()[i]))

0: pre is 0.011593, real is 0.016421
1: pre is 0.012188, real is 0.014527
2: pre is 0.008377, real is 0.015449
3: pre is 0.012789, real is 0.002716
4: pre is 0.013199, real is 0.003485
5: pre is 0.025471, real is 0.020363
6: pre is 0.014480, real is 0.021670
7: pre is 0.011590, real is 0.011759
8: pre is 0.008197, real is 0.018023
9: pre is 0.004469, real is 0.011229
10: pre is 0.010250, real is 0.003256
11: pre is 0.009554, real is 0.014210
12: pre is 0.005104, real is 0.014879
13: pre is 0.003922, real is 0.011437
14: pre is 0.010217, real is 0.017893
15: pre is 0.010289, real is 0.004284
16: pre is 0.014769, real is 0.033241
17: pre is 0.011881, real is 0.016056
18: pre is 0.011121, real is 0.017123
19: pre is 0.007400, real is 0.015262
20: pre is 0.010635, real is 0.003278
21: pre is 0.005849, real is 0.006410
22: pre is 0.009905, real is 0.010714
23: pre is 0.013752, real is 0.003889
24: pre is 0.095254, real is 0.061535
25: pre is 0.096256, real is 0.038024
26: pre is 0.121034, r