#### time window

In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd

data = pd.read_csv('../input/train.csv')
data['datetime'] = data['time'].apply(lambda x: str(x).split(' ')[0])

In [2]:
# rate 1
posi_f = [
    'voice_connection', 'wifi_connection',
    'voice_convert_1', 'convert_rate',
    'rrc_connection', 'erab_connection',
    'esrvcc_convert'
]

# rate 0
navg_f = [
    'voice_disconnection', 'wifi_disconnection',
    'wifi_disconnection_1',
    'erab_trash', 'prb_pull', 'prb_push'
]

# count 
count_f = [
    'voice_pull_delay','voice_count', 'data_count', 
    'rrc_max', 'csgb_rrc', 'rrc_2g', 'rrc_3g', 'rrc_num',
    'voice_push_miss', 'voice_pull_miss'
]

# drop
drop_f = [
    'video_connection', 'video_disconnection', 'voice_convert_2', 'voice_convert_2', 'pdcch_cce'
]

In [3]:
num_f = posi_f + navg_f + count_f

cat_f = [
    'ENODEB_ID', 'CID', 'hour'
]

In [4]:
def format_(dataframe, feature, fillna='0.0', astype=np.float32, normalize=True):
    dataframe[feature] = dataframe[feature].fillna(fillna)
    dataframe[feature] = dataframe[feature].astype(np.float32)
    if normalize:
        dataframe.loc[dataframe[feature]>1, feature] = 1
        dataframe.loc[dataframe[feature]<0, feature] = 0
    return dataframe

In [5]:
from sklearn.preprocessing import MinMaxScaler
def normalized_feature(dataframe, feature):
    mms = MinMaxScaler()
    return mms.fit_transform(dataframe[feature].values.reshape(-1, 1))

In [6]:
def creat_feature(dataframe, datetime):
    tmp = dataframe[dataframe.loc[:,('datetime')]==datetime]
    tmp = tmp.drop(['datetime'], axis=1)
    
    for feature in posi_f:
        tmp = format_(tmp, feature, fillna=tmp[feature].mean())
    for feature in navg_f:
        tmp = format_(tmp, feature, fillna=tmp[feature].mean())
    for feature in count_f:
        tmp = format_(tmp, feature, fillna=tmp[feature].mean(), normalize=False)
        tmp[feature] = normalized_feature(tmp, feature)
    tmp = tmp.drop(drop_f, axis=1)

    tmp.mr_low = tmp.mr_low.astype(np.float32)
    tmp.mr_high = tmp.mr_high.astype(np.float32)
    
    # tmp['MCC'] = tmp['cgi'].apply(lambda x: str(x).split('-')[0])
    # tmp['MNC'] = tmp['cgi'].apply(lambda x: str(x).split('-')[1])
    tmp['ENODEB_ID'] = tmp['cgi'].apply(lambda x: str(x).split('-')[2])
    tmp['CID'] = tmp['cgi'].apply(lambda x: str(x).split('-')[3])
    tmp['ENODEB_ID'] = tmp['ENODEB_ID'].astype(np.int32)
    tmp['CID'] = tmp['CID'].astype(np.int32)

    # tmp['month'] = pd.to_datetime(tmp['time']).dt.month
    # tmp['day'] = pd.to_datetime(tmp['time']).dt.day
    tmp['hour'] = pd.to_datetime(tmp['time']).dt.hour

    tmp = tmp.drop(['city', 'region', 'cgi', 'time'], axis=1)

    return tmp

In [7]:
is_valid = True

In [8]:
if(is_valid):
    train = creat_feature(data, '2018-05-04')
    train = pd.concat([train, creat_feature(data, '2018-05-03')])
    train = pd.concat([train, creat_feature(data, '2018-05-02')])
    train = pd.concat([train, creat_feature(data, '2018-05-01')])
    train = pd.concat([train, creat_feature(data, '2018-04-30')])
    test = creat_feature(data, '2018-05-05')
else:
    train = creat_feature(data, '2018-05-05')
    train = pd.concat([train, creat_feature(data, '2018-05-04')])
    train = pd.concat([train, creat_feature(data, '2018-05-03')])
    train = pd.concat([train, creat_feature(data, '2018-05-02')])
    train = pd.concat([train, creat_feature(data, '2018-05-01')])
    test = creat_feature(data, '2018-05-06')

In [9]:
train.shape

(6028403, 28)

In [10]:
test.shape

(1214923, 28)

In [11]:
train.columns

Index(['voice_connection', 'wifi_connection', 'voice_disconnection',
       'wifi_disconnection', 'esrvcc_convert', 'voice_convert_1',
       'convert_rate', 'voice_push_miss', 'voice_pull_miss',
       'voice_pull_delay', 'voice_count', 'data_count', 'rrc_connection',
       'erab_connection', 'erab_trash', 'wifi_disconnection_1', 'prb_push',
       'prb_pull', 'rrc_max', 'csgb_rrc', 'rrc_2g', 'rrc_3g', 'rrc_num',
       'mr_low', 'mr_high', 'ENODEB_ID', 'CID', 'hour'],
      dtype='object')

#### stacking

In [12]:
train_y1 = train['mr_low']
train_y2 = train['mr_high']
train_X = train.drop(['mr_low', 'mr_high'], axis=1)

test_y1 = test['mr_low']
test_y2 = test['mr_high']
test_X = test.drop(['mr_low', 'mr_high'], axis=1)

In [13]:
param = {
#     'application': 'binary',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'min_data_in_leaf': 100,
    'learning_rate': 0.1,
    'zero_as_missing': True,
#     'lambda_l1': 1,
    'lambda_l2': 1,
    'metric':{'mse'}
}

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def fit_predict(X,y,X_pred):
    predictors = [i for i in X.columns]
    stacking_num = 2
    bagging_num = 3
    bagging_test_size = 0.33
    num_boost_round = 500
    early_stopping_rounds = 100
                
    stacking_model=[]
    bagging_model=[]

    l2_error = []
#     X = X.values
#     y = y.values
    layer_train = np.zeros((X.shape[0],2))
    
    leng = X.shape[0]
        
    for i in range(2):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
        
        lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=cat_f)
        lgb_eval = lgb.Dataset(X_test, y_test, categorical_feature=cat_f)
        
        gbm=lgb.train(param,
                      lgb_train,
                      num_boost_round=1000,
                      valid_sets=lgb_eval,
                      verbose_eval=50,
                      early_stopping_rounds=100)
        stacking_model.append(gbm)
    X = np.hstack((X,layer_train[:,1].reshape((-1,1))))
    
    predictors.append('lgb_result')
    
    for bn in range(bagging_num):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=bagging_test_size, random_state=bn)
        
        lgb_train = lgb.Dataset(X_train,y_train, categorical_feature=cat_f)
        lgb_eval = lgb.Dataset(X_test,y_test, categorical_feature=cat_f)
        
        gbm = lgb.train(param,
                        lgb_train,
                        num_boost_round=1000,
                        valid_sets=lgb_eval,
                        verbose_eval=50,
                        early_stopping_rounds=100)
        bagging_model.append(gbm)
        l2_error.append(mean_squared_error(gbm.predict(X_test,num_iteration=gbm.best_iteration),y_test))
        
        feat_imp = pd.Series(gbm.feature_importance(), predictors).sort_values(ascending=False)
        
    test_pred = np.zeros((X_pred.shape[0],stacking_num))
    for sn,gbm in enumerate(stacking_model):
        pred = gbm.predict(X_pred,num_iteration=gbm.best_iteration)
        test_pred[:,sn] = pred
        
        X_pred = np.hstackk((X_pred,test_pred.mean(axis=1).reshape((-1,1))))
        
    for bn,gbm in enumerate(bagging_model):
        pred = gbm.predict(X_pred,num_iteration=gbm.best_iteration)
        if bn==0:
            pred_out = pred
        else:
            pred_out += pred
    return pred_out/bagging_num, feat_imp

In [None]:
pre, imp = fit_predict(train_X, train_y1, test_X)



Training until validation scores don't improve for 100 rounds.
[50]	valid_0's l2: 0.00393412
[100]	valid_0's l2: 0.00349464
[150]	valid_0's l2: 0.00328078
[200]	valid_0's l2: 0.00314694
[250]	valid_0's l2: 0.00305589
[300]	valid_0's l2: 0.00297188
[350]	valid_0's l2: 0.00291256
[400]	valid_0's l2: 0.00285894
[450]	valid_0's l2: 0.00282231
[500]	valid_0's l2: 0.00278722
[550]	valid_0's l2: 0.00275929
[600]	valid_0's l2: 0.00273091
[650]	valid_0's l2: 0.00270544
[700]	valid_0's l2: 0.0026836
[750]	valid_0's l2: 0.00266404
[800]	valid_0's l2: 0.00264816
[850]	valid_0's l2: 0.00263191
[900]	valid_0's l2: 0.00261184
[950]	valid_0's l2: 0.00260179
[1000]	valid_0's l2: 0.00259201
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00259201
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's l2: 0.00385582
[100]	valid_0's l2: 0.00340345
[150]	valid_0's l2: 0.0031856
[200]	valid_0's l2: 0.0030441
[250]	valid_0's l2: 0.00295668
[300]	valid_0's l2: 0.0