# baseline_embedding

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import pickle
import datetime
import lightgbm as lgb
from sklearn.decomposition import KernelPCA

%matplotlib inline

from contextlib import contextmanager

@contextmanager
def timer(name):
    """Context manager that reports the wall-clock duration of its ``with`` body.

    ``name`` is used only as the label in the printed message. The message is
    printed once the body finishes normally; if the body raises, the exception
    propagates and nothing is printed.
    """
    began_at = time.time()
    yield
    elapsed = time.time() - began_at
    print(f'[{name} done in {elapsed:.2f} s]')

In [4]:
def zuhe(line):
    """Add four hour-crossed integer features to ``line`` and return it.

    The columns ``hour``, ``weekday``, ``s_ij``, ``e_ij`` and ``is_crowd`` are
    converted to strings and back to ``int``. Four new columns are appended,
    in this order: ``hour_weekday``, ``hour_crowd``, ``hour_s``, ``hour_e``.
    Each one is the decimal text of ``hour`` followed by the decimal text of
    the partner column, parsed back as an integer.

    The frame is modified in place; the same object is returned.

    NOTE(review): the two texts are joined without a separator, so different
    pairs can produce the same value (hour 1 with s_ij 23 and hour 12 with
    s_ij 3 both give 123).
    """
    source_cols = ['hour', 'weekday', 's_ij', 'e_ij', 'is_crowd']
    # (new column, column whose digits are appended to the hour's digits)
    crossings = (
        ('hour_weekday', 'weekday'),
        ('hour_crowd', 'is_crowd'),
        ('hour_s', 's_ij'),
        ('hour_e', 'e_ij'),
    )

    # String views of the inputs, so that `+` concatenates instead of adding.
    as_text = {name: line[name].astype(str) for name in source_cols}

    # The source columns end up as the int parse of their string form.
    for name in source_cols:
        line[name] = as_text[name].astype(int)

    for new_name, partner in crossings:
        line[new_name] = (as_text['hour'] + as_text[partner]).astype(int)

    return line

# ---- Training frame -------------------------------------------------------
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load pickles that come from a trusted source.
with open('./data/train-id4-crowd-grid3.txt', 'rb') as data_file:
    line = pickle.load(data_file)
# Add the hour-crossed integer features (hour_weekday, hour_crowd, hour_s, hour_e).
line = zuhe(line)

# Keep only rows whose Diff_Time is below 600, then cast the ID and grid-cell
# columns to category. A single astype() on the filtered result returns a new
# frame, so no column is ever assigned onto a filtered slice (the pattern that
# triggers pandas' SettingWithCopyWarning).
line = line[line['Diff_Time'] < 600].astype(
    {'ID': 'category', 's_ij': 'category', 'e_ij': 'category'})
print('read train finished........')

# ---- Embedding ------------------------------------------------------------
# Project the rows of all_svd_info.csv onto 128 KernelPCA components. The
# result keeps info's index so it can be joined to the data on O_LINENO.
info = pd.read_csv('data/all_svd_info.csv',index_col=0)

k_pca_1 = KernelPCA(n_components=128)
k_pca_1.fit(info)
embedding_res = k_pca_1.transform(info)
embedding_res = pd.DataFrame(data=embedding_res,index=info.index)

print('finished create embedding.....')

# ---- Training matrix ------------------------------------------------------
# Left-join the embedding onto each row by O_LINENO; rows whose O_LINENO is
# missing from the embedding index get NaN in the embedding columns.
train = pd.merge(line, embedding_res, how='left', left_on='O_LINENO',right_index=True)
# Every column except the ones listed here is used as a model feature.
col = [c for c in train if
       c not in ['Unnamed: 0','ID2','s_x', 's_y', 'e_x', 'e_y','O_LINENO', 'O_UP', 'Source_Station', 'Target_Station','O_TIME', 'aver_v','max_v', 'Diff_Time']]
X = train[col]
y = train['Diff_Time'].values
# y_norm = y / 600

print('finished create X,y ................ ')

# ---- Test matrix ----------------------------------------------------------
# NOTE(review): unlike the training frame, the test frame is not passed
# through zuhe() and its ID / s_ij / e_ij columns are not cast to category
# here, and its exclusion list is longer than the training one. Confirm that
# X_test ends up with the same columns, order and dtypes as X.
with open('./data/test-id4-crowd-grid3.txt', 'rb') as data_file:
    test = pickle.load(data_file)
test = pd.merge(test, embedding_res, how='left', left_on='O_LINENO',right_index=True)
col1 = [c for c in test if
       c not in ['Unnamed: 0','ID2' , 's_x', 's_y', 'e_x', 'e_y','O_LINENO', 'O_UP', 'Source_Station', 'Target_Station', 'O_TIME', 'aver_v', 'max_v',
                 'Diff_Time','Distance1', 'distance2','TERMINALNO', 'new_dist']]
X_test = test[col1]

print('finished loading X_test ................ ')

read train finished........
finished create embedding.....
finished create X,y ................ 
finished loading X_test ................ 


In [5]:
def train2(X,y,X_test,n_folds):
    """Train one LightGBM regressor per K-fold split and average their test predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; fold rows are selected positionally with ``.iloc``.
    y : array-like
        Training targets, indexed with the integer index arrays produced by
        ``KFold.split`` (a NumPy array in this notebook).
    X_test : pandas.DataFrame or array-like
        Rows to predict; passed unchanged to ``Booster.predict``.
    n_folds : int
        Number of shuffled K-fold splits (``random_state=2018``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X_test), 1)`` containing the mean of the
        per-fold predictions.

    Side effects
    ------------
    Prints progress, the fold shapes and, for the first fold only, a table of
    feature importances.
    """
    # Imported locally: the notebook's import cell does not provide KFold.
    from sklearn.model_selection import KFold

    # The hyper-parameters are the same for every fold, so build them once.
    params = {'num_leaves':60,
              'max_depth':8,
              'seed':2018,
              'colsample_bytree':0.5,
              'subsample':0.9,
              'num_threads':32,
              'n_estimators':20000,
              'learning_rate': 0.1,
              'objective': 'mse',
              'metric':'rmse',
              'device_type':'gpu',
             }

    print("folding")
    kf = KFold(n_splits=n_folds,shuffle=True,random_state=2018)
    n_test = len(X_test)
    result = np.zeros((n_test, 1))
    print("training")
    for count, (tr_idx, val_idx) in enumerate(kf.split(y)):
        X_train, y_train = X.iloc[tr_idx], y[tr_idx]
        X_dev, y_dev = X.iloc[val_idx], y[val_idx]

        print(X_train.shape)
        print(X_dev.shape)

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_dev, y_dev, reference=lgb_train)
        print(count)
        # A fresh copy of the parameters is handed over on every fold so the
        # shared dict can never be altered by the library call.
        # NOTE(review): the early_stopping_rounds / verbose_eval keyword
        # arguments belong to the older lgb.train signature; newer LightGBM
        # releases expect callbacks instead - confirm against the installed
        # version before upgrading.
        gbm = lgb.train(dict(params), lgb_train,early_stopping_rounds=200,valid_sets=lgb_eval,
                        verbose_eval=50,
                       )

        # Report feature importances once, for the first fold only.
        if count == 0:
            feature_score = pd.DataFrame()
            feature_score['name'] = gbm.feature_name()
            feature_score['importance1'] = gbm.feature_importance()
            feature_score['importance2'] = gbm.feature_importance('gain')
            # Gain scaled by the square root of the default importance;
            # 0/0 yields NaN and those rows are dropped just below.
            feature_score['importance3'] = feature_score['importance2']/(feature_score['importance1']**0.5)
            feature_score = feature_score.dropna()
            feature_score = feature_score.sort_values(by=['importance3','importance2','importance1'],ascending=False)
            print(feature_score)

        resultx = gbm.predict(X_test, num_iteration=gbm.best_iteration)
        # Column vector sized from X_test itself rather than a fixed row count.
        resultx = np.reshape(resultx, (n_test, 1))
        print(resultx.shape,result.shape)
        result = result + resultx

    # Average the accumulated predictions over the folds.
    result /= n_folds
    return result

In [7]:
# Run the 10-fold training; `pred` receives the fold-averaged test predictions returned by train2.
pred = train2(X,y,X_test,n_folds=10)

folding
training
(12360197, 151)
(1373356, 151)
0




Training until validation scores don't improve for 200 rounds.
[50]	valid_0's rmse: 39.8349
[100]	valid_0's rmse: 39.0357
[150]	valid_0's rmse: 38.7489
[200]	valid_0's rmse: 38.5946
[250]	valid_0's rmse: 38.4736
[300]	valid_0's rmse: 38.3943
[350]	valid_0's rmse: 38.3141
[400]	valid_0's rmse: 38.2519
[450]	valid_0's rmse: 38.2013
[500]	valid_0's rmse: 38.1466
[550]	valid_0's rmse: 38.0982
[600]	valid_0's rmse: 38.0562
[650]	valid_0's rmse: 38.0106
[700]	valid_0's rmse: 37.9837
[750]	valid_0's rmse: 37.9476
[800]	valid_0's rmse: 37.9154
[850]	valid_0's rmse: 37.8862
[900]	valid_0's rmse: 37.8563
[950]	valid_0's rmse: 37.8246
[1000]	valid_0's rmse: 37.7979
[1050]	valid_0's rmse: 37.7693
[1100]	valid_0's rmse: 37.7418
[1150]	valid_0's rmse: 37.7158
[1200]	valid_0's rmse: 37.6866
[1250]	valid_0's rmse: 37.6573
[1300]	valid_0's rmse: 37.6373
[1350]	valid_0's rmse: 37.6168
[1400]	valid_0's rmse: 37.5933
[1450]	valid_0's rmse: 37.5717
[1500]	valid_0's rmse: 37.5511
[1550]	valid_0's rmse: 37.5

KeyboardInterrupt: 

In [10]:

# Print the merged training frame (data rows joined with the embedding columns).
print(train)

          Unnamed: 0          ID  O_LINENO  O_UP  Source_Station  \
0                  1    80412021       804     1              20   
1                  2    80412122       804     1              21   
2                  3    80412223       804     1              22   
3                  4    80412324       804     1              23   
4                  5    80412425       804     1              24   
6                  7    80400304       804     0               3   
7                  8    80400405       804     0               4   
8                  9    80400506       804     0               5   
9                 10    80400607       804     0               6   
10                11    80400708       804     0               7   
11                12    80400809       804     0               8   
12                13    80400910       804     0               9   
13                14    80401011       804     0              10   
14                15    80401112       804     0