In [30]:
import pickle
import lightgbm as lgb
import pandas as pd
from sklearn.utils import shuffle
import numpy as np
from tqdm import tqdm

In [12]:
def downcast_dtypes(df):
    """Narrow the 64-bit numeric columns of ``df`` to 32 bits.

    Every ``float64`` column is cast to ``float32`` and every ``int64``
    column to ``int32``; columns of any other dtype are left alone.
    The columns are reassigned on ``df`` itself, and the same object is
    returned so the call can be used in an assignment.
    """
    # Sort the column labels into buckets by the dtype they currently hold.
    wide_floats = []
    wide_ints = []
    for label in df:
        current = df[label].dtype
        if current == "float64":
            wide_floats.append(label)
        elif current == "int64":
            wide_ints.append(label)

    # Cast each bucket to its narrower counterpart.
    for labels, narrow in ((wide_floats, np.float32), (wide_ints, np.int32)):
        df[labels] = df[labels].astype(narrow)

    return df

In [80]:
# Load the pickled training frame into `line`.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('./data/train-id4-crowd-grid4.txt', 'rb') as data_file:
    line = pickle.load(data_file)

# Load the pickled test frame into `test`.
with open('./data/test-id4-crowd-grid4.txt', 'rb') as data_file:
    test = pickle.load(data_file)

# Shared interval

In [27]:
# Preview the first rows of the training frame.
line.head()

Unnamed: 0.1,Unnamed: 0,ID,O_LINENO,O_UP,Source_Station,Target_Station,Distance,O_TIME,aver_v,hour,...,e_x,e_y,is_crowd,s_ij,e_ij,ID2,grid_aver_diff,grid_aver_d,Source_Station_encode,Target_Station_encode
0,1,80412021,804,1,20,21,0.629329,2017-10-09 06:31:28,16.182736,6,...,39.14658,117.1904,0,569,569,80405690569,215.775253,0.72345,3500,66
1,2,80412122,804,1,21,22,0.983431,2017-10-09 06:35:18,15.392834,6,...,39.1421,117.184,0,569,569,80405690569,241.289308,0.949461,66,61
2,3,80412223,804,1,22,23,0.690608,2017-10-09 06:36:28,35.516963,6,...,39.1395,117.18,1,569,569,80405690569,124.409653,0.644947,61,208
3,4,80412324,804,1,23,24,0.608654,2017-10-09 06:37:38,31.30219,6,...,39.1358,117.181,1,569,569,80405690569,162.265997,0.580119,208,2466
4,5,80412425,804,1,24,25,0.362356,2017-10-09 06:38:28,26.089661,6,...,39.1338,117.184,0,569,569,80405690569,84.416979,0.488454,2466,33


In [38]:
# Collect every distinct (Source_Station_encode, Target_Station_encode) pair
# that occurs in the training frame.
region = set(zip(line.Source_Station_encode, line.Target_Station_encode))

In [43]:
# Replace the collection of pairs by a lookup table: each pair maps to its
# position in the iteration order of `region`.
region = {pair: position for position, pair in enumerate(region)}

In [79]:
# Number of entries in the station-pair lookup table.
len(region)

10894

In [13]:
# Read the line embedding table; it is joined onto the data through its `line` column in a later cell.
line_embedding = pd.read_csv('line_vec.csv')

In [14]:
# Read the station embedding table. It is not referenced again in the visible cells.
station_embedding = pd.read_csv('station_vec.csv')

In [83]:
# with open('station_map.pkl','rb') as f:
#     station_map = pickle.load(f)

In [84]:
# line['Source_Station_encode'] = line.apply(lambda x:station_map.get((x['O_LINENO'], x['O_UP'], x['Source_Station']), -1), axis=1)
# line['Target_Station_encode'] = line.apply(lambda x:station_map.get((x['O_LINENO'], x['O_UP'], x['Target_Station']), -1), axis=1)

In [85]:
# test['Source_Station_encode'] = test.apply(lambda x:station_map.get((x['O_LINENO'], x['O_UP'], x['Source_Station']),-1), axis=1)
# test['Target_Station_encode'] = test.apply(lambda x:station_map.get((x['O_LINENO'], x['O_UP'], x['Target_Station']),-1), axis=1)

In [45]:
# line['region_encode'] = line.apply(lambda x:region.get((x['Source_Station_encode'], x['Target_Station_encode']), -1), axis=1)

In [50]:
# Feature columns: every column of `line` except the ones listed below
# (which include the target, Diff_Time).
col = [
    name
    for name in line
    if name not in [
        'Unnamed: 0', 'ID2', 's_x', 's_y', 'e_x', 'e_y', 'O_LINENO', 'O_UP',
        'Source_Station', 'Target_Station', 'O_TIME', 'aver_v', 'max_v', 'Diff_Time',
    ]
]

In [68]:
# Target vector: the Diff_Time column as a NumPy array.
y = line['Diff_Time'].values

In [77]:
# Integer noise with the same shape as `y`. np.random.randint excludes the upper
# bound, so the values fall in -3..2. No seed is set, so every run draws different values.
noice = np.random.randint(-3,3,y.shape)

In [78]:
# Inspect the generated noise array.
noice

array([-3, -1,  0, ..., -2,  1,  2])

In [46]:
# Map each test row's (Source_Station_encode, Target_Station_encode) pair to its
# id in `region`; pairs that are missing from `region` get -1.
# The pairs are produced by zipping the two columns rather than by a row-wise
# `apply(axis=1)`, which builds a Series for every row and is far slower on a
# frame of this size while yielding the same values.
test['region_encode'] = [
    region.get(pair, -1)
    for pair in zip(test['Source_Station_encode'], test['Target_Station_encode'])
]

In [87]:
# Persist the current training frame as a pickle.
with open('./data/train-id4-crowd-grid_hadxu.txt', 'wb') as data_file:
    pickle.dump(line, data_file)
    
# Persist the current test frame as a pickle.
with open('./data/test-id4-crowd-grid_hadxu.txt', 'wb') as data_file:
    pickle.dump(test, data_file)

# Disabled alternative: reload the `_qrf` variants of the two pickles instead.
# with open('./data/train-id4-crowd-grid_qrf.txt', 'rb') as data_file:
#     line = pickle.load(data_file)

# with open('./data/test-id4-crowd-grid_qrf.txt', 'rb') as data_file:
#     test = pickle.load(data_file)

In [19]:
# Attach the line-embedding columns to both frames by matching O_LINENO to the
# embedding table's `line` column.
line = pd.merge(line, line_embedding, left_on='O_LINENO', right_on='line')
# The test frame uses a left join: pd.merge defaults to an inner join, which
# would silently drop every test row whose O_LINENO is absent from the
# embedding table, and each test row must survive to receive a prediction.
# Rows without a match keep NaN in the embedding columns.
test = pd.merge(test, line_embedding, left_on='O_LINENO', right_on='line', how='left')

In [23]:
# Cast the float64/int64 columns of both frames to their 32-bit counterparts.
line  = downcast_dtypes(line)
test = downcast_dtypes(test)

In [88]:
# Write the training frame to the `_hadxu` pickle (same path as the earlier dump cell, so it is overwritten).
with open('./data/train-id4-crowd-grid_hadxu.txt', 'wb') as data_file:
    pickle.dump(line, data_file)
    
# Write the test frame to the `_hadxu` pickle (also overwriting the earlier dump).
with open('./data/test-id4-crowd-grid_hadxu.txt', 'wb') as data_file:
    pickle.dump(test, data_file)

In [23]:
# List the column labels of the training frame.
line.columns

Index(['Unnamed: 0', 'ID', 'O_LINENO', 'O_UP', 'Source_Station',
       'Target_Station', 'Distance', 'distance2', 'O_TIME', 'hour', 'is_peek',
       'weekday', 'is_workday', 'dws', 'nws', 'dts', 'nts', 'is_rain', 'max_v',
       'h_aver_diff', 'h_aver_d', 'h_aver_v', 'TERMINALNO', 'new_dist', 's_x',
       's_y', 'e_x', 'e_y', 'is_crowd', 's_ij', 'e_ij', 'ID2',
       'grid_aver_diff', 'grid_aver_d', 'Source_Station_encode',
       'Target_Station_encode', 'hour_weekday', 'hour_crowd', 'hour_s',
       'hour_e'],
      dtype='object')

In [89]:
def zuhe(line):
    """Add four combined features built by joining the digits of ``hour`` with another column.

    New / overwritten columns (all stored as integers):

    * ``hour_weekday`` = digits of ``hour`` followed by digits of ``weekday``
    * ``hour_crowd``   = digits of ``hour`` followed by digits of ``is_crowd``
    * ``hour_s``       = digits of ``hour`` followed by digits of ``s_ij``
    * ``hour_e``       = digits of ``hour`` followed by digits of ``e_ij``

    The source columns are converted to strings for the concatenation and
    back to integers afterwards.  ``is_peek`` is not touched.  The frame is
    modified in place and returned.
    """
    source_cols = ['hour', 'weekday', 's_ij', 'e_ij', 'is_crowd']
    # combined column -> column whose digits are appended to `hour`
    # (insertion order fixes the order in which new columns are added)
    combos = {
        'hour_weekday': 'weekday',
        'hour_crowd': 'is_crowd',
        'hour_s': 's_ij',
        'hour_e': 'e_ij',
    }

    # Work on the textual form so that `+` concatenates instead of adding.
    for name in source_cols:
        line[name] = line[name].astype(str)

    for combined, partner in combos.items():
        line[combined] = line['hour'] + line[partner]

    # Restore the sources and turn the concatenated strings into integers.
    for name in source_cols + list(combos):
        line[name] = line[name].astype(int)

    return line

# ---- Training matrix ---------------------------------------------------------
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('./data/train-id4-crowd-grid_hadxu.txt', 'rb') as data_file:
    line = pickle.load(data_file)
line = zuhe(line)  # original note: 13963746 (presumably the row count before filtering)

# Keep only rows whose target is below 600.
line = line[(line['Diff_Time']<600)]

# NOTE(review): `.values` below turns the frame into a plain NumPy array, which
# probably discards these categorical dtypes before LightGBM sees the data --
# confirm whether the casts have any effect.
line['ID'] = line['ID'].astype('category')
line['s_ij'] = line['s_ij'].astype('category')
line['e_ij'] = line['e_ij'].astype('category')

line['Source_Station_encode'] = line['Source_Station_encode'].astype('category')
line['Target_Station_encode'] = line['Target_Station_encode'].astype('category')

train = line
col = [c for c in train if
       c not in ['Unnamed: 0','ID2','s_x', 's_y', 'e_x', 'e_y','O_LINENO', 'O_UP', 'Source_Station', 'Target_Station','O_TIME', 'aver_v','max_v', 'Diff_Time']]
print(col)
X = train[col].values
y = train['Diff_Time'].values
print(X.shape)

# A shuffled 90/10 train/dev split used to be made here (disabled in the
# original); the hold-out split is now made inside the training functions.

# ---- Test matrix -------------------------------------------------------------
with open('./data/test-id4-crowd-grid_hadxu.txt', 'rb') as data_file:
    test = pickle.load(data_file)
test = zuhe(test)

col1 = [c for c in test if
       c not in ['Unnamed: 0','ID2' , 's_x', 's_y', 'e_x', 'e_y','O_LINENO', 'O_UP', 'Source_Station', 'Target_Station', 'O_TIME', 'aver_v', 'max_v',
                 'Diff_Time','Distance1', 'distance2','TERMINALNO', 'new_dist']]
print(col1)

# The train and test exclusion lists are maintained separately. Because the
# model is fed bare arrays, a difference in column order or membership would
# not be detected later and would silently pair features with the wrong values.
assert col1 == col, (
    "train/test feature columns differ:\n  train: %s\n  test:  %s" % (col, col1)
)

X_test = test[col1].values
print(X_test.shape)

['ID', 'Distance', 'hour', 'is_peek', 'weekday', 'is_workday', 'dws', 'nws', 'dts', 'nts', 'is_rain', 'h_aver_diff', 'h_aver_d', 'h_aver_v', 'is_crowd', 's_ij', 'e_ij', 'grid_aver_diff', 'grid_aver_d', 'Source_Station_encode', 'Target_Station_encode', 'hour_weekday', 'hour_crowd', 'hour_s', 'hour_e']
(13733553, 25)
['ID', 'Distance', 'hour', 'is_peek', 'weekday', 'is_workday', 'dws', 'nws', 'dts', 'nts', 'is_rain', 'h_aver_diff', 'h_aver_d', 'h_aver_v', 'is_crowd', 's_ij', 'e_ij', 'grid_aver_diff', 'grid_aver_d', 'Source_Station_encode', 'Target_Station_encode', 'hour_weekday', 'hour_crowd', 'hour_s', 'hour_e']
(490380, 25)


In [25]:
def train2(X,y,X_test,n_folds):
    """Train one LightGBM regressor per K-fold split and average their test predictions.

    Parameters
    ----------
    X : array
        Training features; rows are selected with integer index arrays (``X[idx]``).
    y : array
        Training targets, one per row of ``X``.
    X_test : array
        Feature matrix to predict for.
    n_folds : int
        Number of ``KFold`` splits (shuffled, ``random_state=2018``).

    Returns
    -------
    result : numpy.ndarray, shape ``(len(X_test), 1)``
        Mean of the per-fold test predictions.
    res : numpy.ndarray, shape ``(len(X_test), n_folds)``
        Column ``k`` holds the test prediction of the model trained on fold ``k``.
    """
    from sklearn.model_selection import KFold

    def mse_v2(y_pred, train_data):
        """Custom eval returning RMSE scaled by 600; unused while ``feval`` is disabled below."""
        y_true = train_data.get_label()
        return 'rmse', 600*(np.mean((y_pred-y_true)**2))**0.5, False

    # Identical for every fold, so built once.
    # NOTE(review): the LightGBM docs state that bagging ('subsample') only
    # takes effect when a bagging frequency > 0 is also set; none is set here,
    # so 'subsample' likely has no effect -- confirm.
    params = {'num_leaves':60,
              'max_depth':8,
              'seed':2018,
              'colsample_bytree':0.8,
              'subsample':0.9,
              'num_threads':4,
              'n_estimators':25000,
              'learning_rate': 0.1,
              'objective':'regression_l2',
              # 'objective': 'xentropy',
              'metric':'rmse',
              'device_type':'gpu',
              }

    n_test = len(X_test)
    res = np.zeros((n_test, n_folds))
    result = np.zeros((n_test, 1))

    print("folding")
    kf = KFold(n_splits=n_folds,shuffle=True,random_state=2018)
    print("training")
    for count, (tr_idx, val_idx) in enumerate(kf.split(y)):
        X_train, y_train = X[tr_idx], y[tr_idx]
        X_dev, y_dev = X[val_idx], y[val_idx]

        print(X_train.shape)
        print(X_dev.shape)

        lgb_train = lgb.Dataset(X_train, y_train)#feature_name=col,categorical_feature=['ID']
        lgb_eval = lgb.Dataset(X_dev, y_dev, reference=lgb_train)
        print(count)
        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of the pre-4.0 LightGBM `train` API; newer releases expect
        # callbacks instead -- confirm the installed version.
        # A fresh dict is passed so no fold can be affected by another's params.
        gbm = lgb.train(dict(params), lgb_train, early_stopping_rounds=200, valid_sets=lgb_eval,
                        verbose_eval=50,
                        # feval=mse_v2
                        )

        # Column vector sized from the prediction itself rather than a
        # hard-coded row count, so any test-set size works.
        resultx = np.reshape(gbm.predict(X_test, num_iteration=gbm.best_iteration), (-1, 1))
        print(resultx.shape,result.shape)

        res[:, count] = resultx[:, 0]
        result = result + resultx

    # Average over the folds.
    result /= n_folds
    return result, res

In [90]:
def train_1cv(X,y,X_test):
    """Train a single LightGBM regressor with an 80/20 hold-out split and predict ``X_test``.

    Parameters
    ----------
    X : array
        Training features.
    y : array
        Training targets, one per row of ``X``.
    X_test : array
        Feature matrix to predict for.

    Returns
    -------
    numpy.ndarray, shape ``(len(X_test), 1)``
        Predictions at the best iteration found by early stopping on the hold-out part.
    """
    from sklearn.model_selection import train_test_split

    def mse_v2(y_pred, train_data):
        """Custom eval returning RMSE scaled by 600; unused while ``feval`` is disabled below."""
        y_true = train_data.get_label()
        return 'rmse', 600*(np.mean((y_pred-y_true)**2))**0.5, False

    result = np.zeros((len(X_test), 1))
    print("training")

    # Fixed random_state keeps the hold-out split the same across runs.
    X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2, random_state=42)

    print(X_train.shape)
    print(X_dev.shape)

    lgb_train  = lgb.Dataset(X_train, y_train,)#feature_name=col,categorical_feature=['ID']
    lgb_eval = lgb.Dataset(X_dev, y_dev,reference=lgb_train)
    # NOTE(review): the LightGBM docs state that bagging ('subsample') only
    # takes effect when a bagging frequency > 0 is also set; none is set here,
    # so 'subsample' likely has no effect -- confirm.
    params = {'num_leaves':60,
              'max_depth':8,
              'seed':2018,
              'colsample_bytree':0.8,
              'subsample':0.9,
              'num_threads':4,
              'n_estimators':20000,
              'learning_rate': 0.1,
              'objective':'regression_l2',
              # 'objective': 'xentropy',
              'metric':'rmse',
              'device_type':'gpu',
              }
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of the pre-4.0 LightGBM `train` API; newer releases expect
    # callbacks instead -- confirm the installed version.
    gbm = lgb.train(params, lgb_train,early_stopping_rounds=200,valid_sets=lgb_eval,
                    verbose_eval=50,
                    # feval=mse_v2
                    )

    # Column vector sized from the prediction itself rather than a hard-coded
    # row count, so any test-set size works.
    resultx = np.reshape(gbm.predict(X_test, num_iteration=gbm.best_iteration), (-1, 1))
    print(resultx.shape,result.shape)
    result = result + resultx
    return result

In [None]:
# Single hold-out run; `pred` receives the test predictions as a column vector.
pred = train_1cv(X,y,X_test)

training
(10986842, 25)
(2746711, 25)




Training until validation scores don't improve for 200 rounds.
[50]	valid_0's rmse: 39.7753
[100]	valid_0's rmse: 39.0458
[150]	valid_0's rmse: 38.7849
[200]	valid_0's rmse: 38.629
[250]	valid_0's rmse: 38.5078
[300]	valid_0's rmse: 38.4036
[350]	valid_0's rmse: 38.3226
[400]	valid_0's rmse: 38.2337
[450]	valid_0's rmse: 38.1577
[500]	valid_0's rmse: 38.0921
[550]	valid_0's rmse: 38.0391
[600]	valid_0's rmse: 37.9957
[650]	valid_0's rmse: 37.9447
[700]	valid_0's rmse: 37.8938
[750]	valid_0's rmse: 37.8498
[800]	valid_0's rmse: 37.81
[850]	valid_0's rmse: 37.7671
[900]	valid_0's rmse: 37.738
[950]	valid_0's rmse: 37.6957
[1000]	valid_0's rmse: 37.6625
[1050]	valid_0's rmse: 37.6277
[1100]	valid_0's rmse: 37.5952
[1150]	valid_0's rmse: 37.5608
[1200]	valid_0's rmse: 37.5302
[1250]	valid_0's rmse: 37.5025
[1300]	valid_0's rmse: 37.4797
[1350]	valid_0's rmse: 37.4512
[1400]	valid_0's rmse: 37.4213
[1450]	valid_0's rmse: 37.393
[1500]	valid_0's rmse: 37.3662
[1550]	valid_0's rmse: 37.3466
[

In [None]:
# 10-fold run: `pred` is rebound to the fold-averaged test prediction and `res`
# holds one column of test predictions per fold.
pred, res = train2(X,y,X_test, n_folds=10)

folding
training
(12360197, 25)
(1373356, 25)
0




Training until validation scores don't improve for 200 rounds.
[50]	valid_0's rmse: 39.7144
[100]	valid_0's rmse: 38.9846
[150]	valid_0's rmse: 38.7258
[200]	valid_0's rmse: 38.5834
[250]	valid_0's rmse: 38.4587
[300]	valid_0's rmse: 38.338
[350]	valid_0's rmse: 38.2422
[400]	valid_0's rmse: 38.1602
[450]	valid_0's rmse: 38.0905
[500]	valid_0's rmse: 38.0255
[550]	valid_0's rmse: 37.9699
[600]	valid_0's rmse: 37.9111
[650]	valid_0's rmse: 37.8529
[700]	valid_0's rmse: 37.8092
[750]	valid_0's rmse: 37.762
[800]	valid_0's rmse: 37.7143
[850]	valid_0's rmse: 37.6723
[900]	valid_0's rmse: 37.6387
[950]	valid_0's rmse: 37.6008
[1000]	valid_0's rmse: 37.5703
[1050]	valid_0's rmse: 37.5346
[1100]	valid_0's rmse: 37.5022
[1150]	valid_0's rmse: 37.4757
[1200]	valid_0's rmse: 37.4526
[1250]	valid_0's rmse: 37.4311
[1300]	valid_0's rmse: 37.4073
[1350]	valid_0's rmse: 37.3806
[1400]	valid_0's rmse: 37.3478
[1450]	valid_0's rmse: 37.3193
[1500]	valid_0's rmse: 37.2951
[1550]	valid_0's rmse: 37.270

[13150]	valid_0's rmse: 35.8461
[13200]	valid_0's rmse: 35.8446
[13250]	valid_0's rmse: 35.8428
[13300]	valid_0's rmse: 35.8412
[13350]	valid_0's rmse: 35.8394
[13400]	valid_0's rmse: 35.8365
[13450]	valid_0's rmse: 35.8347
[13500]	valid_0's rmse: 35.8325
[13550]	valid_0's rmse: 35.83
[13600]	valid_0's rmse: 35.828
[13650]	valid_0's rmse: 35.8264
[13700]	valid_0's rmse: 35.8233
[13750]	valid_0's rmse: 35.8202
[13800]	valid_0's rmse: 35.8177
[13850]	valid_0's rmse: 35.8165
[13900]	valid_0's rmse: 35.814
[13950]	valid_0's rmse: 35.8113
[14000]	valid_0's rmse: 35.8097
[14050]	valid_0's rmse: 35.8072
[14100]	valid_0's rmse: 35.8055
[14150]	valid_0's rmse: 35.804
[14200]	valid_0's rmse: 35.8031
[14250]	valid_0's rmse: 35.8016
[14300]	valid_0's rmse: 35.8003
[14350]	valid_0's rmse: 35.7983
[14400]	valid_0's rmse: 35.7965
[14450]	valid_0's rmse: 35.7936
[14500]	valid_0's rmse: 35.7925
[14550]	valid_0's rmse: 35.7914
[14600]	valid_0's rmse: 35.7907
[14650]	valid_0's rmse: 35.7893
[14700]	valid

In [25]:
import math  # unused in this cell now; kept in case cells outside this view rely on it

# Reload the test frame from the original pickle.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('./data/test-id4-crowd-grid4.txt', 'rb') as data_file:
    test = pickle.load(data_file)
test = zuhe(test)

# Keep the columns needed below. `.copy()` makes the selection an independent
# frame so the column assignments further down do not trigger
# SettingWithCopyWarning.
test = test[['ID', 'new_dist',  'O_LINENO', 'O_UP', 'Source_Station', 'Target_Station',
       'Distance', 'distance2', 'O_TIME', 'hour', 'is_peek', 'weekday',
       'is_workday', 'dws', 'nws', 'dts', 'nts', 'is_rain', 'max_v',
       'h_aver_diff', 'h_aver_d', 'h_aver_v', 'TERMINALNO',
       'is_crowd', 's_ij', 'e_ij','grid_aver_diff', 'grid_aver_d', 'hour_weekday', 'hour_crowd', 'hour_s', 'hour_e']].copy()
# Positional rename: `new_dist` becomes `Distance` and the original `Distance`
# becomes `Distance1`; every other label is unchanged.
test.columns=['ID', 'Distance',  'O_LINENO', 'O_UP', 'Source_Station', 'Target_Station',
       'Distance1', 'distance2', 'O_TIME', 'hour', 'is_peek', 'weekday',
       'is_workday', 'dws', 'nws', 'dts', 'nts', 'is_rain', 'max_v',
       'h_aver_diff', 'h_aver_d', 'h_aver_v', 'TERMINALNO',
       'is_crowd', 's_ij', 'e_ij','grid_aver_diff', 'grid_aver_d', 'hour_weekday', 'hour_crowd', 'hour_s', 'hour_e']
col1 = [c for c in test if
       c not in ['Unnamed: 0','ID2' , 's_x', 's_y', 'e_x', 'e_y','O_LINENO', 'O_UP', 'Source_Station', 'Target_Station', 'O_TIME', 'aver_v', 'max_v',
                 'Diff_Time','Distance1', 'distance2','TERMINALNO', 'new_dist']]
print(col1)

# `pred` comes from a training cell as a column vector; flatten it so it is
# assigned as an ordinary 1-D column, and fail loudly on a size mismatch.
# NOTE(review): the predictions were made on the `_hadxu` test pickle, while
# this frame is reloaded from the `grid4` pickle -- confirm both have the same
# row order.
pred_flat = np.asarray(pred).ravel()
assert len(pred_flat) == len(test), (
    "prediction count %d does not match test rows %d" % (len(pred_flat), len(test))
)
test['pred1'] = pred_flat
test['pred2'] = pred_flat

# Submission-side view of the predictions, with the key columns renamed
# (positionally) to the names used by the to-be-predicted file.
sub1 = test[['O_LINENO','TERMINALNO', 'O_UP','Source_Station','Target_Station','O_TIME','pred1','pred2','Distance','Distance1']].copy()
sub1['O_TIME'] = pd.to_datetime(sub1['O_TIME'],format='%Y-%m-%d %H:%M:%S')
sub1.columns = ['LINE','TERMINALNO','UP','pred_start_stop_ID','pred_end_stop_ID','realTime','pred1','pred2','Distance','Distance1']
# drop=True discards the old index directly instead of materialising it as a
# column and deleting it afterwards.
sub1 = sub1.reset_index(drop=True)

# Left-join the predictions onto the rows that have to be predicted.
sub=pd.read_csv("./toBePredicted_0607_segment.csv", sep=",")
sub['realTime'] = pd.to_datetime(sub['realTime'],format='%Y-%m-%d %H:%M:%S')
sub2 = sub[['LINE','TERMINALNO','UP','pred_start_stop_ID','pred_end_stop_ID','realTime','distance']]
sub2=pd.merge(sub2,sub1,on=['LINE','TERMINALNO','UP','pred_start_stop_ID','pred_end_stop_ID','realTime'],how='left')

# Scale the prediction by the ratio of the two distance columns.
sub2['div_dist'] = sub2['Distance'] / sub2['Distance1']
sub2['new_pred'] = sub2['div_dist'] * sub2['pred1']

# Where the ratio is NaN, fall back to the unscaled prediction. This is the
# vectorised form of the former per-row loop.
ratio_missing = sub2['div_dist'].isna()
sub2.loc[ratio_missing, 'new_pred'] = sub2.loc[ratio_missing, 'pred1']

sub2.to_csv('./toBePredicted_0807_result.csv',sep=",",index=False)#0605 29.2467

print('finished')

['ID', 'Distance', 'hour', 'is_peek', 'weekday', 'is_workday', 'dws', 'nws', 'dts', 'nts', 'is_rain', 'h_aver_diff', 'h_aver_d', 'h_aver_v', 'is_crowd', 's_ij', 'e_ij', 'grid_aver_diff', 'grid_aver_d', 'hour_weekday', 'hour_crowd', 'hour_s', 'hour_e']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


finished


In [92]:
# Load two earlier result files for blending.
# NOTE: the variable names are crossed relative to the file names (`x` holds the
# *_0813y file, `y` the *_0813x file), and `y` rebinds the name used for the
# training target earlier in the notebook.
x = pd.read_csv('toBePredicted_0813y.csv')
y = pd.read_csv('toBePredicted_0813x.csv')

In [94]:
# Shape of the prediction column; used to size the noise arrays below.
shape = x['new_pred'].shape

In [95]:
# Display the shape.
shape

(490380,)

In [97]:
# Blend the two result files: add small integer noise to each prediction
# column and average them, storing the result back in x['new_pred'].
# A seeded RandomState replaces the unseeded global np.random calls so that the
# file written from this result can be regenerated exactly.
noise_rng = np.random.RandomState(2018)
# NOTE(review): randint excludes the upper bound, so these draws cover -3..2
# and -2..1 and average -0.5 rather than 0 -- confirm the downward shift is intended.
noise_x = noise_rng.randint(-3,3, shape)
noise_y = noise_rng.randint(-2,2, shape)

x['new_pred'] = (x['new_pred']+noise_x + y['new_pred']+noise_y) / 2

In [99]:
# Write the blended frame to disk without the index column.
x.to_csv('hadxu_noise_emsemble.csv', index=False)