In [1]:
# -*- coding:utf8 -*-
import gc
import re
import subprocess
import time
import warnings
from math import sqrt

import lightgbm as lgb
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

# Alternative input locations, kept for reference:
# path_train = "/data/dm/train.csv"  # training file
# path_test = "/data/dm/test.csv"  # test file
path_train = "../resource/PINGAN-2018-train_demo.csv"
path_test = "../resource/PINGAN-2018-test_demo.csv"

# Prediction output path is model/xx.csv; there must be exactly one file and it must be a CSV.
path_test_out = "model/"

# Every column name listed for the raw data and the derived features.
all_feature_list = [
    'CALLSTATE', 'DIRECTION', 'HEIGHT', 'LATITUDE', 'LONGITUDE', 'SPEED',
    'TERMINALNO', 'TIME', 'TRIP_ID', 'Y',
    'time', 'date', 'hour', 'minute',
    'trip_max',
    'lon_max', 'lon_min', 'lon',
    'lat_max', 'lat_min', 'lat',
    'heg_max', 'heg_min', 'heg_mean', 'heg',
    'vol',
    'sp_max', 'sp_mean',
    'call0', 'call1', 'call_ratio_0', 'call_ratio_1',
    'dis', 'ave_dri_time', 'dri_time',
]

# Columns passed to the model as inputs.
use_feature_list = [
    'trip_max',
    'lon_max', 'lon_min', 'lon_50',
    'lat_max', 'lat_min', 'lat_50',
    'heg_max', 'heg_min', 'heg_mean', 'heg_50',
    'sp_max', 'sp_mean', 'sp_50',
    'dis', 'avg_dis',
    'ave_dri_time', 'dri_time',
]




# Inspect memory usage
# Maps the labels looked for in the command output to the keys of the returned dict.
keydic = {"MemTotal":"TotalMem","MemFree":"FreeMem","MemAvailable":"AvaiableMem","Cached":"Cached"}
def command(command):
    """Run a shell command and collect the memory figures found in its output.

    Every output line is split on whitespace. When the first field, minus its
    last character (the trailing ``:`` of ``MemTotal:``-style labels), is a key
    of ``keydic``, the second field is read as an integer, divided by 1024**2
    and stored, formatted with two decimals, under the mapped name.

    :param command: command line handed to the shell as-is (``shell=True``);
        it must come from a trusted source.
    :return: dict mapping ``keydic`` values to ``"%.2f"`` strings; lines that are
        blank, unlabelled or carry a non-integer value are skipped.
    """
    # run() waits for the process and closes its pipe; stderr is folded into stdout.
    completed = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    resultDic = {}
    for line in completed.stdout.decode("utf-8", errors="replace").splitlines():
        # str.split() splits on runs of whitespace; re.split("\s*", ...) splits
        # between every character on Python >= 3.7, so no label ever matched.
        fields = line.split()
        if len(fields) < 2:
            continue
        label = fields[0][:-1]
        if label not in keydic:
            continue
        try:
            amount = int(fields[1])
        except ValueError:
            continue
        resultDic[keydic[label]] = "%.2f" % (amount / (1024 ** 2))
    return resultDic

def load_data(path_train,path_test):
    """Read the training and test CSV files.

    :param path_train: path of the training CSV
    :param path_test: path of the test CSV
    :return: tuple ``(train_data, test_data)`` of DataFrames
    """
    frames = tuple(pd.read_csv(csv_path) for csv_path in (path_train, path_test))
    return frames

# def read_csv():
#     """
#     File-reading module; see `columns` below for the header names.
#     :return:
#     """
#     # for filename in os.listdir(path_train):
#     tempdata = pd.read_csv(path_train)
#     tempdata.columns = ["TERMINALNO", "TIME", "TRIP_ID", "LONGITUDE", "LATITUDE", "DIRECTION", "HEIGHT", "SPEED",
#                         "CALLSTATE", "Y"]

# Time handling
def time_datetime(value):
    """Return the local time of epoch timestamp ``value`` as the integer YYYYMMDDHHMM."""
    return int(time.strftime('%Y%m%d%H%M', time.localtime(value)))

def time_date(value):
    """Return the local calendar date of epoch timestamp ``value`` as the integer YYYYMMDD."""
    return int(time.strftime('%Y%m%d', time.localtime(value)))

def time_hour(value):
    """Return the local hour of day (0-23) of epoch timestamp ``value`` as an int."""
    return int(time.strftime('%H', time.localtime(value)))

def time_minute(value):
    """Return the local minute (0-59) of epoch timestamp ``value`` as an int."""
    return int(time.strftime('%M', time.localtime(value)))
# Driving-duration conversion
def f(x):
    """Return 0 when ``x`` is 20 or more, otherwise return ``x`` unchanged."""
    return 0 if x >= 20 else x

In [2]:
# all feature
# Builds `feature`: one row per TERMINALNO (train and test files together) holding
# aggregate statistics of the raw records. Each stage reads only the columns it
# needs and releases them afterwards to keep peak memory low.
#
# NOTE(review): apart from stage 1, duplicates are dropped on (TERMINALNO, value),
# so the mean / median below are taken over the distinct values seen per terminal
# rather than over every record. This matches the previous behaviour; confirm it
# is intended.

# Test ids are shifted past the largest train id so the two files never share a TERMINALNO.
TRAIN_ID_MAX = pd.read_csv(path_train, usecols=['TERMINALNO'])['TERMINALNO'].max() + 10


def _read_combined(columns, dedup_subset=None):
    """Read `columns` from the train and test CSVs and stack them into one frame.

    Test TERMINALNO values are offset by TRAIN_ID_MAX. Duplicate rows are dropped,
    judged on `dedup_subset` (on every column when it is None). The result gets a
    fresh unique index so later index-aligned operations are unambiguous.
    """
    train_part = pd.read_csv(path_train, usecols=columns)
    test_part = pd.read_csv(path_test, usecols=columns)
    test_part['TERMINALNO'] = test_part['TERMINALNO'] + TRAIN_ID_MAX
    combined = pd.concat([train_part, test_part], ignore_index=True)
    return combined.drop_duplicates(subset=dedup_subset)


def _add_column_stats(column, stats, fill_value=None):
    """Add per-terminal statistics of one raw column to `feature`.

    `stats` is a list of (feature_name, groupby_method_name) pairs. Values are
    matched to `feature` rows by TERMINALNO, not by row position, so a terminal
    missing from an aggregate yields NaN instead of shifting every row below it.
    """
    data = _read_combined(['TERMINALNO', column])
    if fill_value is not None:
        data = data.fillna(fill_value)
    grouped = data.groupby('TERMINALNO')[column]
    for name, reducer in stats:
        feature[name] = feature['TERMINALNO'].map(getattr(grouped, reducer)())


# 1.trip_max
data = _read_combined(['TERMINALNO', 'TIME', 'TRIP_ID'], dedup_subset=['TERMINALNO', 'TIME'])
feature = data.groupby('TERMINALNO')['TRIP_ID'].max().rename('trip_max').reset_index()
del data
gc.collect()

# 2.lon_max lon_min lon_50  (quantile() defaults to the median)
_add_column_stats('LONGITUDE', [('lon_max', 'max'), ('lon_min', 'min'), ('lon_50', 'quantile')])
gc.collect()

# 3.lat_max lat_min lat_50
_add_column_stats('LATITUDE', [('lat_max', 'max'), ('lat_min', 'min'), ('lat_50', 'quantile')])
gc.collect()

# 4.heg_max heg_min heg_mean heg_50  (missing values count as 0.0)
_add_column_stats(
    'HEIGHT',
    [('heg_max', 'max'), ('heg_min', 'min'), ('heg_mean', 'mean'), ('heg_50', 'quantile')],
    fill_value=0.0)
gc.collect()

# 5.sp_max sp_mean sp_50  (missing values count as 0.0)
_add_column_stats(
    'SPEED',
    [('sp_max', 'max'), ('sp_mean', 'mean'), ('sp_50', 'quantile')],
    fill_value=0.0)
gc.collect()

# 7.dis,avg_dis
data = _read_combined(['TERMINALNO', 'TIME', 'LONGITUDE', 'LATITUDE']).fillna(0.0)
# Order each terminal's records by time
data = data.sort_values(by=['TERMINALNO', 'TIME'])
by_terminal = data.groupby('TERMINALNO')
# Coordinate change from the previous record of the same terminal (not split by
# trip); a terminal's first record has no predecessor, so its change is 0.
lat_step = by_terminal['LATITUDE'].diff().fillna(0.0)
lon_step = by_terminal['LONGITUDE'].diff().fillna(0.0)
# Straight-line step length in raw coordinate units
step_dis = np.sqrt(lat_step ** 2 + lon_step ** 2)
feature['dis'] = feature['TERMINALNO'].map(step_dis.groupby(data['TERMINALNO']).sum())
feature['avg_dis'] = feature['dis'] / feature['trip_max']

del data, by_terminal, lat_step, lon_step, step_dis
gc.collect()


96

In [3]:
# 8.dri_time ave_dri_time dri_time_trip_max
# Adds to `feature` the total driving time per terminal (`dri_time`) and that
# total divided by trip_max (`ave_dri_time`).
train_data = pd.read_csv(path_train, usecols=['TERMINALNO', 'TIME', 'TRIP_ID'])
test_data = pd.read_csv(path_test, usecols=['TERMINALNO', 'TIME', 'TRIP_ID'])

# Same id offset as the other feature stages, so test terminals stay distinct from train ones.
test_data['TERMINALNO'] = test_data['TERMINALNO'] + TRAIN_ID_MAX
data = (
    # ignore_index gives every row a unique label; plain concat would repeat 0..n-1 twice
    pd.concat([train_data, test_data], ignore_index=True)
    .drop_duplicates(subset=['TERMINALNO', 'TIME'])
    .fillna(0.0)
    # Sort by TERMINALNO and TIME so diff() compares consecutive records
    .sort_values(['TERMINALNO', 'TIME'])
)

# Gap to the previous record of the same terminal, divided by 60 (minutes,
# presumably TIME is in epoch seconds - TODO confirm). A terminal's first record
# has no predecessor and gets 0.
gap = data.groupby('TERMINALNO')['TIME'].diff().fillna(0.0) / 60
# f() zeroes gaps of 20 or more, so long pauses do not count as driving time.
gap = gap.apply(f)
dri_time_sum = gap.groupby(data['TERMINALNO']).sum().rename('dri_time').reset_index()

# Drop columns left by an earlier run of this cell; otherwise the merge below
# would produce dri_time_x / dri_time_y and feature['dri_time'] would not exist.
feature = feature.drop(['dri_time', 'ave_dri_time'], axis=1, errors='ignore')
feature = pd.merge(feature, dri_time_sum, how='left', on='TERMINALNO')
# 6. Average driving time per trip
feature['ave_dri_time'] = feature['dri_time'] / feature['trip_max']

del train_data, test_data, data, gap, dri_time_sum
gc.collect()


138

In [4]:
# List the columns built so far in `feature`.
feature.columns

Index(['TERMINALNO', 'trip_max', 'lon_max', 'lon_min', 'lon_50', 'lat_max',
       'lat_min', 'lat_50', 'heg_max', 'heg_min', 'heg_mean', 'heg_50',
       'sp_max', 'sp_mean', 'sp_50', 'dis', 'avg_dis', 'dri_time',
       'ave_dri_time'],
      dtype='object')

In [5]:
# Normalisation
# Min-max scale every model input to [0, 1]. min and max are computed once per
# column; the previous per-element lambdas recomputed both for every row.
# A constant column yields NaN (0 / 0), as before.
for column in use_feature_list:
    low, high = feature[column].min(), feature[column].max()
    feature[column] = (feature[column] - low) / (high - low)
print("data normalization..")
print("Feature End. feature shape:" + str(feature.shape))
print("generate train & test set")

# train_Y: one label row per (TERMINALNO, Y) pair of the training file
train_Y = pd.read_csv(path_train, usecols=['TERMINALNO', 'Y']).drop_duplicates()
train_Y['Y'] = train_Y['Y'].astype(float)
# Transform Y. Box-Cox needs strictly positive input, so non-positive labels are
# replaced by a small positive value. A single .loc call writes into train_Y
# itself; the chained form `.loc[:, 'Y'][mask] = ...` may write to a temporary.
train_Y.loc[train_Y['Y'] <= 0, 'Y'] = 0.00001
# boxcox returns (transformed values, fitted lambda); only the values are kept.
train_Y['Y'] = stats.boxcox(train_Y['Y'].values)[0]
gc.collect()

# Drop a Y column left by an earlier run of this cell so the merge stays clean.
feature = feature.drop('Y', axis=1, errors='ignore')
feature = pd.merge(feature, train_Y, how='left', on='TERMINALNO')
# Test terminals were shifted by TRAIN_ID_MAX, so every id below it is a train
# terminal. Explicit copies keep later column writes on train / test from
# targeting slices of `feature`.
is_train = feature['TERMINALNO'] < TRAIN_ID_MAX
train = feature[is_train].copy()
test = feature[~is_train].copy()
# Scale the transformed label to [0, 1] on the training rows only.
y_low, y_high = train['Y'].min(), train['Y'].max()
train['Y'] = (train['Y'] - y_low) / (y_high - y_low)

# Split into training and validation sets
train_train, train_val = train_test_split(train, test_size=0.2, random_state=42)
print("train_train_shape:" + str(train_train.shape) + "  train_val_shape:" + str(train_val.shape))
print("model training")
print("model training")

data normalization..
Feature End. feature shape:(200, 19)
generate train & test set
train_train_shape:(80, 20)  train_val_shape:(20, 20)
model training


In [6]:
# Preview the first rows only; displaying the whole frame floods the notebook output.
feature.head(10)

Unnamed: 0,TERMINALNO,trip_max,lon_max,lon_min,lon_50,lat_max,lat_min,lat_50,heg_max,heg_min,heg_mean,heg_50,sp_max,sp_mean,sp_50,dis,avg_dis,dri_time,ave_dri_time,Y
0,1,0.198198,0.936877,0.947739,0.909634,9.769472e-01,0.879034,0.883942,0.064576,0.188562,0.016110,0.014583,0.393851,0.491509,0.471546,2.388804e-01,5.178416e-02,0.118051,0.012199,-3840.658222
1,2,0.292793,0.748756,0.720134,0.670771,3.836709e-01,0.060535,0.038049,0.153262,0.113481,0.024449,0.009298,0.491654,0.479332,0.479058,7.993322e-01,9.694052e-02,0.455965,0.041473,-3840.658222
2,3,0.162162,0.671824,0.709979,0.660859,2.934237e-01,0.040572,0.275323,0.035725,0.186029,0.014727,0.013657,0.178917,0.184435,0.203981,6.459360e-01,1.471148e-01,0.518426,0.110012,-3840.658222
3,4,0.436937,0.745451,0.800892,0.753330,5.330470e-01,0.536181,0.541595,0.032831,0.183405,0.010381,0.010138,0.409370,0.126406,0.133513,4.565487e-02,5.276907e-03,0.465334,0.019495,-3840.658222
4,5,0.378378,0.698736,0.740545,0.692716,4.022246e-01,0.403611,0.386922,0.033364,0.183133,0.009600,0.008754,1.000000,0.172250,0.107579,2.051479e-01,2.122497e-02,0.537789,0.033813,-3840.658222
5,6,0.175676,0.842218,0.889649,0.847352,4.242855e-01,0.437246,0.419696,0.037098,0.180765,0.002494,0.001967,0.299854,0.337297,0.362088,3.628698e-02,2.105469e-02,0.213616,0.036920,-3840.658222
6,7,0.396396,0.538413,0.558296,0.549480,3.871290e-01,0.358865,0.393787,0.125709,0.309892,0.088512,0.084166,0.478770,0.236241,0.194502,5.343469e-01,4.775609e-02,0.547158,0.031890,-3840.658222
7,8,0.261261,0.868658,0.914778,0.869371,7.968901e-01,0.779040,0.774048,0.054044,0.184286,0.012118,0.010094,0.684920,0.243443,0.141382,7.862374e-02,1.747027e-02,0.434104,0.046695,0.651429
8,9,0.036036,0.847942,0.881225,0.846490,4.295598e-01,0.395658,0.404035,0.008278,0.181535,0.001143,0.000698,0.478770,0.487870,0.467612,7.871606e-02,1.524815e-01,0.101187,0.158553,-3840.658222
9,10,0.252252,0.493964,0.545010,0.503803,3.272659e-01,0.350823,0.335510,0.116244,0.318959,0.107351,0.110066,0.380674,0.173278,0.138878,3.348304e-02,1.237222e-02,0.367895,0.039763,-3840.658222


In [18]:
# Hyper-parameter tuning
# GridSearchCV is imported at the top of the notebook from sklearn.model_selection;
# the old sklearn.grid_search module no longer exists in scikit-learn >= 0.20.
# Every combination of the values below is evaluated (2 * 2 * 2 * 2 = 16 candidates).
tune_params = {
    'max_depth': [8, 16],
    'num_leaves': [31, 63],
    'learning_rate': [0.001, 0.002],
    'n_estimators': [300, 500],
}
# Base estimator: only the seed is fixed; the tuned parameters come from the grid
# and everything else keeps the LGBMRegressor defaults.
lgbmodel = lgb.LGBMRegressor(random_state=42)
# 5-fold cross-validated search over tune_params
gsearch = GridSearchCV(lgbmodel, param_grid=tune_params, cv=5)

In [19]:
# Run the grid search on the whole `train` frame (not the train_train / train_val split).
gsearch.fit(train[use_feature_list],train['Y'])

GridSearchCV(cv=5, error_score='raise',
       estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=42,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [8, 16], 'num_leaves': [31, 63], 'learning_rate': [0.001, 0.002], 'n_estimators': [300, 500]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [36]:
# Score the test terminals with the fitted grid search. assign() returns a new
# frame, so the column is never written into a slice of `feature`.
test = test.assign(Pred=gsearch.predict(test[use_feature_list]))

In [38]:
# Inspect the predictions for the test terminals.
test['Pred']

100    0.098346
101    0.097450
102    0.097221
103    0.138700
104    0.094777
105    0.109273
106    0.164359
107    0.138700
108    0.109273
109    0.164359
110    0.166745
111    0.136314
112    0.166267
113    0.098346
114    0.097830
115    0.166745
116    0.138700
117    0.166745
118    0.121303
119    0.111658
120    0.095961
121    0.166267
122    0.123688
123    0.166745
124    0.097869
125    0.164359
126    0.109273
127    0.111181
128    0.121303
129    0.097869
         ...   
170    0.164359
171    0.123688
172    0.164359
173    0.166267
174    0.136314
175    0.166267
176    0.166267
177    0.166745
178    0.166267
179    0.166745
180    0.121303
181    0.164359
182    0.095961
183    0.166745
184    0.136314
185    0.109273
186    0.138700
187    0.166745
188    0.094680
189    0.166267
190    0.109273
191    0.095961
192    0.138222
193    0.111658
194    0.121303
195    0.121303
196    0.138700
197    0.095961
198    0.109273
199    0.097162
Name: Pred, Length: 100,

In [20]:
# Show the estimator configured with the best parameter combination found by the search.
gsearch.best_estimator_

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.001, max_depth=8, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=300,
       n_jobs=-1, num_leaves=31, objective=None, random_state=42,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [29]:
# Full parameter dict of the best estimator: the tuned values plus every other
# constructor parameter (e.g. random_state), which the model rebuilt below reads.
bestparams = gsearch.best_estimator_.get_params()
print(bestparams)

{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'learning_rate': 0.001, 'max_depth': 8, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 300, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 42, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 1}


In [32]:
# Rebuild a regressor from the best grid-search parameters; boosting type and
# objective are fixed explicitly, the remaining values are copied from bestparams.
bestmodel = lgb.LGBMRegressor(
    boosting_type='gbdt',
    objective='regression',
    **{
        key: bestparams[key]
        for key in ('num_leaves', 'max_depth', 'n_estimators', 'learning_rate', 'random_state')
    }
)

In [34]:
# Fit the rebuilt model on the whole `train` frame.
bestmodel.fit(X=train[use_feature_list],y=train['Y'])

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.001, max_depth=8, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=300,
       n_jobs=-1, num_leaves=31, objective='regression', random_state=42,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [35]:
# Feature importances of the fitted model, labelled by feature name, highest first.
fea_imp = pd.Series(data=bestmodel.feature_importances_, index=use_feature_list)
fea_imp = fea_imp.sort_values(ascending=False)
print(fea_imp)

heg_min         289
dis             211
avg_dis         196
sp_50            57
trip_max         36
lat_50           22
sp_mean           0
sp_max            0
heg_50            0
heg_mean          0
ave_dri_time      0
heg_max           0
lat_min           0
lat_max           0
lon_50            0
lon_min           0
lon_max           0
dri_time          0
dtype: int64
