- competition/dataset : [https://www.kaggle.com/c/nyc-taxi-trip-duration](https://www.kaggle.com/c/nyc-taxi-trip-duration)
- date : 2021/02/26
- original : [https://www.kaggle.com/danijelk/beat-the-benchmark](https://www.kaggle.com/danijelk/beat-the-benchmark)

## Beat the benchmark!

**✏ 필사 1회** 

In [30]:
# libraries
import numpy as np
import pandas as pd
import datetime as dt
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [31]:
# read data
# Load the training and test tables from the local data/ directory.
print('Read data...')
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

Read data...


In [32]:
# transform character to numeric
# The encoder is fitted on the training column only and then applied to both
# frames, so train and test share the same label-to-integer mapping.
le = LabelEncoder().fit(train['store_and_fwd_flag'])
for frame in (train, test):
    frame['store_and_fwd_flag'] = le.transform(frame['store_and_fwd_flag'])
    # parse the pickup timestamp so the .dt accessor can be used on it later
    frame['pickup_datetime'] = pd.to_datetime(frame['pickup_datetime'])

In [33]:
# new feature
print('Create features...')
# date features: expand the parsed pickup timestamp into calendar/clock parts,
# adding the same five columns (in the same order) to both frames
for frame in (train, test):
    pickup_parts = frame['pickup_datetime'].dt
    for part in ('month', 'day', 'weekday', 'hour', 'minute'):
        frame[part] = getattr(pickup_parts, part)

Create features...


In [34]:
# distance features
for frame in (train, test):
    # signed pickup-minus-dropoff offset along each coordinate axis
    frame['dist_long'] = frame['pickup_longitude'] - frame['dropoff_longitude']
    frame['dist_lat'] = frame['pickup_latitude'] - frame['dropoff_latitude']
    # straight-line (Euclidean) length of the offset vector, in the
    # coordinates' own units
    squared_offsets = np.square(frame['dist_long']) + np.square(frame['dist_lat'])
    frame['dist'] = np.sqrt(squared_offsets)

In [35]:
# spatial features (count and speed)
# Round every coordinate to 2 decimals to bucket locations into coarse grid
# cells. The resulting *_bin columns are the group keys for the count and
# average-speed features.
coordinate_cols = ['pickup_longitude', 'pickup_latitude',
                   'dropoff_longitude', 'dropoff_latitude']
for frame in (train, test):
    for coord in coordinate_cols:
        # Each bin must be derived from its own coordinate: pickup_latitude_bin
        # comes from pickup_latitude (not pickup_longitude), otherwise the
        # pickup cell degenerates to longitude only.
        frame[coord + '_bin'] = np.round(frame[coord], 2)

In [36]:
# count features
p_cols = ['pickup_longitude_bin', 'pickup_latitude_bin']
d_cols = ['dropoff_longitude_bin', 'dropoff_latitude_bin']

# Count trips per grid cell over train and test together. The combined frame
# is built once, from the bin columns only, and reused for both groupings.
all_bins = pd.concat([train[p_cols + d_cols], test[p_cols + d_cols]])

# Name the count columns explicitly: a bare size().reset_index() labels the
# column with the integer 0, which the two merges below would turn into the
# opaque feature names 0_x / 0_y.
pickup_counts = all_bins.groupby(p_cols).size().reset_index(name='pickup_count')
dropoff_counts = all_bins.groupby(d_cols).size().reset_index(name='dropoff_count')

# Left merges keep every trip row and its original order.
train = pd.merge(train, pickup_counts, on=p_cols, how='left')
test = pd.merge(test, pickup_counts, on=p_cols, how='left')

train = pd.merge(train, dropoff_counts, on=d_cols, how='left')
test = pd.merge(test, dropoff_counts, on=d_cols, how='left')

In [37]:
# speed features
# Per-trip speed proxy: scaled straight-line distance divided by duration.
# It is computed on train only, because it needs trip_duration.
train['speed'] = 100000 * train['dist'] / train['trip_duration']

# Mean speed per pickup cell and per dropoff cell. The columns get distinct,
# explicit names; if both were called ave_speed the second merge would
# silently rename them to ave_speed_x / ave_speed_y.
# NOTE(review): each training row's own duration contributes to its cell mean,
# so these features carry target information on train — confirm this is acceptable.
pickup_speed = train.groupby(p_cols)['speed'].mean().reset_index(name='pickup_ave_speed')
dropoff_speed = train.groupby(d_cols)['speed'].mean().reset_index(name='dropoff_ave_speed')

# Cells that occur only in test have no training speed and stay NaN after the
# left merge.
train = pd.merge(train, pickup_speed, on=p_cols, how='left')
test = pd.merge(test, pickup_speed, on=p_cols, how='left')

train = pd.merge(train, dropoff_speed, on=d_cols, how='left')
test = pd.merge(test, dropoff_speed, on=d_cols, how='left')

In [38]:
# drop bins
# The bin keys were only needed for the merges; 'speed' exists on train only
# and is derived from the target, so it is removed as well.
bin_columns = ['pickup_longitude_bin', 'pickup_latitude_bin', 'dropoff_longitude_bin', 'dropoff_latitude_bin']
train = train.drop(columns=['speed'] + bin_columns)
test = test.drop(columns=bin_columns)

In [39]:
# weather data
# Read the weather observations and preview the first three rows.
weather_path = 'data/KNYC_Metars.csv'
weather = pd.read_csv(weather_path)
weather.head(n=3)

Unnamed: 0,Time,Temp.,Windchill,Heat Index,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Events,Conditions
0,2015-12-31 02:00:00,7.8,7.1,,0.89,1017.0,6.1,8.0,NNE,5.6,0.0,0.8,,Overcast
1,2015-12-31 03:00:00,7.2,5.9,,0.9,1016.5,5.6,12.9,Variable,7.4,0.0,0.3,,Overcast
2,2015-12-31 04:00:00,7.2,,,0.9,1016.7,5.6,12.9,Calm,0.0,0.0,0.0,,Overcast


In [40]:
# create features
# Split the observation timestamp into the parts used as merge keys.
weather['Time'] = pd.to_datetime(weather['Time'])
weather['year'] = weather['Time'].dt.year
weather['month'] = weather['Time'].dt.month
weather['day'] = weather['Time'].dt.day
weather['hour'] = weather['Time'].dt.hour
# Keep 2016 only: the merge keys carry no year, so rows from other years
# (e.g. the 2015-12-31 rows at the top of the file) could otherwise match.
weather = weather[weather['year'] == 2016]

# Keep one temperature reading per (month, day, hour). A left merge repeats a
# trip row for every matching weather row, so any duplicate hour would inflate
# train and produce duplicate ids in the submission.
merge_keys = ['month', 'day', 'hour']
hourly_temp = weather[['Temp.'] + merge_keys].drop_duplicates(subset=merge_keys)

train = pd.merge(train, hourly_temp, on=merge_keys, how='left')
test = pd.merge(test, hourly_temp, on=merge_keys, how='left')

In [41]:
# train/test features, y, id
xtrain = train.drop(['id', 'pickup_datetime', 'dropoff_datetime', 'trip_duration'], axis=1)
xtest = test.drop(['id', 'pickup_datetime'], axis=1)
ytrain = train['trip_duration'].values
id_train = train['id'].values
id_test = test['id'].values
del(train, test)

In [42]:
# xgb parameters
params = {
    'booster': 'gbtree',
    'objective':'reg:linear',
    'learning_rate':0.1,
    'max_depth':14,
    'subsample':0.8,
    'colsample_bytree':0.7,
    'colsample_bylevel':0.7,
    'silent':1
}

# number of rounds
nrounds = 200

In [43]:
# train model
print('Train model...')
# The booster is fitted on log(duration + 1) rather than on the raw duration.
log_target = np.log(ytrain + 1)
dtrain = xgb.DMatrix(xtrain, label=log_target)
gbm = xgb.train(params, dtrain, num_boost_round=nrounds)

Train model...
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [44]:
# test predictions
pred_test = np.exp(gbm.predict(xgb.DMatrix(xtest))) - 1

In [45]:
# create submission
# Two-column table (id, trip_duration); id becomes the index and is written
# out as the first CSV column.
submission_path = 'data/submission_4_sub_bench.csv'
df = pd.DataFrame({'id': id_test, 'trip_duration': pred_test}).set_index('id')
df.to_csv(submission_path, index=True)