# New York City Taxi Trip Duration
- https://www.kaggle.com/c/nyc-taxi-trip-duration
- 데이터: https://www.kaggle.com/c/nyc-taxi-trip-duration/data
    - 뉴욕 택시 승하차 위경도 및 시간이 주요 데이터
    - target: trip duration (in seconds)
- 평가: Root Mean Squared Logarithmic Error.

# 목표
- 커널 따라하기: https://www.kaggle.com/gaborfodor/from-eda-to-the-top-lb-0-367
- submission score: 0.44215

# 배운 점
- clustering을 활용한 feature engineering
    - target 값을 직접 이용해 만든 feature처럼 training set에서만 얻을 수 있는 정보를, clustering을 통해 test set에도 적용.
- 데이터가 부족하면 외부 데이터를 끌어와 보강
    - https://www.kaggle.com/oscarleo/new-york-city-taxi-with-osrm
        - OSRM (Open Source Routing Machine): http://project-osrm.org/
- lightgbm
    - https://lightgbm.readthedocs.io/en/latest/Experiments.html#

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from datetime import timedelta
import datetime as dt
import matplotlib.pyplot as plt
# plt.rcParams['figure.figsize'] = [16, 10]
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans

In [2]:
# Fix the global NumPy seed so later sampling in this notebook is repeatable,
# and remember when the run started.
np.random.seed(1987)
t0 = dt.datetime.now()

# Load the three competition files from the working directory.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Parse timestamps and derive the calendar date of each pickup.
train['pickup_datetime'] = pd.to_datetime(train.pickup_datetime)
test['pickup_datetime'] = pd.to_datetime(test.pickup_datetime)
train.loc[:, 'pickup_date'] = train['pickup_datetime'].dt.date
test.loc[:, 'pickup_date'] = test['pickup_datetime'].dt.date
# Only the training file carries a dropoff timestamp in this notebook.
train['dropoff_datetime'] = pd.to_datetime(train.dropoff_datetime)

# Encode the Y/N flag as 1/0.
# NOTE: re-running this cell on already-encoded data maps every value to NaN.
train['store_and_fwd_flag'] = train.store_and_fwd_flag.map({'Y':1, 'N':0})
test['store_and_fwd_flag'] = test.store_and_fwd_flag.map({'Y':1, 'N':0})

# Sanity check: recompute the duration from the two timestamps (vectorised,
# same accessor the pickup_dt feature uses) and collect rows where it
# disagrees with the provided trip_duration by more than one second.
train['check_trip_duration'] = (train['dropoff_datetime'] - train['pickup_datetime']).dt.total_seconds()
duration_difference = train[np.abs(train['check_trip_duration'] - train['trip_duration']) > 1]

In [12]:
# Log-scale copy of the target, log(duration + 1); the +1 keeps zero-length trips finite.
trip_seconds = train['trip_duration'].values
train['log_trip_duration'] = np.log(trip_seconds + 1)

In [4]:
# Stack every (latitude, longitude) pair seen in either split - pickups and
# dropoffs alike - into one (n_points, 2) array.
coords = np.vstack((train[['pickup_latitude', 'pickup_longitude']].values,
                    train[['dropoff_latitude', 'dropoff_longitude']].values,
                    test[['pickup_latitude', 'pickup_longitude']].values,
                    test[['dropoff_latitude', 'dropoff_longitude']].values))

# Fit a 2-D PCA on all coordinates, then add the rotated coordinates as features.
pca = PCA().fit(coords)
for frame in (train, test):
    for endpoint in ('pickup', 'dropoff'):
        # Transform each coordinate pair once and reuse both output columns.
        # Plain arrays are passed because the PCA was fitted on an array.
        rotated = pca.transform(frame[['%s_latitude' % endpoint, '%s_longitude' % endpoint]].values)
        frame['%s_pca0' % endpoint] = rotated[:, 0]
        frame['%s_pca1' % endpoint] = rotated[:, 1]

In [5]:
def haversine_array(lat1, lng1, lat2, lng2):
    """Haversine (great-circle) distance in kilometres between two points.

    All four arguments are angles in degrees. Everything is computed with
    NumPy ufuncs, so scalars and equally shaped arrays/Series both work and
    the result has the broadcast shape of the inputs.
    """
    earth_radius_km = 6371  # mean Earth radius
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
    half_dphi = (phi2 - phi1) * 0.5
    half_dlam = (lam2 - lam1) * 0.5
    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    """Approximate "Manhattan" distance in kilometres between two points.

    Sums two haversine legs that both start at (lat1, lng1): one changing
    only the longitude, one changing only the latitude. Inputs are degrees.
    """
    longitude_leg = haversine_array(lat1, lng1, lat1, lng2)
    latitude_leg = haversine_array(lat1, lng1, lat2, lng1)
    return longitude_leg + latitude_leg

def bearing_array(lat1, lng1, lat2, lng2):
    """Initial bearing from point 1 to point 2, in degrees.

    Inputs are angles in degrees (scalars or equally shaped arrays/Series).
    The result comes from ``arctan2`` and therefore lies in [-180, 180]:
    0 is due north, 90 due east, -90 due west.
    """
    # Only the longitude difference and the two latitudes enter the formula,
    # so the individual longitudes are not converted.
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

# Trip-geometry features, built the same way for the train and test frames.
for frame in (train, test):
    pickup_lat, pickup_lng = frame['pickup_latitude'], frame['pickup_longitude']
    dropoff_lat, dropoff_lng = frame['dropoff_latitude'], frame['dropoff_longitude']

    # Distances and heading between pickup and dropoff.
    frame.loc[:, 'distance_haversine'] = haversine_array(pickup_lat, pickup_lng, dropoff_lat, dropoff_lng)
    frame.loc[:, 'distance_dummy_manhattan'] = dummy_manhattan_distance(pickup_lat, pickup_lng, dropoff_lat, dropoff_lng)
    frame.loc[:, 'direction'] = bearing_array(pickup_lat, pickup_lng, dropoff_lat, dropoff_lng)

    # L1 distance measured in the PCA-rotated coordinate system.
    pca1_gap = np.abs(frame['dropoff_pca1'] - frame['pickup_pca1'])
    pca0_gap = np.abs(frame['dropoff_pca0'] - frame['pickup_pca0'])
    frame.loc[:, 'pca_manhattan'] = pca1_gap + pca0_gap

    # Midpoint of the trip in raw latitude / longitude.
    frame.loc[:, 'center_latitude'] = (pickup_lat + dropoff_lat) / 2
    frame.loc[:, 'center_longitude'] = (pickup_lng + dropoff_lng) / 2

In [6]:
# Calendar and clock features derived from the pickup timestamp.
# Both splits measure elapsed time from the earliest *training* pickup so
# that pickup_dt is on the same scale in train and test.
reference_time = train['pickup_datetime'].min()
for frame in (train, test):
    pickup_parts = frame['pickup_datetime'].dt
    frame.loc[:, 'pickup_weekday'] = pickup_parts.weekday  # 0 (Mon) .. 6 (Sun)
    # NOTE(review): `.dt.weekofyear` is deprecated/removed in recent pandas in
    # favour of `.dt.isocalendar().week` - confirm the pandas version in use.
    frame.loc[:, 'pickup_hour_weekofyear'] = pickup_parts.weekofyear
    frame.loc[:, 'pickup_hour'] = pickup_parts.hour
    frame.loc[:, 'pickup_minute'] = pickup_parts.minute
    frame.loc[:, 'pickup_dt'] = (frame['pickup_datetime'] - reference_time).dt.total_seconds()
    # Hour-of-week index: weekday * 24 + hour, i.e. 0 .. 167.
    frame.loc[:, 'pickup_week_hour'] = frame['pickup_weekday'] * 24 + frame['pickup_hour']

In [7]:
# Average speed in m/s (distance in km * 1000 / duration in seconds), for each
# distance definition. Train only, because it needs the target trip_duration.
for suffix, distance_col in (('h', 'distance_haversine'), ('m', 'distance_dummy_manhattan')):
    train.loc[:, 'avg_speed_' + suffix] = 1000 * train[distance_col] / train['trip_duration']

In [8]:
# Fine pickup grid (3 decimal places) used only for the regional speed summary.
train.loc[:, 'pickup_lat_bin'] = np.round(train['pickup_latitude'], 3)
train.loc[:, 'pickup_long_bin'] = np.round(train['pickup_longitude'], 3)
# Average speed for regions
gby_cols = ['pickup_lat_bin', 'pickup_long_bin']
# Select the needed column *before* aggregating: this avoids averaging every
# column of `train` and does not depend on how pandas treats the non-numeric
# columns (id, timestamps, dates) in a frame-wide mean().
coord_speed = train.groupby(gby_cols)[['avg_speed_h']].mean().reset_index()
coord_count = train.groupby(gby_cols)[['id']].count().reset_index()
coord_stats = pd.merge(coord_speed, coord_count, on=gby_cols)
# Keep only grid cells with more than 100 trips.
coord_stats = coord_stats[coord_stats['id'] > 100]

# Coarser bins (2 decimal places) and 3-hour time buckets used as group keys
# by later cells; these overwrite the fine pickup bins created above.
train.loc[:, 'pickup_lat_bin'] = np.round(train['pickup_latitude'], 2)
train.loc[:, 'pickup_long_bin'] = np.round(train['pickup_longitude'], 2)
train.loc[:, 'center_lat_bin'] = np.round(train['center_latitude'], 2)
train.loc[:, 'center_long_bin'] = np.round(train['center_longitude'], 2)
train.loc[:, 'pickup_dt_bin'] = (train['pickup_dt'] // (3 * 3600))
test.loc[:, 'pickup_lat_bin'] = np.round(test['pickup_latitude'], 2)
test.loc[:, 'pickup_long_bin'] = np.round(test['pickup_longitude'], 2)
test.loc[:, 'center_lat_bin'] = np.round(test['center_latitude'], 2)
test.loc[:, 'center_long_bin'] = np.round(test['center_longitude'], 2)
test.loc[:, 'pickup_dt_bin'] = (test['pickup_dt'] // (3 * 3600))

In [9]:
# Fit 100 location clusters on a random subset of (at most) 500,000 points
# taken from `coords`.
shuffled_rows = np.random.permutation(len(coords))
sample_ind = shuffled_rows[:500000]
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000)
kmeans.fit(coords[sample_ind])

`coords`는 train data와 test data의 pickup 좌표와 dropoff 좌표를 모두 합친(합집합) 배열이다.

In [10]:
# Label each trip endpoint with its k-means location cluster, in both splits.
for frame in (train, test):
    for endpoint in ('pickup', 'dropoff'):
        endpoint_coords = frame[['%s_latitude' % endpoint, '%s_longitude' % endpoint]]
        frame.loc[:, '%s_cluster' % endpoint] = kmeans.predict(endpoint_coords)

In [13]:
# Per-group averages of speed and log-duration, computed on train and joined
# onto both splits by the group key.
# NOTE(review): these statistics are derived from the target on *all* of
# `train`, including rows that the later train_test_split holds out, so the
# validation score is likely optimistic.
for gby_col in ['pickup_hour', 'pickup_date', 'pickup_dt_bin',
               'pickup_week_hour', 'pickup_cluster', 'dropoff_cluster']:
    # Select the three value columns before aggregating so non-numeric
    # columns (id, timestamps, dates) are never averaged.
    gby = train.groupby(gby_col)[['avg_speed_h', 'avg_speed_m', 'log_trip_duration']].mean()
    gby.columns = ['%s_gby_%s' % (col, gby_col) for col in gby.columns]
    train = pd.merge(train, gby, how='left', left_on=gby_col, right_index=True)
    test = pd.merge(test, gby, how='left', left_on=gby_col, right_index=True)

# multi-level groups
for gby_cols in [['center_lat_bin', 'center_long_bin'],
                 ['pickup_hour', 'center_lat_bin', 'center_long_bin'],
                 ['pickup_hour', 'pickup_cluster'], 
                 ['pickup_hour', 'dropoff_cluster'],
                 ['pickup_cluster', 'dropoff_cluster']]:
    coord_speed = train.groupby(gby_cols)[['avg_speed_h']].mean().reset_index()
    coord_count = train.groupby(gby_cols)[['id']].count().reset_index()
    coord_stats = pd.merge(coord_speed, coord_count, on=gby_cols)
    # Groups with 100 or fewer trips are dropped, so their rows get NaN
    # after the left joins below.
    coord_stats = coord_stats[coord_stats['id'] > 100]
    coord_stats.columns = gby_cols + ['avg_speed_h_%s' % '_'.join(gby_cols), 'cnt_%s' %  '_'.join(gby_cols)]
    train = pd.merge(train, coord_stats, how='left', on=gby_cols)
    test = pd.merge(test, coord_stats, how='left', on=gby_cols)

In [14]:
group_freq = '60min'
# Only four columns are needed, so select them before concatenating the splits.
count_cols = ['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']
df_all = pd.concat((train[count_cols], test[count_cols]))
# Pickup time rounded to the hour; join key for the cluster counts below.
train.loc[:, 'pickup_datetime_group'] = train['pickup_datetime'].dt.round(group_freq)
test.loc[:, 'pickup_datetime_group'] = test['pickup_datetime'].dt.round(group_freq)

# Count trips over 60min
df_counts = df_all.set_index('pickup_datetime')[['id']].sort_index()
# Rolling count over the time-sorted index: number of trips (train + test)
# in the 60-minute window ending at each pickup.
df_counts['count_60min'] = df_counts.isnull().rolling(group_freq).count()['id']
train = train.merge(df_counts, on='id', how='left')
test = test.merge(df_counts, on='id', how='left')

# Count how many trips are going to each cluster over time
# pd.Grouper(freq=...) replaces pd.TimeGrouper, which no longer exists in
# current pandas. errors='ignore' on drop() keeps the chain working whether or
# not groupby-rolling echoes the group key back as a value column.
dropoff_counts = df_all \
    .set_index('pickup_datetime') \
    .groupby([pd.Grouper(freq=group_freq), 'dropoff_cluster']) \
    .agg({'id': 'count'}) \
    .reset_index().set_index('pickup_datetime') \
    .groupby('dropoff_cluster').rolling('240min').mean() \
    .drop('dropoff_cluster', axis=1, errors='ignore') \
    .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
    .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': 'dropoff_cluster_count'})

# The left merge keeps row order, so assign positionally (.values) instead of
# relying on train/test happening to carry a default 0..n-1 index.
train['dropoff_cluster_count'] = train[['pickup_datetime_group', 'dropoff_cluster']].merge(dropoff_counts, on=['pickup_datetime_group', 'dropoff_cluster'], how='left')['dropoff_cluster_count'].fillna(0).values
test['dropoff_cluster_count'] = test[['pickup_datetime_group', 'dropoff_cluster']].merge(dropoff_counts, on=['pickup_datetime_group', 'dropoff_cluster'], how='left')['dropoff_cluster_count'].fillna(0).values

In [15]:
# Count how many trips are going from each cluster over time
# Only four columns are needed, so select them before concatenating the splits.
count_cols = ['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']
df_all = pd.concat((train[count_cols], test[count_cols]))
# Hourly trip counts per pickup cluster, averaged over a 240-minute rolling
# window and then shifted back by 120 minutes.
# pd.Grouper(freq=...) replaces pd.TimeGrouper, which no longer exists in
# current pandas. errors='ignore' on drop() keeps the chain working whether or
# not groupby-rolling echoes the group key back as a value column.
pickup_counts = df_all \
    .set_index('pickup_datetime') \
    .groupby([pd.Grouper(freq=group_freq), 'pickup_cluster']) \
    .agg({'id': 'count'}) \
    .reset_index().set_index('pickup_datetime') \
    .groupby('pickup_cluster').rolling('240min').mean() \
    .drop('pickup_cluster', axis=1, errors='ignore') \
    .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
    .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': 'pickup_cluster_count'})

# The left merge keeps row order, so assign positionally (.values) instead of
# relying on train/test happening to carry a default 0..n-1 index.
train['pickup_cluster_count'] = train[['pickup_datetime_group', 'pickup_cluster']].merge(pickup_counts, on=['pickup_datetime_group', 'pickup_cluster'], how='left')['pickup_cluster_count'].fillna(0).values
test['pickup_cluster_count'] = test[['pickup_datetime_group', 'pickup_cluster']].merge(pickup_counts, on=['pickup_datetime_group', 'pickup_cluster'], how='left')['pickup_cluster_count'].fillna(0).values

In [16]:
# External fastest-route data (see the OSRM dataset linked at the top of the
# notebook): keep the trip id plus three route summary columns.
route_cols = ['id', 'total_distance', 'total_travel_time', 'number_of_steps']
fr1 = pd.read_csv('fastest_routes_train_part_1.csv', usecols=route_cols)
fr2 = pd.read_csv('fastest_routes_train_part_2.csv', usecols=route_cols)
test_street_info = pd.read_csv('fastest_routes_test.csv', usecols=route_cols)

# The training routes ship in two files; stack them, then attach by trip id.
train_street_info = pd.concat((fr1, fr2))
train = train.merge(train_street_info, how='left', on='id')
test = test.merge(test_street_info, how='left', on='id')

In [18]:
# Columns that must not reach the model: identifiers, raw timestamps, the
# target and quantities computed from it (check_trip_duration, avg_speed_*),
# and helper keys that were only used for grouping.
do_not_use_for_training = ['id', 'log_trip_duration', 'pickup_datetime', 'dropoff_datetime',
                           'trip_duration', 'check_trip_duration',
                           'pickup_date', 'avg_speed_h', 'avg_speed_m',
                           'pickup_lat_bin', 'pickup_long_bin',
                           'center_lat_bin', 'center_long_bin',
                           'pickup_dt_bin', 'pickup_datetime_group']
# Every remaining train column is a feature, in frame order.
feature_names = [f for f in train.columns if f not in do_not_use_for_training]
# Training target: log(1 + trip_duration).
y = np.log1p(train['trip_duration'])

In [19]:
# Display the list of columns that will be fed to the model.
feature_names

['vendor_id',
 'passenger_count',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'store_and_fwd_flag',
 'pickup_pca0',
 'pickup_pca1',
 'dropoff_pca0',
 'dropoff_pca1',
 'distance_haversine',
 'distance_dummy_manhattan',
 'direction',
 'pca_manhattan',
 'center_latitude',
 'center_longitude',
 'pickup_weekday',
 'pickup_hour_weekofyear',
 'pickup_hour',
 'pickup_minute',
 'pickup_dt',
 'pickup_week_hour',
 'pickup_cluster',
 'dropoff_cluster',
 'avg_speed_h_gby_pickup_hour',
 'avg_speed_m_gby_pickup_hour',
 'log_trip_duration_gby_pickup_hour',
 'avg_speed_h_gby_pickup_date',
 'avg_speed_m_gby_pickup_date',
 'log_trip_duration_gby_pickup_date',
 'avg_speed_h_gby_pickup_dt_bin',
 'avg_speed_m_gby_pickup_dt_bin',
 'log_trip_duration_gby_pickup_dt_bin',
 'avg_speed_h_gby_pickup_week_hour',
 'avg_speed_m_gby_pickup_week_hour',
 'log_trip_duration_gby_pickup_week_hour',
 'avg_speed_h_gby_pickup_cluster',
 'avg_speed_m_gby_pickup_cluster',
 'log_trip_dura

In [20]:
# Per-feature comparison of train vs. test: mean, standard deviation and the
# share of missing values, one row per feature in `feature_names` order.
feature_stats = pd.DataFrame({'feature': feature_names})
feature_stats.loc[:, 'train_mean'] = np.nanmean(train[feature_names], axis=0).round(4)
feature_stats.loc[:, 'test_mean'] = np.nanmean(test[feature_names], axis=0).round(4)
feature_stats.loc[:, 'train_std'] = np.nanstd(train[feature_names], axis=0).round(4)
feature_stats.loc[:, 'test_std'] = np.nanstd(test[feature_names], axis=0).round(4)
# Missing-value share. `.values` is essential: the per-column mean is a Series
# indexed by feature *name*, and assigning it directly would align against
# feature_stats' integer index and fill the column with NaN.
feature_stats.loc[:, 'train_nan'] = train[feature_names].isnull().mean().values.round(3)
feature_stats.loc[:, 'test_nan'] = test[feature_names].isnull().mean().values.round(3)
# Absolute difference of means, scaled by the average of the two standard deviations.
feature_stats.loc[:, 'train_test_mean_diff'] = np.abs(feature_stats['train_mean'] - feature_stats['test_mean']) / np.abs(feature_stats['train_std'] + feature_stats['test_std'])  * 2
feature_stats.loc[:, 'train_test_nan_diff'] = np.abs(feature_stats['train_nan'] - feature_stats['test_nan'])

`train_test_mean_diff` $:=$
$$
\frac {|\mu_{train} - \mu_{test}|} {{|\sigma_{train} + \sigma_{test}|} / {2}}
$$

In [21]:
# Hold out 20% of the training rows for validation (fixed seed).
X_all = train[feature_names]
Xtr, Xv, ytr, yv = train_test_split(X_all, y, test_size=0.2, random_state=1987)

# LightGBM Dataset wrappers for the three splits.
# NOTE(review): the visible cells train through the scikit-learn style
# LGBMRegressor and never read these objects - confirm they are still needed.
dtrain, dvalid = lgb.Dataset(Xtr, label=ytr), lgb.Dataset(Xv, label=yv)
dtest = lgb.Dataset(test[feature_names])

In [22]:
# Gradient-boosted tree regressor; hyper-parameters collected in one dict.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` is > 0 - confirm whether row bagging was intended here.
lgb_params = dict(
    boosting_type='gbdt',
    learning_rate=0.3,
    max_depth=10,
    min_child_weight=50,
    colsample_bytree=0.3,
    subsample=0.8,
    reg_lambda=1,
)
model = lgb.LGBMRegressor(**lgb_params)

In [26]:
# Train on the training split, scoring RMSE on the hold-out split each round
# without printing per-iteration logs. The target is log1p(duration), so this
# RMSE corresponds to the competition's RMSLE.
model = model.fit(
    Xtr, ytr,
    eval_set=[(Xv, yv)],
    eval_metric='rmse',
    verbose=False,
)

In [33]:
# (importance, feature name) pairs, most important first.
importance_pairs = zip(model.feature_importances_, feature_names)
sorted(importance_pairs, reverse=True)

[(48, 'direction'),
 (32, 'distance_haversine'),
 (30, 'distance_dummy_manhattan'),
 (26, 'avg_speed_h_pickup_hour_dropoff_cluster'),
 (13, 'avg_speed_m_gby_pickup_dt_bin'),
 (12, 'avg_speed_h_gby_pickup_week_hour'),
 (12, 'avg_speed_h_center_lat_bin_center_long_bin'),
 (11, 'log_trip_duration_gby_pickup_cluster'),
 (10, 'pickup_pca1'),
 (10, 'log_trip_duration_gby_dropoff_cluster'),
 (9, 'log_trip_duration_gby_pickup_dt_bin'),
 (7, 'pickup_longitude'),
 (7, 'pca_manhattan'),
 (6, 'dropoff_pca0'),
 (6, 'avg_speed_h_pickup_hour_center_lat_bin_center_long_bin'),
 (5, 'pickup_pca0'),
 (5, 'dropoff_pca1'),
 (5, 'cnt_pickup_hour_center_lat_bin_center_long_bin'),
 (5, 'avg_speed_h_gby_pickup_dt_bin'),
 (4, 'dropoff_latitude'),
 (4, 'count_60min'),
 (4, 'cnt_center_lat_bin_center_long_bin'),
 (3, 'center_longitude'),
 (3, 'avg_speed_m_gby_pickup_week_hour'),
 (3, 'avg_speed_m_gby_pickup_date'),
 (3, 'avg_speed_m_gby_dropoff_cluster'),
 (3, 'avg_speed_h_gby_dropoff_cluster'),
 (2, 'pickup_week

In [27]:
# Predict on the log1p scale, invert with expm1 to get seconds, and write the
# gzip-compressed submission file.
ytest = model.predict(test[feature_names])
test['trip_duration'] = np.expm1(ytest)
submission = test[['id', 'trip_duration']]
submission.to_csv('submission.csv.gz', index=False, compression='gzip')