# 範例 : 計程車費率預測
https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

# [作業目標]
- 使用並觀察特徵組合, 在計程車費率預測競賽的影響

# [作業重點]
- 仿造範例並參考今日課程內容, 使用經緯度一圈的長度比的概念造出新特徵, 觀察有什麼影響 (In[6], Out[6])
- 只使用上面所造的這個新特徵, 觀察有什麼影響 (In[8], Out[8])

In [1]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

# Read the raw taxi dataset from the local data directory.
data_path = './data/'
csv_path = data_path + 'taxi_data1.csv'
raw = pd.read_csv(csv_path)

# Keep the fare as the regression target; all remaining columns stay in df
# as candidate features.
train_Y = raw['fare_amount']
df = raw.drop(columns=['fare_amount'])
df

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.990580,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.715110,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.958030,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.982490,40.761887,3
...,...,...,...,...,...,...
4995,2011-12-23 19:07:00 UTC,-73.994150,40.761397,-73.986840,40.759202,1
4996,2009-08-26 18:19:41 UTC,-73.986767,40.729937,-73.977022,40.726482,1
4997,2010-10-25 06:05:21 UTC,-73.984612,40.732297,-73.995004,40.750059,1
4998,2010-02-16 16:57:00 UTC,-73.958460,40.773415,-73.953445,40.782598,1


In [2]:
# Decompose the pickup timestamp into numeric calendar/clock features.
# The column is parsed once with a vectorised pd.to_datetime call; the
# trailing " UTC" is matched as literal text by the format string, so a row
# that does not follow the pattern still raises instead of being skipped.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')

# Pull each component straight from the .dt accessor rather than formatting
# every row to a string and parsing it back to an integer.
pickup_parts = df['pickup_datetime'].dt
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    # The explicit cast keeps the int64 dtype regardless of what width the
    # accessor returns.
    df[f'pickup_{part}'] = getattr(pickup_parts, part).astype('int64')
df

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.990580,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.715110,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.958030,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.982490,40.761887,3,2014,6,12,3,25,56
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2011-12-23 19:07:00,-73.994150,40.761397,-73.986840,40.759202,1,2011,12,23,19,7,0
4996,2009-08-26 18:19:41,-73.986767,40.729937,-73.977022,40.726482,1,2009,8,26,18,19,41
4997,2010-10-25 06:05:21,-73.984612,40.732297,-73.995004,40.750059,1,2010,10,25,6,5,21
4998,2010-02-16 16:57:00,-73.958460,40.773415,-73.953445,40.782598,1,2010,2,16,16,57,0


In [3]:
# Baseline: drop the raw timestamp column, rescale the remaining features
# with MinMaxScaler, and report the mean 5-fold cross-validation score of a
# linear regression and a gradient boosting regressor.
df = df.drop(columns=['pickup_datetime'])
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df)

Linear = LinearRegression()
linear_score = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {linear_score}')

GDBT = GradientBoostingRegressor()
gdbt_score = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gdbt_score}')

Linear Reg Score : 0.02687687147563982
Gradient Boosting Reg Score : 0.7108405894451983


In [4]:
# Add three features: the longitude difference, the latitude difference, and
# the Euclidean distance between pickup and dropoff in raw coordinate units.
df['longitude_diff'] = df['dropoff_longitude'].sub(df['pickup_longitude'])
df['latitude_diff'] = df['dropoff_latitude'].sub(df['pickup_latitude'])
squared_sum = df['longitude_diff'].pow(2) + df['latitude_diff'].pow(2)
df['distance_2D'] = squared_sum ** 0.5

# Show the new columns next to the coordinates they were derived from.
preview_columns = [
    'distance_2D',
    'longitude_diff',
    'latitude_diff',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]
df[preview_columns]

Unnamed: 0,distance_2D,longitude_diff,latitude_diff,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,0.009761,0.009452,-0.002437,-73.990580,40.761071,-73.981128,40.758634
1,0.018307,-0.001244,0.018265,-73.988403,40.723431,-73.989647,40.741695
2,0.008140,0.003756,-0.007222,-74.015785,40.715110,-74.012029,40.707888
3,0.021056,0.019292,-0.008437,-73.977322,40.787275,-73.958030,40.778838
4,0.032964,0.007193,0.032170,-73.989683,40.729717,-73.982490,40.761887
...,...,...,...,...,...,...,...
4995,0.007632,0.007310,-0.002195,-73.994150,40.761397,-73.986840,40.759202
4996,0.010339,0.009745,-0.003455,-73.986767,40.729937,-73.977022,40.726482
4997,0.020579,-0.010392,0.017762,-73.984612,40.732297,-73.995004,40.750059
4998,0.010463,0.005015,0.009183,-73.958460,40.773415,-73.953445,40.782598


In [5]:
# Result: accuracy goes up.
# Re-fit the scaler on the widened feature table, then score both models again.
train_X = scaler.fit_transform(df)

linear_score = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {linear_score}')

gdbt_score = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gdbt_score}')

Linear Reg Score : 0.027280664460997483
Gradient Boosting Reg Score : 0.8048166231433429


# 作業1
* 參考今日教材，試著使用經緯度一圈的長度比這一概念，組合出一個新特徵，再觀察原特徵加上新特徵是否提升了正確率?

In [6]:
import math

# Exercise 1: store the new feature in df['distance_real'].
# Fixed-ratio variant: the longitude difference is multiplied by the constant
# 0.75756 before the Euclidean norm is taken.
# NOTE(review): 0.75756 is presumably cos(latitude) for the pickup area -- confirm.
# The per-row alternative would multiply by cos(pickup_latitude) of each trip
# instead of a single constant.
scaled_longitude_diff = df['longitude_diff'] * 0.75756
df['distance_real'] = (scaled_longitude_diff**2 + df['latitude_diff']**2)**0.5

# Observe the result with all original features plus the new one.
train_X = scaler.fit_transform(df)

linear_score = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {linear_score}')

gdbt_score = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gdbt_score}')

Linear Reg Score : 0.025197426578447722
Gradient Boosting Reg Score : 0.8039895195622669


In [7]:
# Exercise 1 (per-row variant): store the new feature in df['distance_real'].
# A degree of longitude is shorter than a degree of latitude by a factor of
# cos(latitude), so the longitude difference is scaled by the cosine of each
# trip's own pickup latitude instead of the fixed 0.75756 approximation.
# The cosine is computed with vectorised NumPy calls rather than a per-row
# Python lambda.
longitude_scale = np.cos(np.radians(df['pickup_latitude']))
df['distance_real'] = ((df['longitude_diff'] * longitude_scale)**2 + df['latitude_diff']**2)**0.5

# Observe the result with all original features plus the new one.
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

Linear Reg Score : 0.23127647815433522
Gradient Boosting Reg Score : 0.8041912396445692


沒有太大變化

# 作業2
* 試著只使用新特徵估計目標值(忽略原特徵)，跟作業1的結果比較起來效果如何?

In [8]:
# Exercise 2: use only the distance_real column as the model input.
distance_only = df[['distance_real']]
train_X = scaler.fit_transform(distance_only)

linear_score = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {linear_score}')

gdbt_score = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gdbt_score}')

Linear Reg Score : 0.001286832162626661
Gradient Boosting Reg Score : 0.719693274136993


留下的資訊太少，無法像作業一那麼好，但是分數跟直接用最原始的表格下去做差不多，表示「真實距離」這個項目可能是最具代表性的重要資料。

In [9]:
##### Extra play

In [11]:
# Score each column on its own: fit both models on that single scaled feature
# and print their mean 5-fold cross-validation scores.
for c in df.columns:
    single_feature = df[[c]]
    train_X = scaler.fit_transform(single_feature)
    print(f'Pick up column {c}')

    linear_score = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
    print(f'Linear Reg Score : {linear_score}')

    gdbt_score = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()
    print(f'Gradient Boosting Reg Score : {gdbt_score}')

    # Blank line between columns.
    print()

Pick up column pickup_longitude
Linear Reg Score : -0.0014655675919949206
Gradient Boosting Reg Score : 0.20337859767587524

Pick up column pickup_latitude
Linear Reg Score : -0.001490825700251963
Gradient Boosting Reg Score : 0.15574475098496712

Pick up column dropoff_longitude
Linear Reg Score : -0.0010590556970415444
Gradient Boosting Reg Score : 0.25025545967061047

Pick up column dropoff_latitude
Linear Reg Score : -0.0011144924785762012
Gradient Boosting Reg Score : 0.15217362202127652

Pick up column passenger_count
Linear Reg Score : -0.0015958427192376279
Gradient Boosting Reg Score : -0.0031578484116243644

Pick up column pickup_year
Linear Reg Score : 0.013397635456442635
Gradient Boosting Reg Score : 0.01322849035444651

Pick up column pickup_month
Linear Reg Score : 0.0006131801665032021
Gradient Boosting Reg Score : -0.00018596710430980324

Pick up column pickup_day
Linear Reg Score : -0.0023842968112413932
Gradient Boosting Reg Score : -0.011226970633229616

Pick up col