In [1]:
import datetime
import math

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

# Load the raw taxi records, then separate the regression target
# (fare_amount) from the remaining feature columns.
# NOTE(review): data_path is a machine-specific absolute path; adjust it
# before running this notebook on another machine.
data_path = 'C:/Users/User/Documents/'
csv_file = data_path + 'taxi_data1.csv'
df = pd.read_csv(csv_file)

train_y = df['fare_amount']            # target kept aside for cross-validation
df = df.drop(columns=['fare_amount'])  # everything else is a candidate feature
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [2]:
# Decompose the pickup timestamp into numeric calendar/clock features.
# pd.to_datetime parses the whole column in one vectorised call; the literal
# " UTC" suffix is part of the format, so the parsed values stay
# timezone-naive. The .dt accessor then replaces the per-row
# strptime -> strftime -> int round trip.
# Unlike strptime, to_datetime also accepts an already-parsed datetime column,
# so re-running this cell no longer raises.
pickup_ts = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')
df['pickup_datetime'] = pickup_ts
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    # Cast explicitly: the integer dtype returned by .dt fields differs
    # between pandas versions, and the original columns were int64.
    df[f'pickup_{part}'] = getattr(pickup_ts.dt, part).astype('int64')
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [8]:
# Baseline: score the time-decomposed features with linear regression and
# gradient-boosted trees (5-fold cross-validation, default scorer).
# errors='ignore' keeps this cell re-runnable once the raw timestamp column
# has already been dropped (a plain drop would raise KeyError the second time).
df = df.drop(columns=['pickup_datetime'], errors='ignore')
scaler = MinMaxScaler()
train_x = scaler.fit_transform(df)
Linear = LinearRegression()
# Fixed seed: without it the boosted-tree score changes from run to run, so
# small score differences between feature sets in later cells cannot be
# attributed to the features themselves.
GDBT = GradientBoostingRegressor(random_state=42)
print(f'linear reg score:{cross_val_score(Linear,train_x,train_y,cv=5).mean()}')
print(f'gradient boosting reg score:{cross_val_score(GDBT,train_x,train_y,cv=5).mean()}')

linear reg score:0.027396023418401704
gradient boosting reg score:0.8047557998845921


In [9]:
# Add features: longitude/latitude differences between drop-off and pickup,
# plus the straight-line distance between the two points in coordinate space.
lon_delta = df['dropoff_longitude'] - df['pickup_longitude']
lat_delta = df['dropoff_latitude'] - df['pickup_latitude']
df['longitude_diff'] = lon_delta
df['latitude_diff'] = lat_delta
df['distance_2D'] = (lon_delta**2 + lat_delta**2)**0.5

# Show the new columns next to the coordinates they were derived from.
preview_cols = [
    'distance_2D', 'longitude_diff', 'latitude_diff',
    'pickup_latitude', 'pickup_longitude',
    'dropoff_latitude', 'dropoff_longitude',
]
df[preview_cols]

Unnamed: 0,distance_2D,longitude_diff,latitude_diff,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,0.009761,0.009452,-0.002437,40.761071,-73.990580,40.758634,-73.981128
1,0.018307,-0.001244,0.018265,40.723431,-73.988403,40.741695,-73.989647
2,0.008140,0.003756,-0.007222,40.715110,-74.015785,40.707888,-74.012029
3,0.021056,0.019292,-0.008437,40.787275,-73.977322,40.778838,-73.958030
4,0.032964,0.007193,0.032170,40.729717,-73.989683,40.761887,-73.982490
...,...,...,...,...,...,...,...
4995,0.007632,0.007310,-0.002195,40.761397,-73.994150,40.759202,-73.986840
4996,0.010339,0.009745,-0.003455,40.729937,-73.986767,40.726482,-73.977022
4997,0.020579,-0.010392,0.017762,40.732297,-73.984612,40.750059,-73.995004
4998,0.010463,0.005015,0.009183,40.773415,-73.958460,40.782598,-73.953445


In [10]:
# Re-score after adding the distance features; the original note states that
# accuracy improves once domain knowledge is used.
# NOTE(review): the recorded scores barely move compared with the baseline
# cell - confirm the improvement is real and not run-to-run noise.
train_x = scaler.fit_transform(df)
linear_score = cross_val_score(Linear, train_x, train_y, cv=5).mean()
print(f'linear reg score:{linear_score}')
gdbt_score = cross_val_score(GDBT, train_x, train_y, cv=5).mean()
print(f'gradient boosting reg score:{gdbt_score}')

linear reg score:0.027396023418401704
gradient boosting reg score:0.8048232605010528


In [13]:
# Build a latitude-corrected distance and check how the scores react.
# `math` is imported in the notebook's top import cell rather than here, so
# later cells that call math.cos do not depend on this cell having run first.
latitude_average = df['pickup_latitude'].mean()
# Longitude/latitude scale factor: cosine of the mean pickup latitude
# (converted from degrees to radians), applied to longitude differences.
latitude_factor = math.cos(latitude_average/180*math.pi)
# Distance with the longitude difference scaled by the factor above.
df['distance_real'] = ((df['longitude_diff']*latitude_factor)**2+df['latitude_diff']**2)**0.5
train_x = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear,train_x,train_y,cv=5).mean()}')
print(f'gradient boosting reg score:{cross_val_score(GDBT,train_x,train_y,cv=5).mean()}')

Linear Reg Score : 0.02535400892787678
gradient boosting reg score:0.8004674044409616


In [14]:
# Predict the target from the new feature alone (all original columns
# ignored) to compare how well it performs on its own.
distance_only = df[['distance_real']]
train_x = scaler.fit_transform(distance_only)  # MinMax-scale the single column
linear_score = cross_val_score(Linear, train_x, train_y, cv=5).mean()
print(f'linear reg score:{linear_score}')
gdbt_score = cross_val_score(GDBT, train_x, train_y, cv=5).mean()
print(f'gradient boosting reg score:{gdbt_score}')

linear reg score:0.0014462469864102933
gradient boosting reg score:0.7236450666734193


In [19]:
# Hypothesis from the original note: east-west trips follow a more tortuous
# route, so they cost more - weight the longitude component up accordingly.
east_west_weight = 1.3  # hand-picked multiplier on top of the cosine factor
latitude_factor = math.cos(latitude_average/180*math.pi)*east_west_weight
weighted_lon = df['longitude_diff']*latitude_factor
df['distance_real'] = (weighted_lon**2 + df['latitude_diff']**2)**0.5

# Re-score with the re-weighted distance replacing the previous distance_real.
train_x = scaler.fit_transform(df)
linear_score = cross_val_score(Linear, train_x, train_y, cv=5).mean()
print(f'Linear Reg Score : {linear_score}')
gdbt_score = cross_val_score(GDBT, train_x, train_y, cv=5).mean()
print(f'gradient boosting reg score:{gdbt_score}')

Linear Reg Score : 0.027557324587768474
gradient boosting reg score:0.8053616489018728
