In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
import sys

#경고창이 뜨지 않도록 해주는 것
import warnings
warnings.filterwarnings('ignore')

# notebook을 실행한 브라우저에서 바로 그림을 볼 수 있게 해주는 것
%matplotlib inline 

In [2]:
import platform                

from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus']= False

if platform.system() == 'Darwin': #맥os 사용자의 경우에
    plt.style.use('seaborn-darkgrid') 
    rc('font', family = 'AppleGothic')
    
elif platform.system() == 'Windows':#윈도우 사용자의 경우에
    path = 'c:/Windows/Fonts/malgun.ttf'
    font_name = font_manager.FontProperties(fname=path).get_name()
    plt.style.use('seaborn-darkgrid') # https://python-graph-gallery.com/199-matplotlib-style-sheets/
    rc('font', family=font_name)

In [12]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [4]:
df = pd.read_csv('NYC_taxi_extra_info.csv')

In [5]:
df['weekday_sin']=np.sin(2*np.pi*df.weekday/7)
df['weekday_cos']=np.cos(2*np.pi*df.weekday/7)
df['hour_sin']=np.sin(2*np.pi*df.hour/24)
df['hour_cos']=np.cos(2*np.pi*df.hour/24)

In [6]:
df['pickup_hour'] = pd.to_datetime(df['pickup_hour'])
df['zip_code'] = df['zip_code'].astype(str) 

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87020 entries, 0 to 87019
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   zip_code          87020 non-null  object        
 1   pickup_hour       87020 non-null  datetime64[ns]
 2   month             87020 non-null  int64         
 3   day               87020 non-null  int64         
 4   weekday           87020 non-null  int64         
 5   hour              87020 non-null  int64         
 6   is_weekend        87020 non-null  int64         
 7   cnt               87020 non-null  int64         
 8   lat               87020 non-null  float64       
 9   lon               87020 non-null  float64       
 10  temp              87020 non-null  float64       
 11  humid             87020 non-null  float64       
 12  pressure          87020 non-null  float64       
 13  wind_direction    87020 non-null  float64       
 14  wind_speed        8702

In [8]:
df.columns

Index(['zip_code', 'pickup_hour', 'month', 'day', 'weekday', 'hour',
       'is_weekend', 'cnt', 'lat', 'lon', 'temp', 'humid', 'pressure',
       'wind_direction', 'wind_speed', 'des', 'wind_x', 'wind_y', 'is_raining',
       'is_snowing', 'is_cloudy', 'is_clear', 'weather_strength',
       'weekday_sin', 'weekday_cos', 'hour_sin', 'hour_cos'],
      dtype='object')

In [7]:
x=df[[ 'weekday_sin', 'weekday_cos', 'hour_sin', 'hour_cos', 'is_weekend', 'lat', 'lon', 'temp', 'weather_strength']]
y=df['cnt']

#[ 'weekday_sin', 'weekday_cos', 'hour_sin', 'hour_cos', 'is_weekend', 'lat', 'lon', 'temp', 'is_raining', 'is_snowing', 'is_cloudy', 'is_clear', 'weather_strength']

In [8]:
x

Unnamed: 0,weekday_sin,weekday_cos,hour_sin,hour_cos,is_weekend,lat,lon,temp,weather_strength
0,0.433884,-0.900969,0.000000,1.000000,1,40.761229,-73.929593,-5.100000,0
1,0.433884,-0.900969,0.000000,1.000000,1,40.708312,-74.013440,-5.100000,0
2,0.433884,-0.900969,0.000000,1.000000,1,40.692500,-73.991763,-5.100000,0
3,0.433884,-0.900969,0.000000,1.000000,1,40.653199,-73.784133,-5.100000,0
4,0.433884,-0.900969,0.000000,1.000000,1,40.837391,-73.941015,-5.100000,0
...,...,...,...,...,...,...,...,...,...
87015,-0.974928,-0.222521,-0.258819,0.965926,1,40.632650,-73.996601,-6.175519,0
87016,-0.974928,-0.222521,-0.258819,0.965926,1,40.829743,-73.908570,-6.175519,0
87017,-0.974928,-0.222521,-0.258819,0.965926,1,40.750423,-73.819936,-6.175519,0
87018,-0.974928,-0.222521,-0.258819,0.965926,1,40.627427,-73.945651,-6.175519,0


In [25]:
scaler=MinMaxScaler()

In [12]:
col=['lat','lon','temp']
scaled=scaler.fit_transform(x[col])

In [13]:
scaled_df=pd.DataFrame(scaled,columns=col,index=x.index)
scaled_df

Unnamed: 0,lat,lon,temp
0,0.061959,0.640375,0.397084
1,0.047486,0.618693,0.397084
2,0.043162,0.624299,0.397084
3,0.032413,0.677989,0.397084
4,0.082789,0.637422,0.397084
...,...,...,...
39995,0.039463,0.642778,0.361507
39996,0.070388,0.636535,0.361507
39997,0.051524,0.619749,0.361507
39998,0.075401,0.635007,0.361507


In [14]:
before_merge=x.drop(col,axis=1)
before_merge

Unnamed: 0,weekday_sin,weekday_cos,hour_sin,hour_cos,is_weekend,is_raining,is_snowing,is_cloudy,is_clear,weather_strength
0,0.433884,-0.900969,0.000000,1.000000,1,0,0,0,1,0
1,0.433884,-0.900969,0.000000,1.000000,1,0,0,0,1,0
2,0.433884,-0.900969,0.000000,1.000000,1,0,0,0,1,0
3,0.433884,-0.900969,0.000000,1.000000,1,0,0,0,1,0
4,0.433884,-0.900969,0.000000,1.000000,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
39995,0.433884,-0.900969,0.707107,0.707107,0,0,0,1,0,3
39996,0.433884,-0.900969,0.707107,0.707107,0,0,0,1,0,3
39997,0.433884,-0.900969,0.707107,0.707107,0,0,0,1,0,3
39998,0.433884,-0.900969,0.707107,0.707107,0,0,0,1,0,3


In [15]:
scaled_x=pd.concat([scaled_df,before_merge],axis=1)

In [16]:
scaled_x

Unnamed: 0,lat,lon,temp,weekday_sin,weekday_cos,hour_sin,hour_cos,is_weekend,is_raining,is_snowing,is_cloudy,is_clear,weather_strength
0,0.061959,0.640375,0.397084,0.433884,-0.900969,0.000000,1.000000,1,0,0,0,1,0
1,0.047486,0.618693,0.397084,0.433884,-0.900969,0.000000,1.000000,1,0,0,0,1,0
2,0.043162,0.624299,0.397084,0.433884,-0.900969,0.000000,1.000000,1,0,0,0,1,0
3,0.032413,0.677989,0.397084,0.433884,-0.900969,0.000000,1.000000,1,0,0,0,1,0
4,0.082789,0.637422,0.397084,0.433884,-0.900969,0.000000,1.000000,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0.039463,0.642778,0.361507,0.433884,-0.900969,0.707107,0.707107,0,0,0,1,0,3
39996,0.070388,0.636535,0.361507,0.433884,-0.900969,0.707107,0.707107,0,0,0,1,0,3
39997,0.051524,0.619749,0.361507,0.433884,-0.900969,0.707107,0.707107,0,0,0,1,0,3
39998,0.075401,0.635007,0.361507,0.433884,-0.900969,0.707107,0.707107,0,0,0,1,0,3


In [9]:
y

0         75
1         64
2        169
3        222
4         64
        ... 
87015      1
87016      3
87017      1
87018      1
87019      1
Name: cnt, Length: 87020, dtype: int64

In [10]:
model=ExtraTreesRegressor(n_estimators=350)

In [59]:
#MinMaxScaler 적용
score=cross_val_score(model,scaled_x,y,scoring="neg_mean_absolute_error",cv=5)
print("score: {}".format(score))
print("average: "+ str(sum(score)/len(score)))

score: [-44.21144162 -23.22713801 -25.09198518 -24.71198805 -39.65467077]
average: -31.379444725350492


In [27]:
#scaler 없이!
score=cross_val_score(model,x,y,scoring="neg_mean_absolute_error",cv=5)
print("score: {}".format(score))
print("average: "+ str(sum(score)/len(score)))

score: [-44.25108743 -23.21250648 -24.91760827 -24.79787586 -39.68769232]
average: -31.373354072955316


In [13]:
#RF에서 한 것처럼 영향없는 변수 4개 제거하고 GridSearch
params={
    'n_estimators':[300,400],
    'min_samples_split':[2,3],
    'min_samples_leaf':[1,2]
}
model=ExtraTreesRegressor()
grid=GridSearchCV(model,param_grid=params,cv=5,refit=True,scoring='neg_mean_absolute_error')
grid.fit(x,y)
print('best parameters: ',grid.best_params_)
print('best score: ',grid.best_score_)

best parameters:  {'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 400}
best score:  -30.493038942197195


In [14]:
#GridSearch
params={
    'n_estimators':[400,500],
    'min_samples_split':[3,4],
    'min_samples_leaf':[1,2]
}
model=ExtraTreesRegressor()
grid=GridSearchCV(model,param_grid=params,cv=5,refit=True,scoring='neg_mean_absolute_error')
grid.fit(x,y)
print('best parameters: ',grid.best_params_)
print('best score: ',grid.best_score_)

best parameters:  {'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 500}
best score:  -30.484351551367503


### 그리드서치 결과, ExtraTreesRegressor()의 n_estimator=500, min_samples_leaf=1, min_samples_split=3일 때 최소 MAE 30.48이 나왔음.