In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

import string
import warnings
import missingno
warnings.filterwarnings('ignore')

In [2]:
# Read the train and test CSVs, then stack them (train rows first) under a
# fresh 0..n-1 index so feature engineering can run on both at once.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Study_For_MachineLearning/Bike/train.csv',
        '../Study_For_MachineLearning/Bike/test.csv',
    )
)
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [3]:
# (rows, columns) of the training frame; split_df slices at this row count.
print((len(df_train.index), len(df_train.columns)))

(10886, 12)


In [4]:
def split_df(df, n_train=10886):
    """Split the combined frame back into its train and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first ``n_train`` rows are the training rows, followed
        by the test rows.
    n_train : int, default 10886
        Number of leading rows that belong to the training set. The default
        is the row count of train.csv as printed in the previous cell.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``. Both are independent copies, so adding or
        overwriting columns on them never touches ``df``.
    """
    # .iloc makes the slice positional regardless of the index type.
    return df.iloc[:n_train].copy(), df.iloc[n_train:].copy()

### RMSLE 기반 예측을 위해서 Log필드를 추가한다.

In [5]:
# Store log(y + 1) of each target column; predict_bikecount trains on the
# casual/registered versions and inverts the transform afterwards.
log_targets = {
    'casual_log': 'casual',
    'registered_log': 'registered',
    'count_log': 'count',
}
for log_name, raw_name in log_targets.items():
    df_all[log_name] = np.log(df_all[raw_name] + 1)

### 시간필드 추가

In [6]:
# Parse the timestamp strings once and use them as the row index. The
# 'datetime' column itself is kept because the submission cells read it.
dt = pd.DatetimeIndex(df_all['datetime'])
df_all = df_all.set_index(dt)

# Calendar features derived from the timestamp.
df_all['date'] = dt.date
df_all['day'] = dt.day
df_all['month'] = dt.month
df_all['year'] = dt.year
df_all['hour'] = dt.hour
df_all['dow'] = dt.dayofweek
# DatetimeIndex.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is its replacement. Converting to a plain int64 array
# keeps the column a regular integer dtype and makes the assignment positional.
df_all['woy'] = dt.isocalendar().week.to_numpy(dtype='int64')

### 바쁜 시간대 타임 필드 추가

In [14]:
def func(df_data):
    """Score how busy a row's hour is, from 0 (off-peak) to 4 (busiest).

    ``df_data`` is a row-like mapping with ``'workingday'`` and ``'hour'``
    keys (it is used with ``DataFrame.apply(..., axis=1)``).

    When workingday == 1:
        hours 8, 17, 18 -> 4; hours 7, 16, 19 -> 3; anything else -> 0.
    Otherwise:
        hours 12..16 -> 2; remaining hours in 10..19 -> 1; anything else -> 0.
    """
    hour = df_data['hour']
    if df_data['workingday'] == 1:
        if hour in (8, 17, 18):
            return 4
        if hour in (7, 16, 19):
            return 3
        return 0
    if 12 <= hour <= 16:
        return 2
    if 10 <= hour <= 19:
        return 1
    return 0
# Evaluate the peak-hour rule once per row.
# NOTE(review): this runs before the cells that override 'workingday' for
# Dec 24/31, so those days are still scored with the original flag - confirm.
df_all['peak'] = df_all.apply(func, axis='columns')

### holiday 필드 보정 (12월 24일·31일을 휴일로 처리)

In [15]:
def func(df_data):
    """Return 1 for December 24 and 31, else the row's existing holiday flag.

    ``df_data`` is a row-like mapping with ``'month'``, ``'day'`` and
    ``'holiday'`` keys.
    """
    is_december = df_data['month'] == 12
    if is_december and df_data['day'] in (24, 31):
        return 1
    return df_data['holiday']
# Overwrite the holiday column with the adjusted per-row flag.
df_all['holiday'] = df_all.apply(func, axis='columns')

### workingday 필드 보정 (12월 24일·31일을 비근무일로 처리)

In [16]:
def func(df_data):
    """Return 0 for December 24 and 31, else the row's existing workingday flag.

    ``df_data`` is a row-like mapping with ``'month'``, ``'day'`` and
    ``'workingday'`` keys.
    """
    if df_data['month'] != 12:
        return df_data['workingday']
    if df_data['day'] in (24, 31):
        return 0
    return df_data['workingday']
# Overwrite the workingday column with the adjusted per-row flag.
df_all['workingday'] = df_all.apply(func, axis='columns')

### 온도, 풍속, 습도, 날씨 기반 fit & humid 필드 추가

In [17]:
def func(df_data):
    """Return 1 when the row meets every riding-condition threshold, else 0.

    All of these must hold for a 1: weather <= 2, windspeed <= 20 and
    15 < temp <= 35. ``df_data`` is a row-like mapping with ``'weather'``,
    ``'windspeed'`` and ``'temp'`` keys.
    """
    calm = df_data['weather'] <= 2 and df_data['windspeed'] <= 20
    comfortable = calm and 15 < df_data['temp'] <= 35
    return 1 if comfortable else 0

# Flag rows whose weather, wind and temperature all pass the thresholds.
df_all['fit'] = df_all.apply(func, axis='columns')

In [18]:
def func(df_data):
    """Return 1 when the row's humidity is 70 or higher, else 0.

    ``df_data`` is a row-like mapping with a ``'humidity'`` key.
    """
    return 1 if df_data['humidity'] >= 70 else 0

# Flag rows at or above the humidity threshold.
df_all['humid'] = df_all.apply(func, axis='columns')

$$ RMSLE = \sqrt{\dfrac{1}{N}\sum_{i=1}^{N} \left(\log(y_i + 1) - \log(\hat{y}_i + 1)\right)^2} $$

In [19]:
from sklearn.metrics import make_scorer

def get_rmsle(y_actual, y_pred):
    """Root mean squared logarithmic error between actual and predicted values.

    Parameters
    ----------
    y_actual : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_actual``.

    Returns
    -------
    float
        ``sqrt(mean((log(y_pred + 1) - log(y_actual + 1)) ** 2))``, with
        negative predictions treated as 0.
    """
    actual = np.asarray(y_actual, dtype=float)
    # A prediction below -1 makes log(y + 1) NaN, and one NaN turns the whole
    # score into NaN, which breaks candidate ranking in GridSearchCV. The
    # targets here are counts, so clamp predictions at 0 before the log.
    predicted = np.maximum(np.asarray(y_pred, dtype=float), 0)
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(log_diff)))

In [21]:
def predict_bikecount(model, select_columns):
    """Predict total rental counts for the test rows of ``df_all``.

    Two models are trained on the training rows, one on ``casual_log`` and
    one on ``registered_log``. Each prediction is mapped back from
    log(y + 1) space and the two are summed.

    Parameters
    ----------
    model : estimator
        Estimator with ``fit(X, y)`` and ``predict(X)`` that
        ``sklearn.base.clone`` can copy. It is used only as a template and
        is left unfitted/unchanged.
    select_columns : list-like of str
        Feature columns of ``df_all`` to train and predict on.

    Returns
    -------
    numpy.ndarray
        Predicted casual + registered counts, one value per test row.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from sklearn.base import clone

    train_part, test_part = split_df(df_all)
    X_train = train_part[select_columns]
    X_test = test_part[select_columns]

    total = np.zeros(len(X_test))
    for target in ('casual_log', 'registered_log'):
        # fit() returns the estimator itself, so re-fitting `model` would make
        # both "models" the same object and mutate the caller's estimator.
        # A fresh clone per target keeps them independent.
        estimator = clone(model)
        estimator.fit(X_train, train_part[target])
        # Invert log(y + 1). A negative count is never a better RMSLE guess
        # than 0 when the true count is non-negative, so clip at 0.
        total += np.maximum(np.expm1(estimator.predict(X_test)), 0)
    return total

### LinearRegression

In [22]:
# Re-split the engineered frame into train and test rows.
df_train, df_test = split_df(df_all)

# Feature columns shared by every model in the sections below.
ml_columns = [
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'atemp', 'humidity', 'windspeed',
    'day', 'month', 'year', 'hour', 'dow', 'woy',
    'peak', 'fit', 'humid',
]

# Inputs for the grid searches.
# NOTE(review): the searches fit on the raw 'count', while predict_bikecount
# trains on casual_log / registered_log - confirm tuning on a different
# target is intended.
y_train = df_train['count']
X_train = df_train.loc[:, ml_columns].copy()

# greater_is_better=False: the scorer reports the negated RMSLE, so a lower
# error ranks higher during model selection.
rmsle_scorer = make_scorer(get_rmsle, greater_is_better=False)

In [23]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least squares on the casual/registered log targets.
lr_model = LinearRegression()
ml_pred = predict_bikecount(lr_model, ml_columns)
# df_test is a slice of df_all. Build a new frame with the predictions
# instead of assigning a column into the slice; that pattern only ran
# quietly because warnings are globally ignored.
df_test = df_test.assign(count=ml_pred)
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submissions_linear.csv', header=True, index=False)

In [24]:
!kaggle competitions submit -c bike-sharing-demand -f submissions_linear.csv -m "Message"

Successfully submitted to Bike Sharing Demand



  0%|          | 0.00/250k [00:00<?, ?B/s]
  3%|3         | 8.00k/250k [00:00<00:03, 69.7kB/s]
 77%|#######6  | 192k/250k [00:00<00:00, 1.06MB/s] 
100%|##########| 250k/250k [00:02<00:00, 98.4kB/s]


### Lasso Regression

In [25]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Search space for Lasso: iteration cap and regularisation strength
# (alpha is given as the reciprocal of each listed value).
hyperparams = dict(
    max_iter=[1000, 1500, 2000, 2500, 3000],
    alpha=1 / np.array([0.1, 1, 2, 3, 4, 10, 30, 100, 200, 300, 400, 800, 900, 1000]),
)

# 5-fold grid search scored with RMSLE, using all available cores.
lasso_grid = GridSearchCV(
    Lasso(),
    hyperparams,
    scoring=rmsle_scorer,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
lasso_grid.fit(X_train, y_train)
print(lasso_grid.best_params_)

Fitting 5 folds for each of 70 candidates, totalling 350 fits
{'alpha': 10.0, 'max_iter': 1000}


In [26]:
# Refit the best Lasso configuration on the log targets and write the submission.
lasso_model = lasso_grid.best_estimator_
ml_pred = predict_bikecount(lasso_model, ml_columns)
# df_test is a slice of df_all. Build a new frame with the predictions
# instead of assigning a column into the slice; that pattern only ran
# quietly because warnings are globally ignored.
df_test = df_test.assign(count=ml_pred)
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submissions_lasso.csv', header=True, index=False)
!kaggle competitions submit -c bike-sharing-demand -f submissions_lasso.csv -m "Message"

Successfully submitted to Bike Sharing Demand



  0%|          | 0.00/246k [00:00<?, ?B/s]
  3%|3         | 8.00k/246k [00:00<00:03, 66.5kB/s]
 75%|#######4  | 184k/246k [00:00<00:00, 970kB/s]  
100%|##########| 246k/246k [00:03<00:00, 82.7kB/s]


### Ridge

In [27]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Search space for Ridge: iteration cap and regularisation strength.
hyperparams = dict(
    max_iter=[1000, 1500, 2000, 2500, 3000],
    alpha=[0.1, 1, 2, 3, 4, 10, 30, 100, 200, 300, 400, 800, 900, 1000],
)

# 5-fold grid search scored with RMSLE, using all available cores.
ridge_grid = GridSearchCV(
    Ridge(),
    hyperparams,
    scoring=rmsle_scorer,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
ridge_grid.fit(X_train, y_train)

# Show both the winning parameters and the refitted estimator.
print(ridge_grid.best_params_)
print(ridge_grid.best_estimator_)

Fitting 5 folds for each of 70 candidates, totalling 350 fits
{'alpha': 1000, 'max_iter': 1000}
Ridge(alpha=1000, max_iter=1000)


In [28]:
# Refit the best Ridge configuration on the log targets and write the submission.
ridge_model = ridge_grid.best_estimator_
ml_pred = predict_bikecount(ridge_model, ml_columns)
# df_test is a slice of df_all. Build a new frame with the predictions
# instead of assigning a column into the slice; that pattern only ran
# quietly because warnings are globally ignored.
df_test = df_test.assign(count=ml_pred)
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submissions_ridge.csv', header=True, index=False)
!kaggle competitions submit -c bike-sharing-demand -f submissions_ridge.csv -m "Message"

Successfully submitted to Bike Sharing Demand



  0%|          | 0.00/250k [00:00<?, ?B/s]
  3%|3         | 8.00k/250k [00:00<00:03, 70.0kB/s]
100%|##########| 250k/250k [00:02<00:00, 90.2kB/s] 


### Random Forest Regressor

In [29]:
from sklearn.model_selection import GridSearchCV

# Random-forest search space: 3 values for each of 4 parameters gives
# 81 candidates, i.e. 405 fits with 5-fold CV.
n_estimators = [800, 1000, 1200]
max_depth = [10, 12, 15]
min_samples_split = [4, 5, 6]
min_samples_leaf = [4, 5, 6]

hyperparams = {'n_estimators': n_estimators, 'max_depth': max_depth,
               'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf}
# Random forests are stochastic. Fix the seed so the selected parameters are
# reproducible across kernel restarts.
rf_grid = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                       param_grid=hyperparams,
                       verbose=True, scoring=rmsle_scorer, cv=5, n_jobs=-1)
rf_grid.fit(X_train, y_train)
print(rf_grid.best_params_)
## Interrupted manually because the search took too long.

Fitting 5 folds for each of 81 candidates, totalling 405 fits


KeyboardInterrupt: 

In [None]:
# rf_model = rf_grid.best_estimator_
# ml_pred = predict_bikecount(rf_model, ml_columns)
# df_test['count'] = ml_pred
# final_df = df_test[['datetime', 'count']].copy()
# final_df.to_csv('submissions_rf.csv', header=True, index=False)
# !kaggle competitions submit -c bike-sharing-demand -f submissions_rf.csv -m "Message"

### XGBoost

In [30]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# XGBoost search space (144 candidates in total).
hyperparams = dict(
    nthread=[4],
    learning_rate=[0.05, 0.1, 0.15],
    max_depth=[4, 5],
    min_child_weight=[3, 4, 5],
    subsample=[0.7, 0.8],
    colsample_bytree=[0.6, 0.7],
    n_estimators=[250, 500],
)

# 5-fold grid search scored with RMSLE, using all available cores.
xgb_grid = GridSearchCV(
    XGBRegressor(),
    hyperparams,
    scoring=rmsle_scorer,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
xgb_grid.fit(X_train, y_train)
print(xgb_grid.best_params_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
{'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 250, 'nthread': 4, 'subsample': 0.7}


In [31]:
# Refit the best XGBoost configuration on the log targets and write the submission.
xgb_model = xgb_grid.best_estimator_
# X_train was built from ml_columns, so passing ml_columns selects the same
# features and matches the other model sections.
ml_pred = predict_bikecount(xgb_model, ml_columns)
# df_test is a slice of df_all. Build a new frame with the predictions
# instead of assigning a column into the slice; that pattern only ran
# quietly because warnings are globally ignored.
df_test = df_test.assign(count=ml_pred)
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submissions_xgboost.csv', header=True, index=False)
!kaggle competitions submit -c bike-sharing-demand -f submissions_xgboost.csv -m "Message"

Successfully submitted to Bike Sharing Demand



  0%|          | 0.00/194k [00:00<?, ?B/s]
  4%|4         | 8.00k/194k [00:00<00:02, 65.5kB/s]
 95%|#########4| 184k/194k [00:00<00:00, 981kB/s]  
100%|##########| 194k/194k [00:02<00:00, 68.1kB/s]


### Gradient Boosting Regressor

In [32]:
from sklearn.model_selection import GridSearchCV

# Gradient-boosting search space: 3 values for each of 5 parameters gives
# 243 candidates, i.e. 1215 fits with 5-fold CV.
n_estimators = [100, 150, 200]
max_depth = [5, 7, 9]
min_samples_leaf = [8, 10, 12]
learning_rate = [0.1, 0.15, 0.2]
subsample = [0.6, 0.7, 0.8]

hyperparams = {'n_estimators': n_estimators, 'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf,
               'learning_rate': learning_rate, 'subsample': subsample
               }

# subsample < 1 makes each boosting stage draw a random subset of rows, so
# fix the seed to keep the selected parameters reproducible.
gb_grid = GridSearchCV(estimator=GradientBoostingRegressor(random_state=42),
                       param_grid=hyperparams,
                       verbose=True, scoring=rmsle_scorer, cv=5, n_jobs=-1)

gb_grid.fit(X_train, y_train)
print(gb_grid.best_params_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
{'learning_rate': 0.1, 'max_depth': 9, 'min_samples_leaf': 8, 'n_estimators': 100, 'subsample': 0.7}


In [33]:
# Refit the best gradient-boosting configuration on the log targets and write the submission.
gb_model = gb_grid.best_estimator_
# X_train was built from ml_columns, so passing ml_columns selects the same
# features and matches the other model sections.
ml_pred = predict_bikecount(gb_model, ml_columns)
# df_test is a slice of df_all. Build a new frame with the predictions
# instead of assigning a column into the slice; that pattern only ran
# quietly because warnings are globally ignored.
df_test = df_test.assign(count=ml_pred)
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submissions_gb.csv', header=True, index=False)
!kaggle competitions submit -c bike-sharing-demand -f submissions_gb.csv -m "Message"

Successfully submitted to Bike Sharing Demand



  0%|          | 0.00/250k [00:00<?, ?B/s]
  3%|3         | 8.00k/250k [00:00<00:03, 63.0kB/s]
 74%|#######3  | 184k/250k [00:00<00:00, 946kB/s]  
100%|##########| 250k/250k [00:03<00:00, 71.9kB/s]
