In [None]:
%config Completer.use_jedi = False

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

# Directory holding the competition CSV files (trailing slash included so the
# file names can be appended directly).
data_path = '/kaggle/input/bike-sharing-demand/'

# Read the three CSV files in a fixed order: train, test, sample submission.
train, test, submission = (
    pd.read_csv(f'{data_path}{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'sampleSubmission.csv')
)

In [None]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Remove training rows whose `weather` code equals 4, then stack the remaining
# training rows on top of the test rows with a fresh 0..n-1 index.
# NOTE(review): the meaning of weather code 4 is not visible here - confirm
# against the dataset description.
keep_rows = train['weather'] != 4
train = train.loc[keep_rows]
all_data = pd.concat(
    [train, test],
    ignore_index=True,
)

In [None]:
# These imports are no longer needed by this cell but are kept so that any
# other cell relying on them keeps working.
from datetime import datetime
import calendar

# Parse the timestamp column once instead of re-splitting the raw string for
# every derived feature.
timestamps = pd.to_datetime(all_data['datetime'])

# Calendar features are stored as integers rather than strings such as '2011'
# or '00', so the estimators receive numeric columns without relying on
# implicit string-to-float coercion. The numeric values are unchanged.
all_data['year'] = timestamps.dt.year
all_data['hour'] = timestamps.dt.hour
all_data['weekday'] = timestamps.dt.weekday  # Monday=0 ... Sunday=6, same as datetime.weekday()

# Columns removed before modelling. The intermediate `date` and `month`
# columns are no longer created, so they do not need to be listed here.
drop_features = ['casual', 'registered', 'datetime', 'windspeed']
all_data = all_data.drop(columns=drop_features)


In [None]:
# Rows that carry a `count` value form the training features; rows where
# `count` is missing form the test features. The target column itself is
# removed from both feature frames.
has_target = all_data['count'].notna()

X_train = all_data.loc[has_target].drop(columns='count')
X_test = all_data.loc[~has_target].drop(columns='count')

# Target values are taken from the (already filtered) training frame.
y = train['count']

X_train.head()

In [None]:
def rmsle(y_true, y_pred, convertExp=True):
    """Return the root mean squared logarithmic error between two value sets.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values.
    convertExp : bool, default True
        When True, both inputs are passed through ``np.exp`` first, i.e. they
        are treated as natural-log values and mapped back before scoring.

    Returns
    -------
    float
        ``sqrt(mean((log(y_true + 1) - log(y_pred + 1)) ** 2))``. Each log
        term goes through ``np.nan_to_num``, so NaN entries count as 0 and
        infinities as large finite numbers.
    """
    if convertExp:
        y_true, y_pred = np.exp(y_true), np.exp(y_pred)

    log_gap = np.nan_to_num(np.log(y_true + 1)) - np.nan_to_num(np.log(y_pred + 1))
    return np.sqrt(np.mean(log_gap ** 2))

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a plain linear regression on the natural log of the target, then score
# it on the same rows it was trained on (a training-set score, not a
# held-out one).
log_y = np.log(y)
linear_reg_model = LinearRegression().fit(X_train, log_y)

preds = linear_reg_model.predict(X_train)
linear_train_rmsle = rmsle(log_y, preds, True)

print(f'RMSLE: {linear_train_rmsle: .4f}')

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# Candidate hyper-parameters for ridge regression: a single iteration cap and
# a range of regularisation strengths.
ridge_params = dict(
    max_iter=[3000],
    alpha=[0.1, 1, 2, 3, 4, 10, 30, 100, 200, 300, 400, 800, 900, 1000],
)
ridge_model = Ridge()

# Cross-validation scoring function (RMSLE); lower is better, hence
# greater_is_better=False.
rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False)

# Build the grid-search object (5-fold cross-validation).
gridsearch_ridge_model = GridSearchCV(
    ridge_model,
    ridge_params,
    scoring=rmsle_scorer,
    cv=5,
)

# The search is run against the log-transformed target.
log_y = np.log(y)
gridsearch_ridge_model.fit(X_train, log_y)

print('최적 파라미터', gridsearch_ridge_model.best_params_)

In [None]:
# Score the best ridge estimator found by the grid search on the training
# rows themselves.
best_ridge = gridsearch_ridge_model.best_estimator_
preds = best_ridge.predict(X_train)

print(f'RMSLE: {rmsle(log_y, preds, True): .4f}')

In [None]:
from sklearn.linear_model import Lasso

# Lasso (L1-regularised) regression tuned with a grid search.
# The estimator has to be Lasso: creating a Ridge here would only repeat the
# ridge search with a different alpha grid and leave the Lasso import unused.
lasso_model = Lasso()

# Candidate alphas are the reciprocals of the values used for the ridge grid,
# i.e. from 10 down to 0.001.
lasso_alpha = 1/np.array([0.1, 1, 2, 3, 4, 10, 30,
                          100, 200, 300, 400, 800, 900, 1000])

lasso_params = {'max_iter': [3000], 'alpha': lasso_alpha}

# Build the grid-search object (5-fold cross-validation, RMSLE scoring).
gridsearch_lasso_model = GridSearchCV(estimator=lasso_model,
                                      param_grid=lasso_params,
                                      scoring=rmsle_scorer,
                                      cv=5)

# The search is run against the log-transformed target.
log_y = np.log(y)
gridsearch_lasso_model.fit(X_train, log_y)

print('최적 파라미터', gridsearch_lasso_model.best_params_)

In [None]:
# Score the best estimator from the lasso grid search on the training rows
# themselves.
best_lasso = gridsearch_lasso_model.best_estimator_
preds = best_lasso.predict(X_train)

print(f'RMSLE: {rmsle(log_y, preds, True): .4f}')

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Candidate settings for the random forest: a fixed seed and three forest
# sizes.
rf_params = dict(random_state=[42], n_estimators=[100, 120, 140])
randomforest_model = RandomForestRegressor()

# Build the grid-search object (5-fold cross-validation, RMSLE scoring).
gridsearch_randomforest_model = GridSearchCV(
    randomforest_model,
    rf_params,
    scoring=rmsle_scorer,
    cv=5,
)

# The search is run against the log-transformed target.
log_y = np.log(y)
gridsearch_randomforest_model.fit(X_train, log_y)

print('최적 파라미터', gridsearch_randomforest_model.best_params_)

In [None]:
# Score the best random forest found by the grid search on the training rows
# themselves.
best_forest = gridsearch_randomforest_model.best_estimator_
preds = best_forest.predict(X_train)

print(f'RMSLE: {rmsle(log_y, preds, True): .4f}')

In [None]:
# Output: predict on the test features with the tuned random forest, undo the
# log transform that was applied to the target, and write the submission file.
tuned_forest = gridsearch_randomforest_model.best_estimator_
randomforest_preds = tuned_forest.predict(X_test)

submission['count'] = np.exp(randomforest_preds)
submission.to_csv(path_or_buf='submission.csv', index=False)