In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the data
# 'timestamp' is parsed as datetimes so the .dt accessor can be used on it later.

df = pd.read_csv('/kaggle/input/london-bike-sharing-dataset/london_merged.csv', parse_dates = ['timestamp'])
df.head()

#### 변수 설명
##### cnt : 시간대별 자전거 이용수
##### t1 : 시간대별 온도
##### t2 : 체감 온도
##### hum : 습도
##### wind_speed : 풍속
##### weather_code : 날씨 코드
##### is_holiday : 1 공휴일, 0 비공휴일
##### is_weekend : 1 주말, 0 평일
##### season : 0 봄, 1 여름, 2 가을, 3 겨울

In [None]:
# Data types and structure

for label, value in (
    ('데이터의 타입은:', df.dtypes),
    ('데이터의 구조는:', df.shape),
    ('데이터의 컬럼은:', df.columns),
):
    print(label, value)

In [None]:
# Check for missing values: number of NA entries in each column

df.isna().sum()

In [None]:
# Visualise missing values with missingno's matrix plot
msno.matrix(df)
plt.show()

In [None]:
# Derive calendar features from the timestamp column

for part in ("year", "month", "dayofweek", "hour"):
    # Each feature is the same-named attribute of the .dt accessor.
    df[part] = getattr(df["timestamp"].dt, part)
df.head()

In [None]:
# Number of rows recorded for each year
df["year"].value_counts()

In [None]:
# Exploratory analysis
# Distribution of cnt by year

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. ax=ax draws on the axes created above.
sns.boxplot(x='year', y='cnt', data=df, ax=ax)
ax.set_title('cnt by year')
plt.show()

In [None]:
# Distribution of cnt by month

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. ax=ax draws on the axes created above.
sns.boxplot(x='month', y='cnt', data=df, ax=ax)
ax.set_title('cnt by month')
plt.show()

In [None]:
# Distribution of cnt by day of week

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. ax=ax draws on the axes created above.
sns.boxplot(x='dayofweek', y='cnt', data=df, ax=ax)
ax.set_title('cnt by day of week')
plt.show()

In [None]:
# Distribution of cnt by hour of day

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. ax=ax draws on the axes created above.
sns.boxplot(x='hour', y='cnt', data=df, ax=ax)
ax.set_title('cnt by hour')
plt.show()

In [None]:
# Plotting helper

def plot_bar(data, feature):
    """Draw a vertical seaborn bar plot of ``cnt`` against ``feature``.

    Opens a new figure with figsize (12, 3) on every call and returns nothing.

    Parameters
    ----------
    data : DataFrame holding a ``cnt`` column and the ``feature`` column.
    feature : name of the column placed on the x axis.
    """
    plt.figure(figsize=(12, 3))
    sns.barplot(data=data, x=feature, y='cnt', orient='v', palette='Set3')

In [None]:
# Bar plot of cnt for each weather_code value
plot_bar(df, 'weather_code')

In [None]:
# Bar plot of cnt for each distinct t2 value
# NOTE(review): t2 is described as a temperature, so this presumably yields one
# bar per distinct reading — consider binning; confirm against the data.
plot_bar(df, 't2')

In [None]:
# Outlier removal

def is_outlier(s, n_sigma=3):
    """Flag values lying more than ``n_sigma`` standard deviations from the mean.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to check.
    n_sigma : float, default 3
        Half-width of the accepted band, in multiples of ``s.std()``.

    Returns
    -------
    pandas.Series
        Boolean mask with the same index as ``s``; True where the value is
        outside ``[mean - n_sigma * std, mean + n_sigma * std]``.
    """
    # Compute each statistic once instead of once per bound.
    center = s.mean()
    spread = s.std() * n_sigma
    return ~s.between(center - spread, center + spread)

In [None]:
# Flag outliers separately within each hour of day.
# transform (rather than apply) returns a mask carrying df's own row index, so
# it can be used for boolean indexing on every pandas version; on pandas >= 2.0
# groupby.apply prepends the group key to the index and the mask no longer aligns.
outlier_mask = df.groupby('hour')['cnt'].transform(is_outlier).astype(bool)
# .copy() makes df_out an independent frame, so later column assignments on it
# do not raise SettingWithCopyWarning.
df_out = df[~outlier_mask].copy()

print ("이상치 제거전: ",df.shape)
print ("이상치 제거후: ",df_out.shape)

In [None]:
# Inspect the column dtypes of the outlier-filtered frame
df_out.dtypes

In [None]:
# Cast the discrete columns to the category dtype in a single astype call.
# astype returns a new frame, which avoids assigning columns on a filtered
# slice of df (the source of SettingWithCopyWarning).
categorical_cols = ['weather_code', 'season', 'year', 'month', 'hour']
df_out = df_out.astype({col: 'category' for col in categorical_cols})

In [None]:
# One-hot encode the categorical columns.
# dtype=float keeps every feature numeric: pandas >= 2.0 emits bool dummies by
# default, and a frame mixing bool and float columns converts to an object
# array, which Keras cannot turn into a tensor.
df_out = pd.get_dummies(
    df_out,
    columns=['weather_code', 'season', 'year', 'month', 'hour'],
    dtype=float,
)
df_out.head()

In [None]:
# Separate the features (x) from the target (y)

# Features: everything except the raw timestamp and the target itself.
df_x = df_out.drop(columns=['timestamp', 'cnt'])
df_y = df_out["cnt"]
df_x.head()

In [None]:
# Split into training and test sets

from sklearn.model_selection import train_test_split
# shuffle=False keeps the rows in their existing order for the 70/30 split.
# NOTE(review): random_state presumably has no effect when shuffle=False — confirm.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.3,
    shuffle=False,
    random_state=66,
)

In [None]:
# Report the shape of each split.
for label, part in (
    ('x_train shape:', x_train),
    ('y_train shape :', y_train),
    ('x_test shape :', x_test),
    ('y_test shape :', y_test),
):
    print(label, part.shape)

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

# Fully connected regression network: 160 -> 60 -> 20 -> 1.
# The input width is taken from the training matrix instead of a hard-coded 57,
# so the model still builds if the number of one-hot columns changes.
n_features = x_train.shape[1]

model = Sequential()
model.add(Dense(units=160, activation='relu', input_dim=n_features))
model.add(Dense(units=60, activation='relu'))
model.add(Dense(units=20, activation='relu'))
# Single linear unit: unbounded real-valued prediction of cnt.
model.add(Dense(units=1, activation='linear'))

In [None]:
model.compile(loss='mae', optimizer='adam', metrics=['mae'])
# Stop on the validation loss, not the training loss: a validation split is
# supplied below, and the training loss can keep falling while the model overfits.
# restore_best_weights rolls the model back to its best validation epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', restore_best_weights=True)
# NOTE(review): batch_size=1 performs one weight update per sample and makes
# every epoch very slow; a larger batch is worth trying.
history = model.fit(x_train, y_train, epochs=50, batch_size=1, validation_split = 0.1, callbacks=[early_stopping])

In [None]:
# Training curves: validation loss is drawn first, then training loss,
# matching the order of the legend entries.
for curve in ('val_loss', 'loss'):
    plt.plot(history.history[curve])
plt.legend(['val_loss','loss'])
plt.title("Model val_loss")
plt.xlabel('Epochs')
plt.ylabel('val_loss')
plt.show()

In [None]:
# Predict on the held-out test features with the trained network
y_predict = model.predict(x_test)

In [None]:
from sklearn.metrics import mean_squared_error

def RMSE(y_test, y_predict):
    """Return the root-mean-squared error between targets and predictions.

    Computed as the square root of sklearn's ``mean_squared_error``.
    """
    mse = mean_squared_error(y_test, y_predict)
    return np.sqrt(mse)


# Score the network's test-set predictions.
print("RMSE:", RMSE(y_test, y_predict))

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest: fit on the training split, score on the test split.
rf = RandomForestRegressor(random_state=16, n_estimators=100).fit(x_train, y_train)
rf_result = rf.predict(x_test)
print("RMSE:", RMSE(y_test, rf_result))

In [None]:
from xgboost import XGBRegressor
# XGBoost: fit on the training split, score on the test split.
xgb = XGBRegressor(random_state=16, n_estimators=100).fit(x_train, y_train)
xgb_result = xgb.predict(x_test)
print("RMSE:", RMSE(y_test, xgb_result))

In [None]:
from lightgbm import LGBMRegressor
# LightGBM: fit on the training split, score on the test split.
lgb = LGBMRegressor(random_state=16, n_estimators=100).fit(x_train, y_train)
lgb_result = lgb.predict(x_test)
print("RMSE:", RMSE(y_test, lgb_result))

In [None]:
xgb=pd.DataFrame(xgb_result)
rf=pd.DataFrame(rf_result)
dnn=pd.DataFrame(y_predict)
compare = pd.DataFrame(y_test).reset_index(drop=True)

In [None]:
compare['xgb']=xgb
compare['rf']=rf
compare['dnn']=dnn
compare.head()

In [None]:
# Overlay the density of the true counts and of each model's predictions.
fig, ax = plt.subplots()
for column, color in (('cnt', 'r'), ('xgb', 'b'), ('rf', 'y'), ('dnn', 'g')):
    # fill= replaces the deprecated shade= keyword; label= feeds the legend so
    # the four curves can be told apart.
    sns.kdeplot(compare[column], fill=True, color=color, label=column, ax=ax)

ax.set_xlabel('Values')
# A KDE's vertical axis is a density, not a frequency.
ax.set_ylabel('Density')
ax.set_title('Kernel Density Estimation Plot')
ax.legend()
plt.show()