# 1. 문제정의
- 자전거 렌탈 수요량을 예측
- 지도학습/회귀

# 2. 데이터수집

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the train and test splits of the bike-sharing data.
train, test = (
    pd.read_csv("./data/bike-sharing system/train.csv"),
    pd.read_csv("./data/bike-sharing system/test.csv"),
)

In [None]:
# (rows, columns) of each split, train first.
tuple(frame.shape for frame in (train, test))

In [None]:
# Preview the first five training rows (head() defaults to n=5).
train.head()

- datetime = 날짜 및 시간
- season = 계절(1-봄 / 2-여름 / 3-가을 / 4-겨울)
- holiday = 공휴일(1-공휴일 / 0-비공휴일)
- workingday = 일하는 날(1-일하는날 / 0-쉬는날)
- weather = 날씨
  - 1 = 맑음, 약간 구름, 부분 구름
  - 2 = 안개+흐림, 안개+끊어진 구름, 안개+약간 구름, 안개
  - 3 = 적은 눈, 적은 비+뇌우+흩어진 구름, 적은 비+흩어진 구름
  - 4 = 폭우+우박+뇌우+안개, 눈+안개
- temp = 섭씨 온도
- atemp = 체감 섭씨 온도
- humidity = 상대 습도
- windspeed = 풍속
- casual = 등록되지 않은 사용자 렌탈수
- registered = 등록된 사용자 렌탈수
- count = 렌탈 수요량

In [None]:
# Preview the first five test rows (head() defaults to n=5).
test.head()

# 3. 전처리 & EDA
- 결측치 처리
- 인코딩
- 특성공학

#### 결측치 처리

In [None]:
# Check for missing values: compare each column's non-null count with the
# total number of rows reported by info().
train.info()

In [None]:
# Same missing-value check for the test split.
test.info()

#### 날짜 전처리

In [None]:
# Look at the first datetime value as it was read from the CSV.
train['datetime'][0]

In [None]:
# Split the value on the space to separate the date from the time.
first_stamp_parts = None  # placeholder removed below; keeps the cell a single expression
del first_stamp_parts
train['datetime'][0].split(' ')

In [None]:
# Take the date part (text before the first space) and split it on '-'.
train['datetime'][0].partition(' ')[0].split('-')

In [None]:
# First '-'-separated field of the date part.
train['datetime'][0].partition(' ')[0].partition('-')[0]

In [None]:
# Reload both splits with parse_dates so that `datetime` is parsed into a
# datetime column instead of plain text (needed for the `.dt` accessor).
train, test = (
    pd.read_csv("./data/bike-sharing system/train.csv", parse_dates=['datetime']),
    pd.read_csv("./data/bike-sharing system/test.csv", parse_dates=['datetime']),
)

In [None]:
# Re-inspect the column dtypes after reloading with parse_dates.
train.info()

In [None]:
# Expand `datetime` into one column per component; each column is named after
# the `.dt` attribute it is read from (`dayofweek` is the weekday number).
for part in ('year', 'month', 'day', 'hour', 'minute', 'second', 'dayofweek'):
    train[part] = getattr(train['datetime'].dt, part)

In [None]:
# Confirm the new datetime-derived columns were added.
train.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# 2x3 grid of bar plots: mean `count` for each datetime component.
figure, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(nrows=2, ncols=3)
figure.set_size_inches(16, 8)
panels = zip(
    ('year', 'month', 'day', 'hour', 'minute', 'second'),
    (ax1, ax2, ax3, ax4, ax5, ax6),
)
for column, axis in panels:
    sns.barplot(data=train, x=column, y='count', ax=axis)


In [None]:
# Display the test frame.
test

In [None]:
# Columns used as model inputs without further encoding.
feature = [
    'year',
    'month',
    'hour',
]
# Columns that will be one-hot encoded; filled in as the EDA proceeds.
cat_feature = list()

In [None]:
# Mean rental count per weekday.
sns.barplot(
    x='dayofweek',
    y='count',
    data=train,
)


In [None]:
# Add the same datetime-derived columns to the test split; each column is
# named after the `.dt` attribute it is read from.
for part in ('year', 'month', 'day', 'hour', 'minute', 'second', 'dayofweek'):
    test[part] = getattr(test['datetime'].dt, part)

#### 계절

In [None]:
# Mean rental count per season code.
sns.barplot(
    x='season',
    y='count',
    data=train,
)

In [None]:
# Series.map with a dict is an alternative to replace(): relabel the numeric
# season codes with their names.
train['season'] = train['season'].map(
    dict(zip((1, 2, 3, 4), ('Spring', 'Summer', 'Fall', 'Winter')))
)

In [None]:
# Apply the same season-code relabelling to the test split.
test['season'] = test['season'].map(
    dict(zip((1, 2, 3, 4), ('Spring', 'Summer', 'Fall', 'Winter')))
)

In [None]:
# Register `season` as a categorical (one-hot) feature.
cat_feature.append('season')

#### 공휴일

In [None]:
# Bar height is the mean `count` per group. The thin line on each bar is
# seaborn's error bar, which by default is a bootstrapped 95% confidence
# interval of the mean (not the standard deviation).
sns.barplot(
    x='holiday',
    y='count',
    data=train,
)

In [None]:
# Reading the box plot:
#   line inside the box -> median
#   bottom of the box   -> first quartile (25%)
#   top of the box      -> third quartile (75%)
#   individual points   -> outliers
sns.boxplot(
    x='holiday',
    y='count',
    data=train,
)

In [None]:
# Use `holiday` directly as a model input.
feature.append('holiday')

#### 일하는날

In [None]:
# Mean rental count on working vs. non-working days.
sns.barplot(
    x='workingday',
    y='count',
    data=train,
)

In [None]:
# Distribution of rental counts on working vs. non-working days.
sns.boxplot(
    x='workingday',
    y='count',
    data=train,
)

In [None]:
# Hourly demand, drawn as one line per `workingday` value.
sns.pointplot(
    x="hour",
    y="count",
    hue="workingday",
    data=train,
)

In [None]:
# Hourly demand, drawn as one line per `holiday` value.
sns.pointplot(
    x="hour",
    y="count",
    hue="holiday",
    data=train,
)

In [None]:
# Hourly demand, drawn as one line per weekday.
sns.pointplot(
    x="hour",
    y="count",
    hue="dayofweek",
    data=train,
)

In [None]:
# `workingday` is used as-is; `dayofweek` is queued for one-hot encoding.
feature.append("workingday")
cat_feature.append("dayofweek")

#### 날씨

In [None]:
# Mean rental count per weather code.
sns.barplot(
    x='weather',
    y='count',
    data=train,
)

In [None]:
# Training rows recorded with weather code 4.
train.loc[train['weather'] == 4]

In [None]:
# Test rows recorded with weather code 4.
test.loc[test['weather'] == 4]

In [None]:
# Register `weather` as a categorical (one-hot) feature.
cat_feature.append('weather')

#### 온도, 체감온도, 습도, 풍속, 등록되지않는 사용자, 등록된 사용자, 전체개수

In [None]:
# Pairwise correlations between the numeric columns and the targets.
train.loc[
    :,
    ['temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
].corr()

In [None]:
# Heatmap of the correlation matrix; cell colour encodes the coefficient and
# annot=True prints the value in each cell.
heatmap = sns.heatmap(
    train[['temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']].corr(),
    annot=True,
)
# Set the y-axis limits explicitly to span all 7 rows.
# NOTE(review): presumably a workaround for matplotlib versions that clip the
# first/last heatmap rows -- confirm it is still needed. (The previous bare
# `heatmap.get_ylim()` call discarded its result and was removed.)
heatmap.set_ylim(0, 7)

In [None]:
# Rental demand as a function of temperature.
sns.lineplot(
    x='temp',
    y='count',
    data=train,
)

In [None]:
# Raw casual vs. registered rentals against temperature, on one set of axes.
sns.lineplot(x='temp', y='casual', label='casual', data=train)
sns.lineplot(x='temp', y='registered', label='registered', data=train)

In [None]:
# Use scaling to put the two distributions on a common scale: casual and
# registered rental volumes differ, which makes it hard to judge the
# difference between them from the raw values.
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize each user-type count with its own scaler; fit_transform fits
# the scaler and transforms the column in a single step.
scaler = StandardScaler()
train['registered_sc'] = scaler.fit_transform(train[['registered']])

scaler1 = StandardScaler()
train['casual_sc'] = scaler1.fit_transform(train[['casual']])

In [None]:
# Standardized casual vs. registered rentals against temperature.
sns.lineplot(x='temp', y='casual_sc', label='casual', data=train)
sns.lineplot(x='temp', y='registered_sc', label='registered', data=train)

In [None]:
# Standardized casual vs. registered rentals against "feels like" temperature.
sns.lineplot(x='atemp', y='casual_sc', label='casual', data=train)
sns.lineplot(x='atemp', y='registered_sc', label='registered', data=train)

- 서로 비슷한(상관관계가 높은) 특성이 많으면 과대적합이 생기기 쉬움 => temp와 거의 같은 추세를 보이는 atemp는 제외

In [None]:
# Standardized casual vs. registered rentals against humidity.
sns.lineplot(x='humidity', y='casual_sc', label='casual', data=train)
sns.lineplot(x='humidity', y='registered_sc', label='registered', data=train)

In [None]:
# Rows with humidity below 20.
train.loc[train['humidity'].lt(20)]

In [None]:
# Standardized casual vs. registered rentals against wind speed.
sns.lineplot(x='windspeed', y='casual_sc', label='casual', data=train)
sns.lineplot(x='windspeed', y='registered_sc', label='registered', data=train)

In [None]:
# Rows with wind speed above 45.
train.loc[train['windspeed'].gt(45)]

In [None]:
# Use temperature and humidity directly as model inputs.
feature.extend(['temp', 'humidity'])

In [None]:
# Model inputs (the columns listed in `feature`) for both splits, and the
# training target `count`.
X_train, y_train, X_test = train[feature], train['count'], test[feature]

In [None]:
# One-hot encode every categorical column and append the dummy columns to the
# plain `feature` columns. The frame is rebuilt from `train[feature]` in a
# single concat (instead of appending to the existing X_train) so that
# re-running this cell does not keep adding duplicate dummy columns.
X_train = pd.concat(
    [train[feature]]
    + [pd.get_dummies(train[name], prefix=name) for name in cat_feature],
    axis=1,
)

In [None]:
# One-hot encode the test split the same way, rebuilding from `test[feature]`
# so that re-running this cell does not duplicate dummy columns.
X_test = pd.concat(
    [test[feature]]
    + [pd.get_dummies(test[name], prefix=name) for name in cat_feature],
    axis=1,
)
# Align the test columns (set and order) with the training matrix: a category
# level missing from the test split becomes an all-zero column, and a level
# never seen in training is dropped. Assumes X_train has already been one-hot
# encoded by the preceding cell.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Sanity-check the shapes of the model inputs and the target.
for frame in (X_train, y_train, X_test):
    print(frame.shape)

In [None]:
from sklearn.metrics import mean_squared_log_error,make_scorer

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge

In [None]:
# Wrap mean_squared_log_error so it can be passed as `scoring=` to
# cross-validation helpers.
# NOTE(review): make_scorer is called with its defaults here; for an error
# metric, confirm whether greater_is_better=False is needed before using this
# scorer for model selection.
msle_scorer = make_scorer(mean_squared_log_error)

In [None]:
# (Stray keyboard input removed: the bare token in this cell raised a
# NameError when the notebook was run top to bottom.)

In [160]:
# log(1 + count): log-transformed copy of the training target.
y_train_log = np.log1p(y_train)

In [161]:
# Cross-validate Ridge on the log1p-transformed target. Squared error between
# log1p(y) and the model's log-space prediction is the squared log error on
# the original counts, so the built-in 'neg_mean_squared_error' scorer yields
# the (negated) MSLE directly.
#
# Previously this cell scored raw `count` predictions with `msle_scorer` and
# failed with "ValueError: Mean Squared Logarithmic Error cannot be used when
# targets contain negative values.", because a linear model fitted on raw
# counts can predict negative values.
ridge = Ridge()
# The scorer returns negated errors (higher is better), so flip the sign back
# to report the mean cross-validated MSLE.
-cross_val_score(
    ridge, X_train, y_train_log, cv=5, scoring='neg_mean_squared_error'
).mean()

- 분류.score: 정확도
- 회귀.score: 결정계수(R^2)

In [162]:
# The old line used `=` and so rebound `mean_squared_log_error` to a tuple
# instead of calling it; it also used an unfitted `ridge` (NotFittedError),
# compared `y_train` with test-set predictions, and inverted log1p with
# np.exp instead of np.expm1.

# Fit on the log1p-transformed target before predicting.
ridge.fit(X_train, y_train_log)

# Undo the log1p transform with expm1 and clip at 0: rental counts cannot be
# negative, and MSLE rejects negative values.
train_pred = np.clip(np.expm1(ridge.predict(X_train)), 0, None)

# Predictions for the unlabeled test split (there is no ground truth to score
# them against). Assumes X_test has the same columns, in the same order, as
# X_train.
test_pred = np.clip(np.expm1(ridge.predict(X_test)), 0, None)

# MSLE on the training data, measured on the original count scale.
mean_squared_log_error(y_train, train_pred)