In [None]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(randomForest)

# Input data files live in the read-only "../input/" directory.
# Running this cell (click run or press Shift+Enter) lists every file under it.

list.files("../input")

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training and test sets from the read-only input directory.
train <- read.csv("../input/bike-sharing-demand/train.csv")
test <- read.csv("../input/bike-sharing-demand/test.csv")

# Preview the first rows of the training data.
head(train)

In [None]:
# Recode the categorical columns of the training set as factors.
factor_cols <- c("season", "weather", "holiday", "workingday")
train[factor_cols] <- lapply(train[factor_cols], as.factor)
head(train)

In [None]:
# Apply the same factor recoding to the test set.
for (col in c("season", "weather", "holiday", "workingday")) {
  test[[col]] <- as.factor(test[[col]])
}
head(test)

In [None]:
# Add zero-filled placeholder target columns to the test set so that it has the
# same columns as train and the two frames can be stacked.
test[c("casual", "registered", "count")] <- 0

data <- rbind(train, test)

# Inspect the structure and summary statistics of the combined frame.
str(data)
summary(data)

In [None]:
# Extract the hour (characters 12-13 of datetime) and store it as a factor.
train$hour <- as.factor(substr(train$datetime, 12, 13))

test$hour <- as.factor(substr(test$datetime, 12, 13))

head(train)

In [None]:
# Distribution of total rentals for each hour.
boxplot(count ~ hour, data = train, xlab = "hour", ylab = "count of users")

In [None]:
# Same view for casual users only.
boxplot(casual ~ hour, data = train, xlab = "hour", ylab = "casual users")

In [None]:
# Same view for registered users only.
boxplot(registered ~ hour, data = train, xlab = "hour", ylab = "registered users")

In [None]:
# Check which days of the month occur in the training set.
date <- substr(train$datetime, 9, 10)
train$date <- as.numeric(date)
hist(train$date, xlim = c(1, 31))

In [None]:
# Check which days of the month occur in the test set.
date <- substr(test$datetime, 9, 10)
test$date <- as.numeric(date)
hist(test$date, xlim = c(1, 31))

In [None]:
# Derive the day of the week for every record.
# weekdays() returns names in the session locale, but the weekend flag built
# later in this script compares against the English names "Sunday" and
# "Saturday". Indexing a fixed English vector by POSIXlt's wday field
# (0 = Sunday) gives the same labels as an English locale, independent of the
# locale the notebook actually runs in.
day_names <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)

date <- substr(train$datetime, 1, 10)
days <- day_names[as.POSIXlt(as.Date(date))$wday + 1]
train$day <- days

date <- substr(test$datetime, 1, 10)
days <- day_names[as.POSIXlt(as.Date(date))$wday + 1]
test$day <- days

In [None]:
# Registered users by day of the week.
boxplot(registered ~ day, data = train, xlab = "day", ylab = "registered users")

In [None]:
# Casual users by day of the week.
boxplot(casual ~ day, data = train, xlab = "day", ylab = "casual users")

In [None]:
# Registered users by weather category.
boxplot(registered ~ weather, data = train, xlab = "weather", ylab = "registered users")

In [None]:
# Casual users by weather category.
boxplot(casual ~ weather, data = train, xlab = "weather", ylab = "casual users")

In [None]:
# Registered users by temperature value.
boxplot(registered ~ temp, data = train, xlab = "temp", ylab = "registered users")

In [None]:
# Casual users by temperature value.
boxplot(casual ~ temp, data = train, xlab = "temp", ylab = "casual users")

In [None]:
# Extract the year (characters 1-4 of datetime) and store it as a factor.
train$year <- as.factor(substr(train$datetime, 1, 4))

test$year <- as.factor(substr(test$datetime, 1, 4))

In [None]:
# Registered users by year.
boxplot(registered ~ year, data = train, xlab = "year", ylab = "registered users")

In [None]:
# Casual users by year.
boxplot(casual ~ year, data = train, xlab = "year", ylab = "casual users")

In [None]:
# Registered users by wind speed.
# The x axis was labelled "year" (copied from the previous plots); the grouping
# variable here is windspeed, so label it accordingly.
boxplot(train$registered ~ train$windspeed, xlab = "windspeed", ylab = "registered users")

In [None]:
# Casual users by wind speed (same label fix).
boxplot(train$casual ~ train$windspeed, xlab = "windspeed", ylab = "casual users")

In [None]:
# Registered users by humidity value.
boxplot(registered ~ humidity, data = train, xlab = "humidity", ylab = "registered users")

In [None]:
# Casual users by humidity value.
boxplot(casual ~ humidity, data = train, xlab = "humidity", ylab = "casual users")

In [None]:
# Convert the hour factor to an integer column in both sets.
# NOTE: as.integer() on a factor returns the 1-based level codes, not the
# parsed labels. If every hour "00".."23" occurs as a level, the stored value
# is hour + 1. The hour thresholds used further down operate on these codes.
train[["hour"]] <- as.integer(train[["hour"]])
test[["hour"]] <- as.integer(test[["hour"]])

# Stack train and test again so the combined frame carries the new columns.
data <- rbind(train, test)

In [None]:
# Explore how registered demand depends on the hour.
# rpart (from the rpart package) stands for "recursive partitioning and
# regression trees".
d <- rpart(formula = registered ~ hour, data = train)
d
# fancyRpartPlot() from the rattle package draws the fitted decision tree.
fancyRpartPlot(d)

In [None]:

# Bin the hour column into demand periods for registered users.
# Later assignments override earlier ones, exactly as listed.
data <- rbind(train, test)
data$dp_reg <- with(data, {
  period <- rep(0, length(hour))
  period[hour < 8] <- 1
  period[hour >= 22] <- 2
  period[hour > 9 & hour < 18] <- 3
  period[hour == 8] <- 4
  period[hour == 9] <- 5
  period[hour %in% c(20, 21)] <- 6
  period[hour %in% c(18, 19)] <- 7
  period
})

In [None]:
# Fit and draw a regression tree of casual demand on the hour.
d <- rpart(formula = casual ~ hour, data = train)
d
fancyRpartPlot(d)

In [None]:
# Bin the hour column into demand periods for casual users.
data$dp_cas <- with(data, {
  period <- rep(0, length(hour))
  period[hour <= 8] <- 1
  period[hour == 9] <- 2
  period[hour >= 10 & hour <= 19] <- 3
  period[hour > 19] <- 4
  period
})

In [None]:
# Explore how registered demand depends on temperature.
f <- rpart(formula = registered ~ temp, data = train)
f
fancyRpartPlot(f)

In [None]:
# Bin temperature for the registered-user model:
# below 13 -> 1, [13, 23) -> 2, [23, 30) -> 3, 30 and above -> 4.
data$temp_reg <- with(data, {
  band <- rep(0, length(temp))
  band[temp < 13] <- 1
  band[temp >= 13 & temp < 23] <- 2
  band[temp >= 23 & temp < 30] <- 3
  band[temp >= 30] <- 4
  band
})

In [None]:
# Fit and draw a regression tree of casual demand on temperature.
f <- rpart(formula = casual ~ temp, data = train)
f
fancyRpartPlot(f)

In [None]:
# Bin temperature for the casual-user model:
# below 15 -> 1, [15, 23) -> 2, [23, 30) -> 3, 30 and above -> 4.
data$temp_cas <- with(data, {
  band <- rep(0, length(temp))
  band[temp < 15] <- 1
  band[temp >= 15 & temp < 23] <- 2
  band[temp >= 23 & temp < 30] <- 3
  band[temp >= 30] <- 4
  band
})

In [None]:
# Number the quarters of the two years 1-8: 2011 Q1-Q4 -> 1-4, 2012 Q1-Q4 -> 5-8.
# Rows whose year is neither "2011" nor "2012" stay NA.
data$month <- as.numeric(substr(data$datetime, 6, 7))
data$year_part <- with(data, {
  part <- rep(NA_real_, length(year))
  in_2011 <- year == "2011"
  in_2012 <- year == "2012"
  part[in_2011] <- 1
  part[in_2011 & month > 3] <- 2
  part[in_2011 & month > 6] <- 3
  part[in_2011 & month > 9] <- 4
  part[in_2012] <- 5
  part[in_2012 & month > 3] <- 6
  part[in_2012 & month > 6] <- 7
  part[in_2012 & month > 9] <- 8
  part
})
table(data$year_part)

In [None]:
# Add a day_type feature derived from the holiday and workingday flags.
# Rows matching none of the rules keep the initial value, which becomes the
# string "0" once the labels are assigned.
data$day_type <- with(data, {
  kind <- rep(0, length(holiday))
  kind[holiday == 0 & workingday == 0] <- "weekend"
  kind[holiday == 1] <- "holiday"
  kind[holiday == 0 & workingday == 1] <- "working day"
  kind
})

In [None]:
# Split the combined frame back by day of month: rows dated before the 20th
# form the training set, rows dated the 20th or later form the test set.
day_of_month <- as.integer(substr(data$datetime, 9, 10))
train <- data[day_of_month < 20, ]
test <- data[day_of_month > 19, ]

In [None]:
# Scatter plot of total rentals against temperature.
plot(x = train$temp, y = train$count)

In [None]:
# Count how many rows report a wind speed of exactly zero.
table(data$windspeed == 0)

In [None]:
# Treat zero wind speed as a missing reading: separate the zero rows (wind_0)
# from the non-zero rows (wind_1).
k <- data$windspeed == 0
wind_0 <- data[which(k), ]
wind_1 <- data[which(!k), ]

In [None]:
# Predict wind speed for the zero-reading rows with a random forest trained on
# the rows that have a non-zero reading.
set.seed(415)
fit <- randomForest(
  windspeed ~ season + weather + humidity + month + temp + year + atemp,
  data = wind_1,
  importance = TRUE,
  ntree = 250
)
pred <- predict(fit, newdata = wind_0)
wind_0$windspeed <- pred

In [None]:
# Recombine the data. The imputed rows (wind_0) come first, so the row order of
# `data` differs from what it was before the split.
data <- rbind(wind_0, wind_1)

In [None]:
# Create a weekend flag: 1 when day is "Sunday" or "Saturday", otherwise 0.
data$weekend <- as.numeric(data$day %in% c("Sunday", "Saturday"))
str(data)

In [None]:
# For the randomForest models, convert every categorical column to a factor.
data$season <- as.factor(data$season)
data$holiday <- as.factor(data$holiday)
data$workingday <- as.factor(data$workingday)
data$weather <- as.factor(data$weather)
data$hour <- as.factor(data$hour)
data$month <- as.factor(data$month)
# dp_cas / dp_reg hold arbitrary bin labels, so convert them in place. The
# original code left both numeric and stored the factor versions under other
# names instead.
data$dp_cas <- as.factor(data$dp_cas)
data$dp_reg <- as.factor(data$dp_reg)
# day_part was a factor copy of dp_cas; keep the column for compatibility.
data$day_part <- data$dp_cas
# Fix: this line used to assign as.factor(data$dp_reg), which overwrote the
# weekend / holiday / working day feature with a duplicate of dp_reg.
data$day_type <- as.factor(data$day_type)
data$day <- as.factor(data$day)
data$temp_cas <- as.factor(data$temp_cas)
data$temp_reg <- as.factor(data$temp_reg)

In [None]:
# Re-split after feature engineering: days 1-19 of each month are training
# rows, the 20th onward are test rows.
is_train_day <- as.integer(substr(data$datetime, 9, 10)) < 20
train <- data[is_train_day, ]
test <- data[!is_train_day, ]

In [None]:
# Log-transform the targets. One is added first so that zero counts are defined.
train[["reg1"]] <- train[["registered"]] + 1
train[["cas1"]] <- train[["casual"]] + 1
train[["logcas"]] <- log(train[["cas1"]])
train[["logreg"]] <- log(train[["reg1"]])
# Placeholder columns on the test set; they are filled with predictions later.
test[["logreg"]] <- 0
test[["logcas"]] <- 0

In [None]:
# Log of registered users by weather category.
boxplot(logreg ~ weather, data = train, xlab = "weather", ylab = "registered users")

In [None]:
# Log of registered users by season.
boxplot(logreg ~ season, data = train, xlab = "season", ylab = "registered users")

In [None]:
# Registered-user model: random forest on the log-transformed target.
set.seed(415)
fit1 <- randomForest(
  logreg ~ hour + workingday + day + holiday + day_type + temp_reg +
    humidity + atemp + windspeed + season + weather + dp_reg + weekend +
    year + year_part,
  data = train,
  importance = TRUE,
  ntree = 250
)

# Store the (log-scale) predictions on the test set.
pred1 <- predict(fit1, newdata = test)
test$logreg <- pred1

In [None]:
# Casual-user model: random forest on the log-transformed target.
set.seed(415)
fit2 <- randomForest(
  logcas ~ hour + day_type + day + humidity + atemp + temp_cas + windspeed +
    season + weather + holiday + workingday + dp_cas + weekend + year +
    year_part,
  data = train,
  importance = TRUE,
  ntree = 250
)

# Store the (log-scale) predictions on the test set.
pred2 <- predict(fit2, newdata = test)
test$logcas <- pred2

In [None]:
# The predictions are on the log(x + 1) scale, so invert the transform and add
# the two user groups to get the total count.
test[["registered"]] <- exp(test[["logreg"]]) - 1
test[["casual"]] <- exp(test[["logcas"]]) - 1
test[["count"]] <- test[["casual"]] + test[["registered"]]

In [None]:
# Save the submission file.
s <- data.frame(datetime = test$datetime, count = test$count)
# The earlier rbind(wind_0, wind_1) moved the zero-windspeed rows to the front,
# so the rows are no longer in chronological order. Restore it before writing
# (presumably this is also the row order of test.csv -- confirm).
# Zero-padded "YYYY-MM-DD HH:MM:SS" strings sort chronologically as text;
# method = "radix" keeps the sort independent of the locale's collation.
s <- s[order(as.character(s$datetime), method = "radix"), ]
rownames(s) <- NULL
write.csv(s, file = "submit.csv", row.names = FALSE)

In [None]:
#' Show a "Download CSV" link for a data frame in the notebook output.
#'
#' Serialises `dataframe` to CSV text with write.csv() (no row names, quoted),
#' base64-encodes it with RCurl::base64(), and displays an HTML anchor whose
#' href is a data: URI via IRdisplay::display_html().
#'
#' @param dataframe Object to serialise with write.csv().
#' @param filename Name placed in the link's `download` attribute. Defaults to
#'   "dataframe.csv", the value that used to be hard-coded. It is inserted
#'   into the HTML as-is, so it must not contain double quotes.
#' @return The value of IRdisplay::display_html(); called for its side effect.
download.csv <- function(dataframe, filename = "dataframe.csv") {
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  csv.vector <- capture.output(
    write.csv(dataframe, row.names = FALSE, quote = TRUE)
  )
  csv.string <- paste(csv.vector, collapse = "\n")
  csv.base64 <- as.character(RCurl::base64(csv.string))
  csv.html <- paste0(
    "<a download=\"", filename, "\" href=\"data:text/csv;base64,",
    csv.base64,
    "\" target=\"_blank\">Download CSV</a>"
  )
  IRdisplay::display_html(csv.html)
}
download.csv(s)