# **Install and load packages**

In [None]:
# Packages this analysis depends on. plyr is listed first on purpose: when
# plyr is attached after dplyr (part of tidyverse) it masks dplyr verbs such
# as mutate() and summarise(), and plyr itself warns about that on attach.
required_packages <- c(
  "plyr", "tidyverse", "skimr", "janitor", "tidyr", "ggplot2", "lubridate"
)

# Install only what is missing so that re-running this cell does not
# download and rebuild every package again.
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(plyr)
library(tidyverse)
library(skimr)
library(janitor)
library(tidyr)
library(ggplot2)
library(lubridate)

# **Collecting data and creating variables for each month**

In [None]:
# Load the twelve monthly trip exports for 2021, one table per month.
# The paths are relative, so the CSV files must be in the working directory.
month_1  <- readr::read_csv(file = "202101-divvy-tripdata.csv")
month_2  <- readr::read_csv(file = "202102-divvy-tripdata.csv")
month_3  <- readr::read_csv(file = "202103-divvy-tripdata.csv")
month_4  <- readr::read_csv(file = "202104-divvy-tripdata.csv")
month_5  <- readr::read_csv(file = "202105-divvy-tripdata.csv")
month_6  <- readr::read_csv(file = "202106-divvy-tripdata.csv")
month_7  <- readr::read_csv(file = "202107-divvy-tripdata.csv")
month_8  <- readr::read_csv(file = "202108-divvy-tripdata.csv")
month_9  <- readr::read_csv(file = "202109-divvy-tripdata.csv")
month_10 <- readr::read_csv(file = "202110-divvy-tripdata.csv")
month_11 <- readr::read_csv(file = "202111-divvy-tripdata.csv")
month_12 <- readr::read_csv(file = "202112-divvy-tripdata.csv")

# **Merging all the previous months in one single dataset and cleaning the data**

In [None]:
# Stack the twelve monthly tables into a single data set.
alldata <- rbind(
  month_1, month_2, month_3, month_4, month_5, month_6,
  month_7, month_8, month_9, month_10, month_11, month_12
)

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(alldata)
}

## Adding two new columns for the ride length and the day of the week.

# English weekday names indexed by the POSIXlt `wday` component (0 = Sunday).
# weekdays() returns names in the current locale, while the later ordering
# step hard-codes the English names; this lookup keeps the two in agreement
# regardless of locale.
weekday_names <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)

# `ride_lenght` (sic) keeps its original spelling because the rest of the
# script refers to the column by that name. The data frame is supplied by the
# pipe only; it must not be repeated as an argument to mutate().
new_data <- alldata %>%
  dplyr::mutate(
    ride_lenght = ended_at - started_at,
    day_of_week = weekday_names[as.POSIXlt(started_at)$wday + 1L]
  )

# Check the column names to make sure the new columns were created properly.
colnames(new_data)

# Earliest start time: checks whether any ride starts before 2021-01-01.
min_ride <- min(new_data$started_at)
min_ride

# Latest end time: it falls after the year being analysed.
max_ride <- max(new_data$ended_at)

# Display the ride(s) with the latest end time to decide if it is an outlier.
new_data[new_data$ended_at == max_ride, ]

# The bike was taken out on 2021-12-31 and returned on 2022-01-03, therefore
# this row is valid.

In [None]:
# Keep only the rows that have a value in every column.
new_data1 <- tidyr::drop_na(new_data)

# Summary of the cleaned table, used to confirm the incomplete rows are gone.
summary(new_data1)

# Second check: TRUE here would mean at least one NA is still present.
anyNA(new_data1)

# The summary shows a negative ride length; display the row(s) that hold the
# minimum value.
new_data1 %>%
  dplyr::filter(ride_lenght == min(ride_lenght))

# The wrong value itself, before it is removed.
min(new_data1$ride_lenght)

# Keep strictly positive ride lengths only (zero-length rides are dropped too).
new_data2 <- dplyr::filter(new_data1, ride_lenght > 0)

# Verify that the negative value is no longer present.
summary(new_data2)

```

In [None]:
# Derive calendar columns from the ride start time: the date itself, then
# month, day and year as zero-padded strings, and the start hour ("00"-"23").
new_data2 <- new_data2 %>%
  dplyr::mutate(
    date = as.Date(started_at),
    month = format(date, "%m"),
    day = format(date, "%d"),
    year = format(date, "%Y"),
    hour = format(started_at, format = "%H")
  )
View(new_data2)

In [None]:
# Convert ride_lenght from a difftime to plain numeric seconds.
# as.numeric() on a difftime honours `units`, so no as.difftime() wrapper is
# needed. The guard makes the cell safe to re-run: once the column is numeric,
# as.difftime() would stop with "need explicit units for numeric conversion".
if (inherits(new_data2$ride_lenght, "difftime")) {
  new_data2$ride_lenght <- as.numeric(new_data2$ride_lenght, units = "secs")
}
str(new_data2)

# Ride-length statistics in minutes (seconds / 60).
mean(new_data2$ride_lenght / 60)
median(new_data2$ride_lenght / 60)
max(new_data2$ride_lenght / 60)

# Shortest ride, in seconds.
min(new_data2$ride_lenght)

# Display the row(s) that hold the shortest ride.
new_data2[new_data2$ride_lenght == min(new_data2$ride_lenght), ]


summary(new_data2)

# The same statistics in seconds.
mean(new_data2$ride_lenght)
median(new_data2$ride_lenght)
max(new_data2$ride_lenght)
min(new_data2$ride_lenght)

# Confirm the column is now numeric.
is.numeric(new_data2$ride_lenght)

```


```{r}

str(new_data2)

# Ride-length statistics per rider type (member vs casual).
# Every formula below takes its variables from new_data2 via `data =`.
# Mixing in columns from new_data1 (which still holds the rows that were
# filtered out) gives vectors of different lengths for one model frame.
# Using `data =` also yields clean column names ("member_casual" rather than
# "new_data2$member_casual") in the printed tables and exported files.
aggregate(ride_lenght ~ member_casual, data = new_data2, FUN = mean)
aggregate(ride_lenght ~ member_casual, data = new_data2, FUN = median)
aggregate(ride_lenght ~ member_casual, data = new_data2, FUN = max)
aggregate(ride_lenght ~ member_casual, data = new_data2, FUN = min)

# Put the weekdays in calendar order (Sunday first) before aggregating, so
# the per-day table is sorted by weekday rather than alphabetically.
new_data2$day_of_week <- ordered(
  new_data2$day_of_week,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  )
)

# Average ride length by rider type and day of the week.
counts <- aggregate(
  ride_lenght ~ member_casual + day_of_week,
  data = new_data2, FUN = mean
)
counts

# Average ride length by rider type and start hour.
counts4 <- aggregate(
  ride_lenght ~ member_casual + hour,
  data = new_data2, FUN = mean
)
counts4

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(new_data2)
}

# Average ride length by rider type and calendar date.
counts2 <- aggregate(
  ride_lenght ~ member_casual + date,
  data = new_data2, FUN = mean
)

# Average ride length by rider type and month.
counts3 <- aggregate(
  ride_lenght ~ member_casual + month,
  data = new_data2, FUN = mean
)

# Average ride length by rider type and end station.
counts6 <- aggregate(
  ride_lenght ~ member_casual + end_station_id,
  data = new_data2, FUN = mean
)

# Export each table to its own file. Writing all of them to the same path
# would leave only the last table on disk.
output_dir <- "C:/Users/naili/OneDrive/Documents"

write.csv(counts, file = file.path(output_dir, "avg_ride_by_day_of_week.csv"))
write.csv(counts2, file = file.path(output_dir, "avg_ride_by_date.csv"))
write.csv(counts3, file = file.path(output_dir, "avg_ride_by_month.csv"))
write.csv(counts4, file = file.path(output_dir, "avg_ride_by_hour.csv"))
write.csv(counts6, file = file.path(output_dir, "avg_ride_by_end_station.csv"))
