-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_cleaned.Rmd
97 lines (83 loc) · 2.38 KB
/
data_cleaned.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
---
title: "Data Cleaning Process"
author: "Qixiang Chen & Paulina Han"
date: "11/27/2021"
output: html_document
---
```{r}
library(tidyverse)
library(ggplot2)
library(rvest)
library(plotly)
library(ggridges)
library(readxl)
library(patchwork)
library(httr)
library(lubridate)
library(p8105.datasets)
library(rnoaa)
```
```{r}
# Read the raw bus breakdown/delay CSV and pass it through
# janitor::clean_names() to standardise the column names.
dataframe <- janitor::clean_names(
  read_csv("data/Bus_Breakdown_and_Delays.csv")
)
```
```{r}
# Add date variables derived from `occurred_on`.
#
# `occurred_on` is split positionally: its first 11 characters become
# `occur_date` and the remainder becomes `occur_time`.
# NOTE(review): this assumes values look like "MM/DD/YYYY HH:MM:SS AM"
# (zero-padded month and day) -- confirm against the raw CSV.
#
# Columns produced:
#   year        - four-digit year as a character string, without surrounding
#                 whitespace; NA when the date cannot be parsed
#   occur_date  - Date parsed as month/day/year
#   month       - English month name ("January" ... "December")
#   date_noYear - "MM/DD" as a character string
#   occur_time  - remainder of `occurred_on` after the date part
#   day         - English weekday name ("Monday" ... "Sunday"), last column
#
# Month and weekday names come from fixed English lookups, so the result does
# not depend on the LC_TIME locale of the machine knitting the document and no
# global locale setting is modified.
df_date_sep <-
  dataframe %>%
  separate(occurred_on, into = c("occur_date", "occur_time"), sep = 11) %>%
  mutate(
    occur_date = lubridate::mdy(occur_date),
    # Numeric format codes (%Y, %m, %d) are locale-independent.
    year = format(occur_date, "%Y"),
    # month.name is base R's constant vector of English month names.
    month = month.name[lubridate::month(occur_date)],
    date_noYear = format(occur_date, "%m/%d"),
    # POSIXlt's `wday` counts 0 (Sunday) to 6 (Saturday); shift to 1-based
    # indexing into a fixed English vector.
    day = c(
      "Sunday", "Monday", "Tuesday", "Wednesday",
      "Thursday", "Friday", "Saturday"
    )[as.POSIXlt(occur_date)$wday + 1L]
  ) %>%
  relocate(year, occur_date, month, date_noYear)
```
```{r}
# Manhattan data only: keep the rows whose `boro` equals "Manhattan".
df_date_manhattan <- filter(df_date_sep, boro == "Manhattan")
```
```{r}
# Weather data from rnoaa for station USW00094728, using Central Park to
# represent Manhattan. Requests PRCP, SNOW, SNWD, TMAX and TMIN between
# 2018-09-05 and 2021-11-18.
weather_df <-
  rnoaa::meteo_pull_monitors(
    monitors = "USW00094728",
    date_min = "2018-09-05",
    date_max = "2021-11-18",
    var = c("PRCP", "SNOW", "SNWD", "TMAX", "TMIN")
  ) %>%
  mutate(
    # Human-readable station label.
    name = recode(id, USW00094728 = "CentralPark_NY"),
    # Scale both temperature columns by 1/10.
    # NOTE(review): presumably converts tenths of a degree to degrees --
    # confirm against the NOAA variable documentation.
    across(c(tmin, tmax), ~ .x / 10)
  ) %>%
  # Rename the date column so it matches the join key in the bus data.
  rename(occur_date = date) %>%
  # Put the station label and id first; the remaining columns keep their order.
  relocate(name, id)
```
```{r}
# Final data set: attach the weather observations to every Manhattan bus
# record, matching on `occur_date` (all bus rows are kept).
bus_weather_df <-
  df_date_manhattan %>%
  left_join(weather_df, by = "occur_date")

# Write the merged data to a CSV file.
write_csv(bus_weather_df, "data/clean_data.csv")
```