-
Notifications
You must be signed in to change notification settings - Fork 0
/
returns_1.R
143 lines (98 loc) · 3.93 KB
/
returns_1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
install.packages("gitignore")
library(gitignore)
# print git ignore template
gi_fetch_templates("R")
#note: generate personal access token to push repo
install.packages("writexl")
# load tidyverse and it's libraries
library(tidyverse)
library(readxl)
library(dplyr)
library(tidyr)
library(writexl)
# for data cleaning
library(janitor)
# check working directory
getwd()
# check the excel sheet names from excel file
readxl::excel_sheets("/Users/marie-louise/Documents/sites/quarterly_release_y:e_sept_20/returns-datasets-sep-2020.xlsx")
# dataframe_name <- readxl::read_excel(file_path)
# or - readxl::excel_sheets(file_path)
raw_data_1 <- readxl::read_excel("/Users/marie-louise/Documents/sites/quarterly_release_y:e_sept_20/returns-datasets-sep-2020.xlsx",
sheet = "Data - Ret_D01")
# open file in viewer
View(raw_data_1)
# clean column names (use gsub on work laptop)
clean_raw_data <- janitor::clean_names(raw_data_1)
# open file in viewer
View(clean_raw_data)
# inspect data
# check column / field names
names(clean_raw_data)
# filter data
# %>% pipe operator, shortcut = Cmd + Shift + M, to chain sequences together
filtered_data <- clean_raw_data %>%
# select variables / column names
select(quarter,
return_type_group,
return_type,
number_of_returns) %>%
# filter on all the selected vars, check if any vars meet the condition, detect the pattern in a string
filter_all(any_vars(str_detect(.,pattern = ('2010|2011|2012|2013|2014|2015|2016|2017|2018|2019|2020'))))
# View filtered data
View(filtered_data)
# double check var names
names(filtered_data)
# Manual checking:
# for each column/ var except number_of_returns , use unique() to eliminate duplicate values
# use $ to access var name from inside the list (instead of [1])
unique(filtered_data$return_type_group)
unique(filtered_data$quarter)
unique(filtered_data$return_type)
# In return_type "Non-detained enforced removals" and "Non-detained Enforced removals"
# are similar except one has a capital E
# use mutate(), case_when() vectorises multiple if else statements
# ~ is used to separate tow arguments in a function
mutated_data <- filtered_data %>%
mutate(return_type=case_when(return_type=="Non-detained enforced removals" ~ "Non-detained Enforced removals",
TRUE~ return_type))
# double check, there should be "Non-detained Enforced removals" with 7 other vars
unique(mutated_data$return_type)
dplyr::count(mutated_data, return_type[1], sort = TRUE)
dplyr::summarise(.data = mutated_data,
total_rows = sum(return_type, na.rm = TRUE))
# check names
names(mutated_data)
View(mutated_data)
# filter 1
filter_1 <- mutated_data %>%
select( quarter,return_type_group, number_of_returns) %>%
group_by(quarter,return_type_group) %>%
summarise(Total = sum(number_of_returns))
View(filter_1)
# in the summary tables, two columns / vars have been merged together (return_type_group and return_type )
column_sums_1 <- mutated_data %>%
# group_by() to take existing table and group using the defined vars
group_by(quarter,return_type_group) %>%
# summarise() to group data created by group_by()
summarise(values = sum(number_of_returns)) %>%
# Rename the label return_type_group to row_labels
rename(row_labels = return_type_group)
View(column_sums_1)
# Find the sum of quarter,return_type
# return_type is a subset of return_type_group
column_sums_2 <- mutated_data %>%
group_by(quarter,return_type) %>%
summarise(values = sum(number_of_returns)) %>%
# Rename the label return_type to row_labels
rename(row_labels= return_type)
View(column_sums_2)
# combine both data sets
new_data <- union(column_sums_1,column_sums_2)
View(new_data)
# spread() to reshape the data and convert the quarter row into a column
formatted_data <- new_data %>%
spread(quarter, values)
View(formatted_data)
# export the data
write_xlsx(formatted_data, "summary_data_R.xlsx")