/
0-process-swissgis-data.R
95 lines (76 loc) · 2.78 KB
/
0-process-swissgis-data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# ----------------------------------------------------------------------------------------
# Authors: GeoBeer Team, @geobeer_ch
# Created: 2019-03
# License: GNU General Public License v3.0
# Repository: https://github.com/GeoBeer/geobeer-analytics
# More info: https://geobeer.github.io/geobeer-analytics, http://www.geobeer.ch
# ----------------------------------------------------------------------------------------
if (!require(here)) {
install.packages('here')
require(here)
}
if (!require(tidyverse)) {
install.packages('tidyverse')
require(tidyverse)
}
if (!require(magrittr)) {
install.packages('magrittr')
require(magrittr)
}
if (!require(readr)) {
install.packages('readr')
require(readr)
}
if (!require(gender)) {
install.packages('gender')
require(gender)
}
if (!require(genderdata)) {
install.packages('genderdata')
require(genderdata)
}
# Load the helper functions defined in functions.R at the project root.
source(here("functions.R"))

# Change into the private SwissGIS data folder so that the input file below
# can be addressed by its bare file name.
setwd(here("..", "geobeer-private-data", "SwissGIS"))

# Read the semicolon-separated SwissGIS user table.
data <- read_delim(file = "SwissGIS-user-table.csv", delim = ";")

# Overwrite the headers read from the file with fixed column names.
names(data) <- c(
  "name", "account", "profile_img", "language", "account_age",
  "followers", "following", "fo_fr_ratio",
  "group_followers", "group_following", "group_fo_fr_ratio",
  "tweets", "tweets_per_month"
)

# gender_df() (called further down) expects a year range per row; give every
# record the same fixed 1950-2000 range.
data <- mutate(data, min_year = 1950, max_year = 2000)
# Extract the first name, i.e. the first whitespace-separated token, from
# full names.
#
# Args:
#   x: Character vector of full names, e.g. "Jane Doe". Usually called with a
#      single name, but any length is accepted.
#
# Returns:
#   Character vector with one element per element of `x`, holding the first
#   token of each name. NA_character_ is returned for names that are NA,
#   empty or consist of whitespace only.
fn_name_from_fullname <- function(x) {
  # Trim before splitting: a leading blank would otherwise make strsplit()
  # return "" as the first token. Splitting on runs of any whitespace also
  # copes with tabs and repeated blanks between name parts.
  tokens <- strsplit(trimws(x), "[[:space:]]+")
  vapply(
    tokens,
    function(parts) if (length(parts) > 0) parts[1] else NA_character_,
    character(1),
    USE.NAMES = FALSE
  )
}
# Derive a first name for every record from its full name.
data[["firstname"]] <- as.character(
  lapply(data[["name"]], fn_name_from_fullname)
)

# Predict the gender for each first name within the given year range and keep
# only the two columns needed for the join below.
gender_data <- select(
  gender_df(data, name_col = "firstname", year_col = c("min_year", "max_year")),
  c(name, gender)
)

# Attach the predicted gender to the main data set.
data <- left_join(data, gender_data, by = c("firstname" = "name"))

# Records for which no gender could be determined automatically (not used
# again in this script).
not_automatically_classifiable <- filter(data, is.na(gender))

# The helper year columns are no longer needed.
data[["min_year"]] <- NULL
data[["max_year"]] <- NULL

# Second classification pass using a manually curated firstname > gender list.
# classify_additional_names() is not defined in this script (presumably it
# comes from functions.R).
aggregated_data <- classify_additional_names(data)

# Everything that is still unclassified is labelled 'unknown or unapplicable'
# (most of these records are companies and other institutions).
aggregated_data$gender[is.na(aggregated_data$gender)] <-
  "unknown or unapplicable"

# Remove temporary data.
rm(gender_data)

# Save the resulting data to disk.
write_csv(
  aggregated_data,
  here(
    "..", "geobeer-private-data", "SwissGIS",
    "swissgis-aggregated-data.csv"
  )
)

# Count the records per gender, add each gender's share in percent and save
# the result to disk.
gender_stats_data <- mutate(
  summarise(group_by(aggregated_data, gender), count = n()),
  percentage = count / sum(count) * 100
)
write_csv(gender_stats_data, here("Results", "swissgis-gender-stats.csv"))