/
034.R
214 lines (166 loc) · 8.03 KB
/
034.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# !!! Run once if you haven't already installed:
install.packages("Hmisc") # miscellaneous statistical functions
# Run every time
library(tidyverse)
library(magrittr)
# Datasets from The Analysis of Biological Data by Whitlock and Schluter
# https://whitlockschluter.zoology.ubc.ca/data
hemoglobin_frame <- read.csv(file="https://whitlockschluter.zoology.ubc.ca/wp-content/data/chapter02/chap02e3cHumanHemoglobinElevation.csv")
hemoglobin_frame$population <- as.factor(hemoglobin_frame$population)
usa_hemoglobin <- filter(hemoglobin_frame, population == "USA")
# ---------------
# Visualizing distributions
# ---------------
# Histograms are a variation on bar plots that understand "binning"
# Bins are used to assign the data into size categories
ggplot(usa_hemoglobin, aes(hemoglobin)) + # can just list dataset rather than "data = dataset"
geom_histogram(binwidth = 0.1)
ggplot(usa_hemoglobin, aes(hemoglobin)) +
geom_histogram(bins = 50) # 30 bins is the default
ggplot(usa_hemoglobin, aes(hemoglobin)) +
geom_histogram(binwidth = 0.5)
ggplot(usa_hemoglobin, aes(hemoglobin)) +
geom_histogram(binwidth = 1)
# Can use color aesthetic to plot multiple distributions
ggplot(hemoglobin_frame, aes(hemoglobin)) +
geom_histogram(aes(color = population), fill = "NA", binwidth = 0.5) # NA makes the bars transparent
# Density plot is better when there are plenty of data and the curve is smooth
# plot a single level (US)
ggplot(usa_hemoglobin, aes(hemoglobin)) +
geom_density()
# plot multiple levels of the factor
ggplot(hemoglobin_frame, aes(hemoglobin)) +
geom_density(alpha = 0.2, aes(fill = population, color = population)) # use alpha to control transparency
# Normal quantile (Q-Q) plot is probably a better way to visualize differences in distributions
ggplot(hemoglobin_frame, aes(sample = hemoglobin)) +
geom_qq(aes(color = population)) +
stat_qq_line(aes(color = population))
# Variations on box plots
ggplot(hemoglobin_frame, aes(population, hemoglobin)) +
geom_boxplot()
ggplot(hemoglobin_frame, aes(population, hemoglobin)) +
geom_boxplot(aes(color = population))
ggplot(hemoglobin_frame, aes(population, hemoglobin)) +
geom_violin()
# Dot plot not so great for this many points
ggplot(hemoglobin_frame, aes(x = population, y = hemoglobin)) +
geom_dotplot(binaxis = "y", stackdir = "center", dotsize = .5, binwidth = .25, aes(color = population, fill = population))
# ---------------
# Preparing some data to play with
# ---------------
erg_frame <- read.csv(file="https://raw.githubusercontent.com/HeardLibrary/digital-scholarship/master/data/r/color-anova-example.csv")
erg_frame$color <- as.factor(erg_frame$color) # convert from character strings to factor (categories)
erg_frame$block <- as.factor(erg_frame$block) # convert from character strings to factor (categories)
# Calculate the mean for each color
erg_mean_frame <- erg_frame %>%
group_by(color) %>% # group_by() is from the dplyr package
summarise(mean_response = mean(response)) # summarise() also from dplyr
# ---------------
# Understanding "shortcut" geoms and the stat argument
# ---------------
# Most general graph type
ggplot(erg_mean_frame, aes(x=color, y=mean_response)) +
layer(
mapping = NULL,
data = NULL,
geom = "bar",
stat = "identity",
position = "identity"
)
# geom_bar defaults the layer to "bar" geom and all other arguments except stat
ggplot(erg_mean_frame, aes(x=color, y=mean_response)) +
geom_bar(
stat="identity"
)
# geom_col defaults the stat argument in geom_bar to "identity"
ggplot(erg_mean_frame, aes(x=color, y=mean_response)) +
geom_col(
)
# ---------------
# Displaying experimental differences
# ---------------
# Approach #1: Use separately-created stats to create a column plot using the individual mean values
ggplot(erg_mean_frame, aes(x = color, y = mean_response)) +
geom_col()
# Approach #2: Calculate the stats on the fly from the raw (un-summarized) data
# as the plot is created using "stat" argument
ggplot(erg_frame, aes(x = color, y = response)) +
geom_bar(stat = "summary", fun = "mean") # stat = "summary" overrides the default counting method
# Use a different summary function.
ggplot(erg_frame, aes(x = color, y = response)) +
geom_bar(stat = "summary", fun = "median")
# Illustration of "count" stat
ggplot(erg_frame, aes(x = color)) +
geom_bar(stat = "count")
ggplot(erg_frame, aes(x = color)) +
geom_bar()
# Adding error bars to a bar plot
# In this case, we are using 95% confidence limits (cl)
# NOTE: cl is a measure of the uncertainty of the mean, NOT the variability of the data as in std. dev.
# miscelaneous stat functions
library("Hmisc")
# mean_cl_normal() calculates the mean and upper and lower 95% confidence intervals for a vector
# See https://rdrr.io/cran/Hmisc/man/smean.sd.html for details
erg_cl_frame <- erg_frame %>%
group_by(color) %>% # group_by() is from the dplyr package
summarise(cl_response = mean_cl_normal(response)) # summarise() also from dplyr
# Output is apparently a tibble
erg_cl_frame
# But effectively it's a list of a factor vector (color) and a dataframe (cl_response)
erg_cl_frame$color
erg_cl_frame$cl_response
# We can access columns in the dataframe using this weird notation (column in a dataframe in a list)
erg_cl_frame$cl_response$ymin
# put means and cl's into a simple data frame that can be understood by ggplot
graphing_data <- data.frame(mean = erg_cl_frame$cl_response$y, lower_cl = erg_cl_frame$cl_response$ymin, upper_cl = erg_cl_frame$cl_response$ymax, color = erg_cl_frame$color)
# identity stat leaves the data the same. Normally geom_bar uses "count" as the stat
ggplot(graphing_data, aes(x=color, y=mean)) +
geom_bar(stat="identity", aes(fill = color)) + # same as geom_col(aes(fill = color))
geom_errorbar(aes(ymin = lower_cl, ymax = upper_cl),
width=.2) +
scale_fill_manual(values=c("blue", "green", "red")) +
labs(
x = "color of incident light",
y = "electroretinogram response (mV)",
title = "Sensitivity of cockroach eyes to varying colors (error bars: 95% confidence limits)"
) +
theme(legend.position = "none")
# ---------------
# Statistically calculated line geoms
# ---------------
# Although not explicitly given as a "stat" argument, the geom_smooth is performing a stat
# function by summarizing a set of data using a statistical method.
# Data from https://whitlockschluter.zoology.ubc.ca/data/chapter17
lion_noses <- read.csv(file="https://whitlockschluter.zoology.ubc.ca/wp-content/data/chapter17/chap17e1LionNoses.csv")
# geom_smooth defaults to the "loess" stat for curve fitting if < 1000 points
ggplot(lion_noses, mapping = aes(x = proportionBlack, y = ageInYears)) +
geom_point() +
geom_smooth() +
labs(y="age (years)", x = "proportion black")
# Suppress the standard error range around the curve
ggplot(lion_noses, mapping = aes(x = proportionBlack, y = ageInYears)) +
geom_point() +
geom_smooth(se = FALSE) +
labs(y="age (years)", x = "proportion black")
# Linear model stat produces best fit line (i.e. defaults to y~x)
ggplot(lion_noses, mapping = aes(x = proportionBlack, y = ageInYears)) +
geom_point() +
geom_smooth(method = "lm") +
labs(y="age (years)", x = "proportion black")
# Display line without SE range
ggplot(lion_noses, mapping = aes(x = proportionBlack, y = ageInYears)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(y="age (years)", x = "proportion black")
# Use second-order polynomial (quadratic) fit in linear model instead of line
# I did not find an obvious list explaining the possible range of formulas that can be used.
# But Googling/Stack Overflow will probably allow you to find the function you want.
ggplot(lion_noses, mapping = aes(x = proportionBlack, y = ageInYears)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
labs(y="age (years)", x = "proportion black")
# Another example: fit an exponential curve through the data
ggplot(lion_noses, mapping = aes(x = proportionBlack, y = ageInYears)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ exp(x)) +
labs(y="age (years)", x = "proportion black")