/
wrangle-nls-answers.R
189 lines (156 loc) · 8.66 KB
/
wrangle-nls-answers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# ------------------
# Assignment #1: Select subset of responses and merge datasets
# ------------------
# We want to pull out the following responses from the two listed datasets,
# then merge them into one. Page numbers are from the x-Frequencies.pdf document
# where x is the dataset identifier.
# DS0001
# AID Unique participant identifier
# IYEAR YEAR OF INTERVIEW-W1 p. 3 no missing values
# BIO_SEX BIOLOGICAL SEX-W1 p. 4 missing value 6 1=male, 2=female
# H1GI1Y S1Q1 BIRTH YEAR-W1 p. 6 missing value 96
# H1PF1 S18Q1 Mom warm and loving p. 364 missing values 6-8
# H1PF2 S18Q2 Mom encurages independence p. 364 missing values 6-8
# H1PF3 S18Q3 Mom discusses ethics p. 365 missing values 6-9
# H1PF4 S18Q4 Mom good communication p. 365 missing values 6-9
# H1PF5 S18Q5 Mom good relationship p. 366 missing values 6-9
# H1PF23 S18Q23 Dad warm and loving p. 375 missing values 6-9
# H1PF24 S18Q24 Dad good communication p. 376 missing values 6-9
# H1PF25 S18Q25 Dad good relationship p. 376 missing values 6-9
#
# DS0022
# AID Unique participant identifier
# H4TO1 S23Q1 Ever smoked entire cigarette p. 399 missing values 6-8
# H4TO2 S23Q2 Age 1st smoked entire cigarette p. 399 missing values 96-98
# H4TO3 S23Q3 Ever smoked cigs regularly p. 400 missing values 6-8
# H4TO4 S23Q4 Age smoled cigs regularly p. 401 missing values 96-98
# H4TO5 S23Q5 Num days smokd cigs lst 30 days p. 402
# H4GH6 S4Q6 Weight lbs p. 50 missing values 996 and 998
# H4GH5F S4Q5F Height feet p. 49 missing value 98
# H4GH5I S4Q5I Height inches p. 49 missing value 98
# load necessary packages
# library(tidyverse) # use if you installed the whole tidyverse
library(readr) # for reading tibbles
library(dplyr) # tibble manipulation
# set working directory to location of downloaded files
setwd("~/nls_data/class/")
# read in the two datasets and create tibbles for them
closeness <- read_tsv("21600-0001-Data.tsv")
physical <- read_tsv("21600-0022-Data.tsv")
# extract columns for analysis from big tibbles into small tibbles
small_closeness <- select(closeness, AID, BIO_SEX, IYEAR, H1GI1Y, H1PF1:H1PF5, H1PF23:H1PF25)
small_physical <- select(physical, AID, H4TO1:H4TO5, H4GH6, H4GH5F, H4GH5I)
# perform outer join to merge the two small tibbles
combined_outer_join <- full_join(small_closeness, small_physical, by = c("AID"), copy = FALSE, suffix = c(".close", ".phys") )
head(combined_outer_join)
combined_inner_join <- inner_join(small_closeness, small_physical, by = c("AID"), copy = FALSE, suffix = c(".close", ".phys") )
head(combined_inner_join)
# save files to avoid having to load the big datasets into memory
write_csv(combined_outer_join, "outer_raw.csv", na = "NA", append = FALSE, col_names = TRUE, quote_escape = "double")
write_csv(combined_inner_join, "inner_raw.csv", na = "NA", append = FALSE, col_names = TRUE, quote_escape = "double")
# ------------------
# Assignment #2: Calculate a BMI column
# ------------------
# Easiest to do by creating a column for height in meters and column for mass in kg first.
# Start here if already saved small files
# BMI formula is mass in kg divided by (height in m squared)
# library(tidyverse) # use if you installed the whole tidyverse
library(readr) # for reading tibbles
library(dplyr) # tibble manipulation
setwd("~/nls_data/class/")
# load data
inner_raw <- read_csv("inner_raw.csv")
# examine weight data
summary(inner_raw$H4GH6)
no_na <- mutate(inner_raw, H4GH6 = replace(H4GH6, H4GH6>=990, NA))
summary(no_na$H4GH6)
# calculate change missing values to NA, then calculate BMI
bmi_tibble <- inner_raw %>%
# replace missing data numbers with NA
mutate(H1GI1Y = replace(H1GI1Y, H1GI1Y==96, NA)) %>% # birth year missing value 96
mutate(BIO_SEX = replace(BIO_SEX, BIO_SEX==6, NA)) %>% # biological sex missing value 6
mutate(BIO_SEX = replace(BIO_SEX, BIO_SEX==1, "male")) %>% # biological sex missing value 6
mutate(BIO_SEX = replace(BIO_SEX, BIO_SEX==2, "female")) %>% # biological sex missing value 6
mutate(H4TO1 = replace(H4TO1, H4TO1>=6, NA)) %>% # smoked a cigarette? missing value 6-8
mutate(H4TO2 = replace(H4TO2, H4TO2>=96, NA)) %>% # age 1st smoked missing value 96-98
mutate(H4TO3 = replace(H4TO3, H4TO3>=6, NA)) %>% # smoked regularly? missing value 6-8
mutate(H4TO4 = replace(H4TO4, H4TO4>=96, NA)) %>% # age smoked regularly missing values 96-98
mutate(H4TO5 = replace(H4TO5, H4TO5>=32, NA)) %>% # number days/month smoked missing values greater than 31
mutate(H4GH6 = replace(H4GH6, H4GH6>=990, NA)) %>% # weight missing values 996 and 998
mutate(H4GH5F = replace(H4GH5F, H4GH5F==98, NA)) %>% # feet missing value 98
mutate(H4GH5I = replace(H4GH5I, H4GH5I==98, NA)) %>% # inches missing value 98
# calculate age
mutate(age = IYEAR - H1GI1Y) %>% # interview year minus birth year
# calulate the BMI value
mutate(height_m = H4GH5F*12 + H4GH5I) %>% # convert feet and inches to inches
mutate(height_m = height_m*2.54/100) %>% # convert inches to meters
mutate(mass_kg = H4GH6*0.453592) %>% # convert pounds to kg
mutate(bmi = mass_kg/height_m^2) # calculate BMI
# ------------------
# Assignment #3: Calculate a maternal closeness column
# ------------------
# change missing values to NA, then calculate maternal closeness
with_maternal <- bmi_tibble %>%
mutate(H1PF1 = replace(H1PF1, H1PF1>=6, NA)) %>%
mutate(H1PF2 = replace(H1PF2, H1PF2>=6, NA)) %>%
mutate(H1PF3 = replace(H1PF3, H1PF3>=6, NA)) %>%
mutate(H1PF4 = replace(H1PF4, H1PF4>=6, NA)) %>%
mutate(H1PF5 = replace(H1PF5, H1PF5>=6, NA)) %>%
mutate(H1PF23 = replace(H1PF23, H1PF23>=6, NA)) %>%
mutate(H1PF24 = replace(H1PF24, H1PF24>=6, NA)) %>%
mutate(H1PF25 = replace(H1PF25, H1PF25>=6, NA)) %>%
mutate(mc_H1PF1 = ifelse(!H1PF1==1 & !is.na(H1PF1), 0, ifelse(H1PF1==1, 1, NA))) %>%
mutate(mc_H1PF2 = ifelse(!H1PF2==1 & !is.na(H1PF2), 0, ifelse(H1PF2==1, 1, NA))) %>%
mutate(mc_H1PF3 = ifelse(!H1PF3==1 & !is.na(H1PF3), 0, ifelse(H1PF3==1, 1, NA))) %>%
mutate(mc_H1PF4 = ifelse(!H1PF4==1 & !is.na(H1PF4), 0, ifelse(H1PF4==1, 1, NA))) %>%
mutate(mc_H1PF5 = ifelse(!H1PF5==1 & !is.na(H1PF5), 0, ifelse(H1PF5==1, 1, NA))) %>%
mutate(maternal_closeness = ifelse(mc_H1PF1==1 & mc_H1PF2==1 & mc_H1PF3==1 & mc_H1PF4==1 & mc_H1PF5==1, 1, ifelse(is.na(H1PF1) | is.na(H1PF2) | is.na(H1PF3) | is.na(H1PF4) | is.na(H1PF5), NA, 0)))
write_csv(with_maternal, "mutated_data.csv", na = "NA", append = FALSE, col_names = TRUE, quote_escape = "double")
# ------------------
# Assignment #4: Relationship between age and height
# ------------------
library(ggplot2)
library(readr) # for reading tibbles
setwd("~/nls_data/class/")
with_maternal <- read_csv("mutated_data.csv")
# plot the data using regular plot, calculate and plot trendline, summarize stats
plot(with_maternal$height_m ~ with_maternal$age)
model <- lm(with_maternal$height_m ~ with_maternal$age)
abline(model)
summary(model)
# use ggplot to plot height vs. age, colored by sex with smooth geom
ggplot(data = with_maternal) +
geom_point(mapping = aes(x = age, y = height_m, color = BIO_SEX), position = position_dodge(width = .3)) +
geom_smooth(mapping = aes(x = age, y = height_m, color = BIO_SEX)) +
labs(y="height (m)", x = "age (years)", color = "sex")
# ------------------
# Assignment #5: Relationship between maternal closeness and smoking
# ------------------
# Relationship between age of starting smoking and closeness
# create a factor out of closeness, box and whisker plot, then t-test of means
closeness_factor <- factor(with_maternal$maternal_closeness,
levels = c(0, 1),
labels = c("distant", "close"))
plot(with_maternal$H4TO2 ~ closeness_factor)
t.test(with_maternal$H4TO2 ~ closeness_factor, var.equal=TRUE, conf.level=0.95)
# Relationship between number of days/month smoking and closeness
plot(with_maternal$H4TO5 ~ closeness_factor)
t.test(with_maternal$H4TO5 ~ closeness_factor, var.equal=TRUE, conf.level=0.95)
# ------------------
# Assignment #6: Relationship between smoking, weight, and BMI
# ------------------
# Relationship between number of days/month smoking and mass (plot, trendline, summar)
plot(with_maternal$mass_kg ~ with_maternal$H4TO5)
model <- lm(with_maternal$mass_kg ~ with_maternal$H4TO5)
abline(model)
summary(model)
# Relationship between number of days/month smoking and BMI
plot(with_maternal$bmi ~ with_maternal$H4TO5)
model <- lm(with_maternal$bmi ~ with_maternal$H4TO5)
abline(model)
summary(model)
# ggplot viz: does the effect of monthly smoking on BMI have a different effect on males than females?
ggplot(data = with_maternal) +
geom_point(mapping = aes(x = H4TO5, y = bmi, color = BIO_SEX), position = position_dodge(width = .3)) +
geom_smooth(mapping = aes(x = H4TO5, y = bmi, color = BIO_SEX)) +
labs(y="body mass index (BMI)", x = "days smoked per month", color = "sex")