/
9-3 exercise.Rmd
157 lines (112 loc) · 4.37 KB
/
9-3 exercise.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
---
title: "Class 9-3"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(dplyr)
library(broom)
library(mosaic)
library(caret)
set.seed(370)
# generate dataset
# how much food does each family need per week
# 5lbs per person
# with some random variation based - say different ppl need different amounts
family_sizes = c(1,2,2,2,3,3,3,4,4,5,5,5,6)
refugee_food_needs <- tibble::data_frame(
family_size = sample(family_sizes, n=100), #sort(runif(100)),
food_needed = 5*family_size + rnorm(length(family_size), 0, 1)
)
plot(refugee_food_needs)
```
## Single model version with simulation
```{r}
# select outcome variable
outcome <- refugee_food_needs %>% dplyr::select(food_needed)
# split testing and training data
split_proportion = 0.8 # specify proportion of data used for training
train_ind <- createDataPartition(outcome$food_needed, p = split_proportion, list = FALSE)
train_refugee_food_needs <- refugee_food_needs[train_ind,] # get training data
final_test_refugee_food_needs <- refugee_food_needs[-train_ind,] # get test data
# we'll train model on the data, then use it
# to simulate giving food with the testing data
# here's an example not with cross validation
lm_model <- lm(?)
simulate_outcomes <- function(policy, model, rows_to_simulate_with) {
browser()
outcomes <- tribble( ~policy, ~num_deaths) %>%
add_row(policy=policy)
if(policy == "linear_model_method"){
with_decisions <- augment(model, newdata=rows_to_simulate_with)
# simulate outcome of the decision
with_decisions <- with_decisions %>% mutate(
alive = ?
)
dead_families <- with_decisions %>% filter(alive == ?)
outcomes <- outcomes %>% mutate (num_deaths=sum(dead_families$?))
}
outcomes
}
simulate_outcomes("linear_model_method", lm_model, train_refugee_food_needs)
```
## What if we add the residual?
```{r}
residuals <- ?
simulate_outcomes <- function(policy, model, rows_to_simulate_with) {
outcomes <- tribble( ~policy, ~num_deaths) %>%
add_row(policy=policy)
if(policy == "linear_model_method"){
with_decisions <- augment(model, newdata=rows_to_simulate_with)
# simulate outcome of the decision
with_decisions <- with_decisions %>% mutate(
alive = ifelse(.fitted>=food_needed, 1, 0)
)
dead_families <- with_decisions %>% filter(alive == 0)
outcomes <- outcomes %>% mutate (num_deaths=sum(dead_families$family_size))
}
if(policy == "linear_model_method_with_residual"){
with_decisions <- augment(model, newdata=rows_to_simulate_with)
# simulate outcome of the decision
with_decisions <- with_decisions %>% mutate(
food_to_give = .fitted + <add residual here>,
alive = ifelse(food_to_give>=food_needed, 1, 0)
)
dead_families <- with_decisions %>% filter(alive == 0)
outcomes <- outcomes %>% mutate (num_deaths=sum(dead_families$family_size))
}
outcomes
}
bind_rows(
simulate_outcomes("linear_model_method", lm_model, train_refugee_food_needs),
simulate_outcomes("linear_model_method_with_residual", lm_model, train_refugee_food_needs))
```
## Extending cross-validation process to include search for decision
```{r}
#Randomly shuffle the order of the rows in the data
train_refugee_food_needs<-train_refugee_food_needs[sample(nrow(train_refugee_food_needs)),]
#Create 10 equally size folds within the training data
folds <- cut(seq(1,nrow(train_refugee_food_needs)),breaks=10,labels=FALSE)
# pattern is choice variables, outcome variables, c/v fold number
cv_outcomes <- tribble( ~policy, ~num_deaths, ~cv_fold_id)
#Perform 10 fold cross validation
for(i in 1:10){
#Segement your data by fold using the which() function
testIndexes <- which(folds==i,arr.ind=TRUE)
cv_testData <- train_refugee_food_needs[testIndexes, ]
cv_trainData <- train_refugee_food_needs[-testIndexes, ]
#Use the cv test and train data partitions
# first fit model
# then simulate
#Add outcomes to table
cv_outcomes <- bind_rows(cv_outcomes,new)
}
# visualize outcomes
#ggplot() + facet_grid(~ cv_fold_id)
# now at the end of your analysis, run on your overall test data
# that we held back from the beginning
```
## Extend decision simulation to include choice for "buffer"
Use a for loop to search over different possible values for buffer,
then choose the one with no deaths.