-
Notifications
You must be signed in to change notification settings - Fork 0
/
ReportByCounty.Rmd
141 lines (88 loc) · 4.7 KB
/
ReportByCounty.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
---
title: "Report By County"
author: "Runan Yao"
date: "November 1, 2018"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(randomForest)
library(dplyr)
```
```{r}
# Load the county-level dataset and print its column names for reference.
dat.set <- read.csv('Data/CountyDataset6.csv')
colnames(dat.set)

# Turn every vote-count column into a share of the county's total population.
vote.cols <- c('SenR', 'SenD', 'SenThird',
               'CongR', 'CongD', 'CongThird',
               'GovR', 'GovD', 'GovThird')
for (vote.col in vote.cols) {
  dat.set[[vote.col]] <- dat.set[[vote.col]] / dat.set$TotalPop
}

# Per-party target: the largest share across the Senate, Congress and
# Governor races. pmax() is the element-wise maximum and, like the nested
# ifelse() it replaces, yields a missing value when any input is missing.
dat.set$R <- pmax(dat.set$SenR, dat.set$CongR, dat.set$GovR)
dat.set$D <- pmax(dat.set$SenD, dat.set$CongD, dat.set$GovD)
dat.set$Third <- pmax(dat.set$SenThird, dat.set$CongThird, dat.set$GovThird)

# Indicator feature: 0 for years before 2017, 1 otherwise.
dat.set$IsPresR <- ifelse(dat.set$Year < 2017, 0, 1)

# Split the data into targets and predictors.
target.cols <- c('R', 'D', 'Third')
feature.cols <- c('PRESyear', 'SENyear', 'GOVyear',
                  'Age25to44', 'Age45to64', 'Over65', 'Age0to24',
                  "White", "Black", "Natives", "Asian", "Hispanic",
                  "PacificIslander", "Unemployment.Rate", "Land.Area",
                  "PopulationDensity", "IsPresR")
#, "SENincumbent", "GOVincumbent"
training.tar <- dat.set %>%
  select(one_of(target.cols))
training.dat <- dat.set %>%
  select(one_of(feature.cols))

# Replace missing cells with a small positive constant. Index with the
# logical matrix itself: which() would produce linear cell positions, and
# `[<-.data.frame` interprets a plain numeric index as column numbers.
training.tar[is.na(training.tar)] <- 0.01
training.dat[is.na(training.dat)] <- 0.01

# The relative error below divides by R, so zero targets are nudged to 0.01.
# Assigning into the column vector is safe even when no row matches.
training.tar$R[which(training.tar$R == 0)] <- 0.01

# Repeated hold-out validation of the Republican-share forest: each round
# holds out n.holdout random counties, trains on the rest and accumulates the
# mean absolute relative error on the held-out rows.
n.rounds <- 100
n.holdout <- 100
error <- 0
for (i in seq_len(n.rounds)) {
  idx <- sample(nrow(training.dat), n.holdout)
  t.tar <- training.tar$R[-idx]
  t.dat <- training.dat[-idx, ]
  tst.dat <- training.dat[idx, ]
  tst.tar <- training.tar$R[idx]
  Repforest <- randomForest(x = t.dat, y = t.tar)
  prd <- predict(Repforest, tst.dat)
  error <- error + mean(abs((prd - tst.tar) / tst.tar))
}
# Average relative error over all rounds.
error / n.rounds

# Variable importance, most important first. Repforest is the forest from the
# last validation round.
Repforest$importance[order(Repforest$importance, decreasing = TRUE),]

# Fit the Democratic-share forest here so this chunk does not depend on an
# object that is only created by a later chunk.
DFLforest <- randomForest(x = training.dat, y = training.tar$D)
DFLforest$importance[order(DFLforest$importance, decreasing = TRUE),]

# Linear models on a hand-picked subset of predictors. Targets and predictors
# are combined into one data frame so the formulas can use bare column names.
lm.dat <- cbind(training.tar, training.dat)
RepLm <- lm(R ~ GOVyear + PopulationDensity + Over65 + SENyear +
              Age25to44 + Age45to64 + Black + Unemployment.Rate,
            data = lm.dat)
summary(RepLm)
DFLLm <- lm(D ~ GOVyear + PopulationDensity + Over65 + SENyear +
              Age25to44 + Age45to64 + Black + Unemployment.Rate,
            data = lm.dat)
summary(DFLLm)
```
```{r}
# Reload the dataset so the vote columns hold their original values from the
# file (no division by population in this chunk).
dat.set <- read.csv('Data/CountyDataset6.csv')
colnames(dat.set)

# Per-party target: the largest value across the Senate, Congress and
# Governor races. pmax() is the element-wise maximum and, like the nested
# ifelse() it replaces, yields a missing value when any input is missing.
dat.set$R <- pmax(dat.set$SenR, dat.set$CongR, dat.set$GovR)
dat.set$D <- pmax(dat.set$SenD, dat.set$CongD, dat.set$GovD)
dat.set$Third <- pmax(dat.set$SenThird, dat.set$CongThird, dat.set$GovThird)

# The feature list below asks for IsPresR, but re-reading the file discards
# any column added earlier in the session; rebuild it so one_of() does not
# silently drop the predictor (0 for years before 2017, 1 otherwise).
dat.set$IsPresR <- ifelse(dat.set$Year < 2017, 0, 1)

# Split the data into targets and predictors.
target.cols <- c('R', 'D', 'Third')
feature.cols <- c('PRESyear', 'SENyear', 'GOVyear',
                  'Age25to44', 'Age45to64', 'Over65', 'Age0to24',
                  "White", "Black", "Natives", "Asian", "Hispanic",
                  "PacificIslander", "Unemployment.Rate", "Land.Area",
                  "PopulationDensity", "IsPresR")
training.tar <- dat.set %>%
  select(one_of(target.cols))
training.dat <- dat.set %>%
  select(one_of(feature.cols))

# Replace missing cells with a small positive constant. Index with the
# logical matrix itself: which() would produce linear cell positions, and
# `[<-.data.frame` interprets a plain numeric index as column numbers.
training.tar[is.na(training.tar)] <- 0.01
training.dat[is.na(training.dat)] <- 0.01

# Nudge zero Republican targets to 0.01. Assigning into the column vector is
# safe even when no row matches.
training.tar$R[which(training.tar$R == 0)] <- 0.01

# Evaluate on the 2016 rows only.
# NOTE(review): these rows are also part of the data both forests are trained
# on, so the figure below is an in-sample error.
t.idx <- which(dat.set$Year == 2016)
test.dat <- training.dat[t.idx, ]

# Absolute error of the Republican forest on the 2016 rows.
Repforest <- randomForest(x = training.dat, y = training.tar$R)
errorValue <- predict(Repforest, test.dat) - training.tar$R[t.idx]
error <- abs(errorValue)

# Absolute error of the Democratic forest, compared against the D target
# (not R), added to the Republican error row by row.
DFLforest <- randomForest(x = training.dat, y = training.tar$D)
errorValue <- predict(DFLforest, test.dat) - training.tar$D[t.idx]
error <- abs(errorValue) + error

# Total absolute error as a fraction of the summed R, D and Third targets of
# the 2016 rows.
sum(error) / sum(training.tar[t.idx,])
```