In [164]:
library(rvest)
library(tidyverse)

In [165]:
rushing_player <- data.frame()

for (i in 2014:2018){
    url <- paste0('https://www.pro-football-reference.com/years/', i, '/rushing.htm')
    page <- read_html(url)
    rank_data <- html_nodes(page, '#rushing')
    dat <- as.data.frame(html_table(rank_data))
    colnames(dat) <- dat[1,]
    dat <- dat[-1,]
    dat %>% filter(Tm != 'Tm') -> dat
    dat$year <- i
    dat$Player <- gsub('\\*?\\+?', '', dat$Player)
    
    rushing_player <- rbind(rushing_player, dat)
}

write.csv(rushing_player, file = 'Data/rushing_player.csv', row.names = F)

rushing_player <- read.csv(file  = 'Data/rushing_player.csv')
head(rushing_player)

Rk,Player,Tm,Age,Pos,G,GS,Att,Yds,TD,Lng,Y.A,Y.G,Fmb,year
1,DeMarco Murray,DAL,26,RB,16,16,392,1845,13,51,4.7,115.3,6,2014
2,LeSean McCoy,PHI,26,RB,16,16,312,1319,5,53,4.2,82.4,4,2014
3,Le'Veon Bell,PIT,22,RB,16,16,290,1361,8,81,4.7,85.1,0,2014
4,Marshawn Lynch,SEA,28,RB,16,14,280,1306,13,79,4.7,81.6,4,2014
5,Matt Forte,CHI,29,RB,16,16,266,1038,6,32,3.9,64.9,2,2014
6,Alfred Morris,WAS,26,RB,16,16,265,1074,8,30,4.1,67.1,2,2014


Now we want to make sure we are only looking at the leading rushers for each team in a given year.

In [166]:
rushing_player %>%
    group_by(Tm, year) %>%
    top_n(n = 1, wt = as.numeric(Yds)) -> top_rushers

In [167]:
rushing_team <- data.frame()

for (i in 2014:2018) {
    url <- paste0('https://www.pro-football-reference.com/years/', i, '/')

    url %>% read_html() %>%               # parse html
    html_nodes('#all_rushing') %>%        # select node with comment
    html_nodes(xpath = 'comment()') %>%   # select comments within node
    html_text() %>%                       # return contents as text
    read_html() %>%                       # parse text as html
    html_node('table') %>%                # select table node
    html_table() -> dat
    dat <- dat[-seq(nrow(dat)-2,nrow(dat)),]
    dat$year <- i
    
    rushing_team <- rbind(rushing_team, dat)
  }  
write.csv(rushing_team, file = 'Data/rushing_team.csv', row.names = F)
rushing_team <- read.csv(file = 'Data/rushing_team.csv')
head(rushing_team)

Rk,Tm,G,Att,Yds,TD,Lng,Y.A,Y.G,Fmb,EXP,year
1,Seattle Seahawks,16,525,2762,20,79,5.3,172.6,23,65.66,2014
2,Dallas Cowboys,16,508,2354,16,65,4.6,147.1,21,-17.9,2014
3,New York Jets,16,507,2280,11,71,4.5,142.5,25,-19.76,2014
4,San Francisco 49ers,16,470,2176,10,90,4.6,136.0,18,-32.96,2014
5,Houston Texans,16,551,2161,12,51,3.9,135.1,19,-44.9,2014
6,Cincinnati Bengals,16,492,2147,19,89,4.4,134.2,20,3.67,2014


In [168]:
ref <- read.csv('C:\\Users\\osnwh2\\PyProjects\\TeamAbbrev.csv')[,-1]
colnames(ref) <- c('Team', 'Tm', 'Conference', 'Division')

In [169]:
rushing_player <- merge(top_rushers, ref)

In [171]:
head(rushing_player)

Tm,Rk,Player,Age,Pos,G,GS,Att,Yds,TD,Lng,Y.A,Y.G,Fmb,year,Team,Conference,Division
ARI,3,David Johnson,27,RB,16,16,258,940,7,53,3.6,58.8,3,2018,Arizona Cardinals,NFC,West
ARI,39,Kerwynn Williams,26,rb,16,6,120,426,1,25,3.6,26.6,0,2017,Arizona Cardinals,NFC,West
ARI,3,David Johnson,25,RB,16,16,293,1239,16,58,4.2,77.4,5,2016,Arizona Cardinals,NFC,West
ARI,16,Chris Johnson,30,RB,11,9,196,814,3,62,4.2,74.0,2,2015,Arizona Cardinals,NFC,West
ARI,17,Andre Ellington,25,RB,12,12,201,660,3,22,3.3,55.0,2,2014,Arizona Cardinals,NFC,West
ATL,4,Devonta Freeman,23,RB,15,13,265,1056,11,39,4.0,70.4,3,2015,Atlanta Falcons,NFC,South


In [174]:
rushing_player %>%
select(year, Tm, Team, Player, Age, Pos, G, Att, Yds, TD, Fmb, Conference, Division) -> rushing_player

In [180]:
rushing_team %>%
select(Tm, Yds, Att, TD, Fmb, year) %>%
merge(rushing_player, by.x = c('Tm', 'year'), by.y = c('Team', 'year')) -> rushing
colnames(rushing) <- c('Team', 'Year', 'TeamYds', 'TeamAtt','TeamTD','TeamFmb','Abbv','Player','Age','Pos','G','PlayerAtt','PlayerYds','PlayerTD','PlayerFmb','Conference','Division')

In [181]:
rushing

Team,Year,TeamYds,TeamAtt,TeamTD,TeamFmb,Abbv,Player,Age,Pos,G,PlayerAtt,PlayerYds,PlayerTD,PlayerFmb,Conference,Division
Arizona Cardinals,2014,1308,397,6,16,ARI,Andre Ellington,25,RB,12,201,660,3,2,NFC,West
Arizona Cardinals,2015,1917,452,16,23,ARI,Chris Johnson,30,RB,11,196,814,3,2,NFC,West
Arizona Cardinals,2016,1732,399,20,27,ARI,David Johnson,25,RB,16,293,1239,16,5,NFC,West
Arizona Cardinals,2017,1386,410,6,19,ARI,Kerwynn Williams,26,rb,16,120,426,1,0,NFC,West
Arizona Cardinals,2018,1342,355,9,23,ARI,David Johnson,27,RB,16,258,940,7,3,NFC,West
Atlanta Falcons,2014,1498,372,11,16,ATL,Steven Jackson,31,RB,15,190,707,6,0,NFC,South
Atlanta Falcons,2015,1606,420,13,26,ATL,Devonta Freeman,23,RB,15,265,1056,11,3,NFC,South
Atlanta Falcons,2016,1928,421,20,8,ATL,Devonta Freeman,24,RB,16,227,1079,11,1,NFC,South
Atlanta Falcons,2017,1847,430,12,17,ATL,Devonta Freeman,25,RB,14,196,865,7,4,NFC,South
Atlanta Falcons,2018,1573,351,11,24,ATL,Tevin Coleman,25,RB,16,167,800,4,2,NFC,South


In [149]:
merge(top_rushers, rev, by.x = 'Tm', by.y = 'Abbreviation')

ERROR: Error in if (nx >= 2^31 || ny >= 2^31) stop("long vectors are not supported"): missing value where TRUE/FALSE needed
