## Relevant Libraries

In [1]:
import requests
from bs4 import BeautifulSoup

import pandas as pd

## Player dictionary

    • Keys:   Quarterbacks from the 2000–2018 draft classes with at least one playoff appearance
    • Values: Draft Round

In [2]:
# Maps each quarterback's name to the round in which he was drafted.
# A value of 8 is a sentinel meaning "undrafted free agent" (UDFA).
# Every name appears exactly once: a repeated key in a dict literal silently
# overwrites the earlier entry, which hides typos and conflicting values.
draft_dic = {
    'Blake Bortles': 1,
    'Tom Brady': 6,
    'Teddy Bridgewater': 1,
    'Kirk Cousins': 4,
    'Nick Foles': 3,
    'Jared Goff': 1,
    'Lamar Jackson': 1,
    'Case Keenum': 8,        # UDFA
    'Andrew Luck': 1,
    'Patrick Mahomes': 1,
    'Eli Manning': 1,
    'Dak Prescott': 4,
    'Philip Rivers': 1,
    'Aaron Rodgers': 1,
    'Ben Roethlisberger': 1,
    'Matt Ryan': 1,
    'Matthew Stafford': 1,
    'Mitchell Trubisky': 1,
    'Deshaun Watson': 1,
    'Russell Wilson': 3,
    'Cam Newton': 1,
    'Andy Dalton': 2,
    'Colin Kaepernick': 2,
    'Tyrod Taylor': 6,
    'Joe Flacco': 1,
    'Michael Vick': 1,
    'Carson Palmer': 1,
    'Tony Romo': 8,          # UDFA
    'Alex Smith': 1,
    'Drew Brees': 2,
    'Jay Cutler': 1,
    'Robert Griffin III': 1,
    'Brian Hoyer': 8,        # UDFA
    'Brock Osweiler': 2,
    'Marcus Mariota': 1,
    'Matt Cassel': 7,
    'Mark Sanchez': 1,
    'Tim Tebow': 1,
    'T.J. Yates': 5,
    'Joe Webb': 6,
    'Matt Schaub': 3,
    'Ryan Lindley': 6,
    'A.J. McCarron': 5,
    'Connor Cook': 4,
    'Matt Moore': 8,         # UDFA
}

    • Draft round 8 is a placeholder meaning the player was an undrafted free agent (UDFA).

## Player list in alphabetical order

In [3]:
# Alphabetically ordered player names; this order defines the row order of every
# list built from it later on.
Qb_list = sorted(draft_dic.keys())

## Scraper Loop

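    • This loop scrapes each player's passing stats for his first recorded season, i.e. the earliest season between 2000 and 2018 that has a passing row on pro-football-reference.com.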

In [4]:
#///////////////////////////////////////////Configuration////////////////////////////////////////////////

# Seasons that are searched for a player's first passing row (both ends inclusive).
FIRST_SEASON = 2000
LAST_SEASON = 2018

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30

# Some players share the name-derived URL pattern with other players and therefore
# need a non-zero trailing index in their URL. Everyone else uses index 0.
PLAYER_IDX_OVERRIDES = {
    'Dak Prescott': 1,
    'Robert Griffin III': 1,
    'Marcus Mariota': 1,
    'Matt Moore': 1,
    'Alex Smith': 3,
}


#///////////////////////////////////////////Helper functions/////////////////////////////////////////////

def _player_url(player_name):
    """Build the www.pro-football-reference.com URL for `player_name`.

    The URL is assembled from the first letter of the last name, the first four
    characters of the last name, the first two characters of the first name and
    the player's disambiguation index (see PLAYER_IDX_OVERRIDES).
    Only the first two whitespace-separated words of the name are used.
    """
    first_name, last_name = player_name.split()[:2]
    player_idx = PLAYER_IDX_OVERRIDES.get(player_name, 0)

    return 'https://www.pro-football-reference.com/players/{}/{}{}0{}.htm'.format(
        last_name[0],
        last_name[0:4],
        first_name[0:2],
        player_idx,
    )


def _first_passing_row(soup):
    """Return the earliest <tr id="passing.YEAR"> row within the configured seasons.

    Returns None when the page contains no such row for any season.
    """
    for year in range(FIRST_SEASON, LAST_SEASON + 1):
        row = soup.find("tr", {"id": "passing.{}".format(year)})
        if row is not None:
            return row
    return None


def _stat_as_int(row, stat_name):
    """Read the <td data-stat=stat_name> cell of `row` and return its value as an int.

    The cell text is taken without markup, so values wrapped in tags such as
    <strong> are handled the same way as plain values. An empty cell counts as 0.
    If the text is not a plain integer, only its digits are kept.

    Raises:
        ValueError: if the row has no cell for `stat_name`.
    """
    cell = row.find("td", {"data-stat": stat_name})
    if cell is None:
        raise ValueError("No '{}' cell found in the season row".format(stat_name))

    text = cell.get_text(strip=True)
    if not text:
        return 0

    try:
        return int(text)
    except ValueError:
        # Stray non-numeric characters around the number: keep the digits only.
        digits = ''.join(ch for ch in text if ch.isdigit())
        return int(digits) if digits else 0


#///////////////////////////////////////////Initialize lists/////////////////////////////////////////////

attempts = []
completions = []
yards = []
touchdowns = []
interceptions = []
draft_rounds = []
game_starts = []


#///////////////////////////////////////////Scrape every player//////////////////////////////////////////
# NOTE(review): the requests are sent back-to-back without any delay - confirm that
# the site tolerates this request rate.

for player_name in Qb_list:

    #/////////////////////////////////////////////Download page//////////////////////////////////////////////
    url = _player_url(player_name)
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()   # stop on HTTP errors instead of parsing an error page
    soup = BeautifulSoup(page.content, 'html.parser')

    #//////////////////////////////////////////Find starting season//////////////////////////////////////////
    season_row = _first_passing_row(soup)
    if season_row is None:
        raise ValueError(
            'No passing season between {} and {} found for {} ({})'.format(
                FIRST_SEASON, LAST_SEASON, player_name, url
            )
        )

    #///////////////////////////////////////////Append stats to lists////////////////////////////////////////
    attempts.append(_stat_as_int(season_row, "pass_att"))
    completions.append(_stat_as_int(season_row, "pass_cmp"))
    yards.append(_stat_as_int(season_row, "pass_yds"))
    touchdowns.append(_stat_as_int(season_row, "pass_td"))
    interceptions.append(_stat_as_int(season_row, "pass_int"))
    game_starts.append(_stat_as_int(season_row, "gs"))
    draft_rounds.append(draft_dic[player_name])

## Pandas Data Frame

In [5]:
# Column label -> per-player values. The order of the pairs sets the column order.
data_dic = dict([
    ('Games started', game_starts),
    ('Attempts', attempts),
    ('Completions', completions),
    ('Yards', yards),
    ('Touchdowns', touchdowns),
    ('Interceptions', interceptions),
    ('Draft Round', draft_rounds),
])

# One row per quarterback, labelled with the player's name.
data = pd.DataFrame(data_dic, index=Qb_list)

In [6]:
# Show the assembled DataFrame as the cell output for a visual check.
data

Unnamed: 0,Games started,Attempts,Completions,Yards,Touchdowns,Interceptions,Draft Round
A.J. McCarron,0,0,0,0,0,0,5
Aaron Rodgers,0,16,9,65,0,1,1
Alex Smith,7,165,84,875,1,11,1
Andrew Luck,16,627,339,4374,23,18,1
Andy Dalton,16,516,300,3398,20,13,2
Ben Roethlisberger,13,295,196,2621,17,11,1
Blake Bortles,13,475,280,2908,11,17,1
Brian Hoyer,0,27,19,142,0,0,8
Brock Osweiler,0,4,2,12,0,0,2
Cam Newton,16,517,310,4051,21,17,1


In [7]:
# Write the frame as comma-separated text to the relative path 'Stats_Season_1'
# (note: the file name has no .csv extension).
data.to_csv('Stats_Season_1')