In [1]:
#import necessary packages

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

## 1. Draft Order
Luckily, this information is contained in a pretty neat table, will be the easiest data to retrieve.

In [37]:
url = 'https://www.pro-football-reference.com/years/2021/draft.htm'
r = requests.get(url)
df = pd.read_html(url)[0]

In [38]:
df.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Misc,Misc,Unnamed: 9_level_0,...,Rushing,Rushing,Receiving,Receiving,Receiving,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0
Unnamed: 0_level_1,Rnd,Pick,Tm,Player,Pos,Age,To,AP1,PB,St,...,Yds,TD,Rec,Yds,TD,Solo,Int,Sk,College/Univ,Unnamed: 28_level_1
0,1,1,JAX,Trevor Lawrence,QB,21,,0,0,0,...,,,,,,,,,Clemson,College Stats
1,1,2,NYJ,Zach Wilson,QB,22,,0,0,0,...,,,,,,,,,BYU,College Stats
2,1,3,SFO,Trey Lance,QB,21,,0,0,0,...,,,,,,,,,North Dakota St.,College Stats
3,1,4,ATL,Kyle Pitts,TE,20,,0,0,0,...,,,,,,,,,Florida,College Stats
4,1,5,CIN,Ja'Marr Chase,WR,21,,0,0,0,...,,,,,,,,,LSU,College Stats


It worked, but a little ugly. Need to get rid of the MultiIndex first.

In [39]:
df.columns = df.columns.get_level_values(-1)

In [40]:
df.head()

Unnamed: 0,Rnd,Pick,Tm,Player,Pos,Age,To,AP1,PB,St,...,Yds,TD,Rec,Yds.1,TD.1,Solo,Int,Sk,College/Univ,Unnamed: 28_level_1
0,1,1,JAX,Trevor Lawrence,QB,21,,0,0,0,...,,,,,,,,,Clemson,College Stats
1,1,2,NYJ,Zach Wilson,QB,22,,0,0,0,...,,,,,,,,,BYU,College Stats
2,1,3,SFO,Trey Lance,QB,21,,0,0,0,...,,,,,,,,,North Dakota St.,College Stats
3,1,4,ATL,Kyle Pitts,TE,20,,0,0,0,...,,,,,,,,,Florida,College Stats
4,1,5,CIN,Ja'Marr Chase,WR,21,,0,0,0,...,,,,,,,,,LSU,College Stats


Good, now just keep the relevant columns.

In [41]:
df = df[['Rnd', 'Pick', 'Tm', 'Player', 'College/Univ']]

In [42]:
df.head(40)

Unnamed: 0,Rnd,Pick,Tm,Player,College/Univ
0,1,1,JAX,Trevor Lawrence,Clemson
1,1,2,NYJ,Zach Wilson,BYU
2,1,3,SFO,Trey Lance,North Dakota St.
3,1,4,ATL,Kyle Pitts,Florida
4,1,5,CIN,Ja'Marr Chase,LSU
5,1,6,MIA,Jaylen Waddle,Alabama
6,1,7,DET,Penei Sewell,Oregon
7,1,8,CAR,Jaycee Horn,South Carolina
8,1,9,DEN,Patrick Surtain II,Alabama
9,1,10,PHI,DeVonta Smith,Alabama


And need to get rid of the subheaders that occur at the beginning of each round.

In [43]:
df = df[df['Rnd'] != 'Rnd']

In [44]:
df.head(40)

Unnamed: 0,Rnd,Pick,Tm,Player,College/Univ
0,1,1,JAX,Trevor Lawrence,Clemson
1,1,2,NYJ,Zach Wilson,BYU
2,1,3,SFO,Trey Lance,North Dakota St.
3,1,4,ATL,Kyle Pitts,Florida
4,1,5,CIN,Ja'Marr Chase,LSU
5,1,6,MIA,Jaylen Waddle,Alabama
6,1,7,DET,Penei Sewell,Oregon
7,1,8,CAR,Jaycee Horn,South Carolina
8,1,9,DEN,Patrick Surtain II,Alabama
9,1,10,PHI,DeVonta Smith,Alabama


## 2. All High Schools
Maybe not the most elegant route, but need a list of the cities where draftees played HS football. Will just add every listed high school from all 50 states to a large dataframe.

In [12]:
alabama_url = 'https://www.pro-football-reference.com/schools/high_schools.cgi?hs_state=AL'

alabamaHS = pd.read_html(alabama_url)[0]

alabamaHS.head(30)

Unnamed: 0,High School,HS City,HS State,Players,Active
0,Woodlawn,Birmingham,AL,14,1
1,S.S. Murphy,Mobile,AL,13,0
2,C.F. Vigor,Prichard,AL,13,0
3,Anniston,Anniston,AL,12,1
4,Central,Phenix City,AL,11,1
5,Robert E. Lee,Montgomery,AL,10,0
6,Sidney Lanier,Montgomery,AL,10,0
7,Mattie T. Blount,Prichard,AL,10,0
8,Jefferson Davis,Montgomery,AL,9,0
9,George Washington Carver,Montgomery,AL,9,1


Looks good, just need to loop through every state. Luckily I was able to find a premade list of all state abbreviations.

In [18]:
All_High_Schools = pd.DataFrame()

stateList = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

for state in stateList:
    state_url = 'https://www.pro-football-reference.com/schools/high_schools.cgi?hs_state=' +str(state)
    state_schools = pd.read_html(state_url)[0]
    All_High_Schools = All_High_Schools.append(state_schools)
    print(state)

AL
AK
AZ
AR
CA
CO
CT
DC
DE
FL
GA
HI
ID
IL
IN
IA
KS
KY
LA
ME
MD
MA
MI
MN
MS
MO
MT
NE
NV
NH
NJ
NM
NY
NC
ND
OH
OK
OR
PA
RI
SC
SD
TN
TX
UT
VT
VA
WA
WV
WI
WY


In [19]:
All_High_Schools

Unnamed: 0,High School,HS City,HS State,Players,Active
0,Woodlawn,Birmingham,AL,14,1
1,S.S. Murphy,Mobile,AL,13,0
2,C.F. Vigor,Prichard,AL,13,0
3,Anniston,Anniston,AL,12,1
4,Central,Phenix City,AL,11,1
...,...,...,...,...,...
16,Lovell,Lovell,WY,1,0
17,Pinedale,Pinedale,WY,1,0
18,Natrona County,Casper,WY,1,1
19,Rock Springs,Rock Springs,WY,1,0


In [21]:
All_High_Schools = All_High_Schools.drop(columns = ['Players', 'Active'])

In [22]:
All_High_Schools.head()

Unnamed: 0,High School,HS City,HS State
0,Woodlawn,Birmingham,AL
1,S.S. Murphy,Mobile,AL
2,C.F. Vigor,Prichard,AL
3,Anniston,Anniston,AL
4,Central,Phenix City,AL


## 3. Draftee's High Schools
This was probably the most complicated step. I also probably went at it in a pretty primitive manner (but I'm learning).

First, I needed to get a link to each player's bio page. I couldn't just plug the name into a url because the naming of said url's are pretty random when it comes to these pages.

In [23]:
draft_url = 'https://www.pro-football-reference.com/years/2021/draft.htm'

r = requests.get(draft_url)

draft_soup = bs(r.content, 'html.parser')
parsed_table = draft_soup.find_all('table')[0]

In [24]:
stub_list =[]
for i, row in enumerate(parsed_table.find_all('tr')[2:]):
    if row.find('td', attrs={'data-stat':'player'}) == None:
        continue
    data = row.find('td', attrs={'data-stat':'player'})
    name = data.a.get_text()
    stub = data.a.get('href')
    stub_list.append(stub)
    print(name, i, stub)

Trevor Lawrence 0 /players/L/LawrTr00.htm
Zach Wilson 1 /players/W/WilsZa00.htm
Trey Lance 2 /players/L/LancTr00.htm
Kyle Pitts 3 /players/P/PittKy00.htm
Ja'Marr Chase 4 /players/C/ChasJa00.htm
Jaylen Waddle 5 /players/W/WaddJa00.htm
Penei Sewell 6 /players/S/SewePe00.htm
Jaycee Horn 7 /players/H/HornJa00.htm
Patrick Surtain II 8 /players/S/SurtPa01.htm
DeVonta Smith 9 /players/S/SmitDe07.htm
Justin Fields 10 /players/F/FielJu00.htm
Micah Parsons 11 /players/P/ParsMi00.htm
Rashawn Slater 12 /players/S/SlatRa00.htm
Alijah Vera-Tucker 13 /players/V/VeraAl00.htm
Mac Jones 14 /players/J/JoneMa05.htm
Zaven Collins 15 /players/C/CollZa00.htm
Alex Leatherwood 16 /players/L/LeatAl00.htm
Jaelan Phillips 17 /players/P/PhilJa02.htm
Jamin Davis 18 /players/D/DaviJa11.htm
Kadarius Toney 19 /players/T/ToneKa00.htm
Kwity Paye 20 /players/P/PayeKw00.htm
Caleb Farley 21 /players/F/FarlCa00.htm
Christian Darrisaw 22 /players/D/DarrCh00.htm
Najee Harris 23 /players/H/HarrNa00.htm
Travis Etienne 24 /playe

Looks good. Next I need to go to each page and get the name of the high school and the state where they played. Unfortunately the information isn't in table form, so this part gets a bit messy.

I'll start by retrieving this info for Trevor Lawrence as an example of what I'm doing.

In [25]:
TrevorURL = 'https://www.pro-football-reference.com/players/L/LawrTr00.htm'
trev = requests.get(TrevorURL)

trevsoup = bs(trev.content)

trevsoup.find('h1').span.get_text()

'Trevor Lawrence'

The name is important, it will let me easily merge this data.

In [26]:
print(trevsoup.find('div', attrs={'id':'info'}))

<div class="players" id="info">
<div id="meta">
<div class="media-item"><img alt="Photo of Trevor Lawrence" itemscope="image" src="https://www.pro-football-reference.com/req/20180910/images/headshots/LawrTr00_2021_draft.jpg"/>
</div><!-- div.media-item --><div itemscope="" itemtype="https://schema.org/Person">
<h1 itemprop="name">
<span>Trevor Lawrence</span>
</h1>
<p>
<strong>
    William Trevor Lawrence
    
  </strong>
</p>
<p>
<strong>Position</strong>: QB
	
</p>
<p><span itemprop="height">6-6</span>, <span itemprop="weight">220lb</span> (198cm, 99kg) </p>
<p>
<strong>Born:</strong>
<span data-birth="1999-10-06" id="necro-birth" itemprop="birthDate">
<a href="/friv/birthdays.cgi?month=10&amp;day=6">October 6</a>, 1999
    </span>
<span itemprop="birthPlace">
    in Knoxville, <a href="/friv/birthplaces.cgi?state=TN">TN</a></span>
</p>
<p><strong>College</strong>: 
  
		<a href="/schools/clemson/">Clemson</a>
	
	
		 (<a href="https://www.sports-reference.com/cfb/players/trevor-lawre

In [27]:
trevsoup.find('div', attrs={'id':'info'}).find_all('p')[5].find_all('a')

[<a href="/schools/high_schools.cgi?id=93ba171d">Cartersville</a>,
 <a href="/schools/high_schools.cgi?hs_state=GA">GA</a>]

In [28]:
trevsoup.find('div', attrs={'id':'info'}).find_all('p')[5].find_all('a')[0].get_text()

'Cartersville'

In [29]:
trevsoup.find('div', attrs={'id':'info'}).find_all('p')[5].find_all('a')[1].get_text()

'GA'

There we go. Just need to do this for every player now.

In [30]:
highSchools = []

for stub in stub_list:
    player_url = 'https://www.pro-football-reference.com' + stub
    r = requests.get(player_url)
    soup = bs(r.content)
    
    name = soup.find('h1').span.get_text()
    
    hs = soup.find('div', attrs={'id':'info'}).find_all('p')[5].find_all('a')[0].get_text()
    
    hs_state = soup.find('div', attrs={'id':'info'}).find_all('p')[5].find_all('a')[1].get_text()
    
    dic = {}
    
    dic['Name'] = name
    dic['HS'] = hs
    dic['State'] = hs_state
    
    highSchools.append(dic)

In [31]:
PlayerHS = pd.DataFrame(highSchools)

In [32]:
PlayerHS

Unnamed: 0,Name,HS,State
0,Trevor Lawrence,Cartersville,GA
1,Zach Wilson,Corner Canyon,UT
2,Trey Lance,Marshall,MN
3,Kyle Pitts,Archbishop Wood,PA
4,Ja'Marr Chase,Archbishop Rummel,LA
...,...,...,...
254,Kawaan Baker,Hapeville Charter,GA
255,Kylin Hill,Columbus,MS
256,Jermar Jefferson,Redondo Union,CA
257,Dax Milne,Bingham,UT


## 4. Merging
Finally, merge all the data

In [34]:
PlayerHS.head()

Unnamed: 0,Name,HS,State
0,Trevor Lawrence,Cartersville,GA
1,Zach Wilson,Corner Canyon,UT
2,Trey Lance,Marshall,MN
3,Kyle Pitts,Archbishop Wood,PA
4,Ja'Marr Chase,Archbishop Rummel,LA


In [35]:
All_High_Schools.head()

Unnamed: 0,High School,HS City,HS State
0,Woodlawn,Birmingham,AL
1,S.S. Murphy,Mobile,AL
2,C.F. Vigor,Prichard,AL
3,Anniston,Anniston,AL
4,Central,Phenix City,AL


In [45]:
df.head()

Unnamed: 0,Rnd,Pick,Tm,Player,College/Univ
0,1,1,JAX,Trevor Lawrence,Clemson
1,1,2,NYJ,Zach Wilson,BYU
2,1,3,SFO,Trey Lance,North Dakota St.
3,1,4,ATL,Kyle Pitts,Florida
4,1,5,CIN,Ja'Marr Chase,LSU


I'll go ahead and make common column names to make this easier

In [47]:
All_High_Schools = All_High_Schools.rename(columns = {'HS State': 'State', 'High School': 'HS'})

In [53]:
first_merge = PlayerHS.merge(All_High_Schools, 
               on = ['HS', 'State'], 
               how='left')

In [61]:
first_merge[first_merge.duplicated(['Name', 'State'])]

Unnamed: 0,Name,HS,State,HS City
3,Trey Lance,Marshall,MN,Marshall
18,Alex Leatherwood,Booker T. Washington,FL,Pensacola
36,Elijah Moore,St. Thomas Aquinas,FL,Deerfield Beach
49,Jackson Carman,Fairfield,OH,Fairfield
51,Asante Samuel Jr.,St. Thomas Aquinas,FL,Deerfield Beach
82,Josh Palmer,St. Thomas Aquinas,FL,Deerfield Beach
116,James Hudson,Central Catholic,OH,Canton
117,James Hudson,Central Catholic,OH,Steubenville
118,James Hudson,Central Catholic,OH,Cleveland
119,James Hudson,Central Catholic,OH,Lima


In [65]:
first_merge = first_merge.drop([2,17,36,48,51,82,116,117,
                 118,119,122,127,137,138,
                 139,146,156,158,162,163,
                 180,204,209,221,236,237,
                 238,264,266,268,279])

In [66]:
first_merge

Unnamed: 0,Name,HS,State,HS City
0,Trevor Lawrence,Cartersville,GA,Cartersville
1,Zach Wilson,Corner Canyon,UT,Draper
3,Trey Lance,Marshall,MN,Marshall
4,Kyle Pitts,Archbishop Wood,PA,Warminster
5,Ja'Marr Chase,Archbishop Rummel,LA,Metairie
...,...,...,...,...
285,Kawaan Baker,Hapeville Charter,GA,Union City
286,Kylin Hill,Columbus,MS,Columbus
287,Jermar Jefferson,Redondo Union,CA,Redondo Beach
288,Dax Milne,Bingham,UT,South Jordan


In [71]:
first_merge = first_merge.reset_index()

In [73]:
df = df.reset_index()

In [81]:
df = df.drop(columns=['index'])

In [83]:
final = df.join(first_merge, how='outer')

In [85]:
final = final.drop(columns=['index', 'Name'])

In [86]:
final

Unnamed: 0,Rnd,Pick,Tm,Player,College/Univ,HS,State,HS City
0,1,1,JAX,Trevor Lawrence,Clemson,Cartersville,GA,Cartersville
1,1,2,NYJ,Zach Wilson,BYU,Corner Canyon,UT,Draper
2,1,3,SFO,Trey Lance,North Dakota St.,Marshall,MN,Marshall
3,1,4,ATL,Kyle Pitts,Florida,Archbishop Wood,PA,Warminster
4,1,5,CIN,Ja'Marr Chase,LSU,Archbishop Rummel,LA,Metairie
...,...,...,...,...,...,...,...,...
254,7,255,NOR,Kawaan Baker,South Alabama,Hapeville Charter,GA,Union City
255,7,256,GNB,Kylin Hill,Mississippi St.,Columbus,MS,Columbus
256,7,257,DET,Jermar Jefferson,Oregon St.,Redondo Union,CA,Redondo Beach
257,7,258,WAS,Dax Milne,BYU,Bingham,UT,South Jordan


In [87]:
final[final.isnull().any(axis=1)]

Unnamed: 0,Rnd,Pick,Tm,Player,College/Univ,HS,State,HS City
10,1,11,CHI,Justin Fields,Ohio St.,Harrison,GA,
73,3,74,WAS,Benjamin St-Juste,Minnesota,Montreal Cegep du Vieux,Canada,
125,4,126,CAR,Chuba Hubbard,Oklahoma St.,Bev Facey,Canada,


In [90]:
final = final.rename(columns={'HS City':'City'})

In [96]:
final['City'][10] = 'Kennesaw'

In [97]:
final['City'][10]

'Kennesaw'

In [98]:
final['City'][125] = 'Sherwood Park'

In [99]:
final['City'][73] = 'Montreal'

In [100]:
final

Unnamed: 0,Rnd,Pick,Tm,Player,College/Univ,HS,State,City
0,1,1,JAX,Trevor Lawrence,Clemson,Cartersville,GA,Cartersville
1,1,2,NYJ,Zach Wilson,BYU,Corner Canyon,UT,Draper
2,1,3,SFO,Trey Lance,North Dakota St.,Marshall,MN,Marshall
3,1,4,ATL,Kyle Pitts,Florida,Archbishop Wood,PA,Warminster
4,1,5,CIN,Ja'Marr Chase,LSU,Archbishop Rummel,LA,Metairie
...,...,...,...,...,...,...,...,...
254,7,255,NOR,Kawaan Baker,South Alabama,Hapeville Charter,GA,Union City
255,7,256,GNB,Kylin Hill,Mississippi St.,Columbus,MS,Columbus
256,7,257,DET,Jermar Jefferson,Oregon St.,Redondo Union,CA,Redondo Beach
257,7,258,WAS,Dax Milne,BYU,Bingham,UT,South Jordan


In [101]:
final.to_csv('NFL_draft.csv')