In [1]:
import re
import urllib
from time import sleep
from bs4 import BeautifulSoup
import pandas as pd
from PlayerBioScraper import PlayerBioScraper

source: https://github.com/erilu/web-scraping-NBA-statistics/blob/master/scrape-nba-statistics-notebook.ipynb

## Get Team URLS


In [2]:
def build_team_urls():
    # Define the URL of the ESPN NFL teams webpage.
    espn_teams_url = 'https://www.espn.com/nfl/teams'
    
    # Open the ESPN teams webpage and extract the names of each roster available.
    with urllib.request.urlopen(espn_teams_url) as f:
        teams_source = f.read().decode('utf-8')
    
    # Use regular expressions to find team names and their corresponding URLs.
    teams_data = re.findall(r"www\.espn\.com/nfl/team/_/name/(\w+)/(.+?)\",", teams_source)
    
    # Create a dictionary to store team names and their corresponding roster URLs.
    team_urls = {}
    
    for team_id, team_name in teams_data:
        # Construct the roster URL for each team.
        roster_url = f'https://www.espn.com/nfl/team/roster/_/name/{team_id}/{team_name}'
        
        # Store the team name and roster URL in the dictionary.
        team_urls[team_name] = roster_url
    
    return team_urls

rosters = build_team_urls()
rosters

## Get Dict of Players and Links

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService

# Specify the path to chromedriver.exe on your system
chrome_path = 'chromedriver.exe'  # Replace with the actual path to chromedriver.exe

# Initialize ChromeService with the executable path
chrome_service = ChromeService(executable_path=chrome_path)

# Initialize Chrome WebDriver with the service
driver = webdriver.Chrome(service=chrome_service)

# Open the URL
url = "https://www.espn.com/nfl/team/roster/_/name/den/denver-broncos"
driver.get(url)

# Wait for JavaScript to load content (you may need to adjust the wait time)
driver.implicitly_wait(10)

# Extract the page source after JavaScript has loaded
roster_source = driver.page_source

soup = BeautifulSoup(roster_source, 'html.parser')

# Close the Selenium WebDriver
driver.quit()

In [4]:
# Define the regex pattern for matching links
link_pattern = re.compile(r'https?://www\.espn\.com/nfl/player/_/id/\d+/[^\s"]+')

# Find all 'a' elements in the HTML
a_elements = soup.find_all('a')

# Create a dictionary to store player names as keys and their corresponding links as values
player_links_dict = {}

# Extract and store player names and links that match the pattern
for a_element in a_elements:
    href = a_element.get('href')
    player_name = a_element.get_text()
    if href and re.match(link_pattern, href):
        # Only add entries to the dictionary if both player name and href are not empty
        if player_name.strip() and href.strip():
            player_links_dict[player_name] = href

# Print the dictionary
for player, link in player_links_dict.items():
    print(f'{player}: {link}')

Ben DiNucci: http://www.espn.com/nfl/player/_/id/3895785/ben-dinucci
Jarrett Stidham: http://www.espn.com/nfl/player/_/id/3892775/jarrett-stidham
Russell Wilson: http://www.espn.com/nfl/player/_/id/14881/russell-wilson
Tyler Badie: http://www.espn.com/nfl/player/_/id/4362748/tyler-badie
Jaleel McLaughlin: http://www.espn.com/nfl/player/_/id/4722893/jaleel-mclaughlin
Samaje Perine: http://www.espn.com/nfl/player/_/id/3116389/samaje-perine
Dwayne Washington: http://www.espn.com/nfl/player/_/id/3002265/dwayne-washington
Javonte Williams: http://www.espn.com/nfl/player/_/id/4361579/javonte-williams
Michael Burton: http://www.espn.com/nfl/player/_/id/2515270/michael-burton
Michael Bandy: http://www.espn.com/nfl/player/_/id/4034704/michael-bandy
Phillip Dorsett: http://www.espn.com/nfl/player/_/id/2579604/phillip-dorsett
Lil'Jordan Humphrey: http://www.espn.com/nfl/player/_/id/4039057/liljordan-humphrey
Jerry Jeudy: http://www.espn.com/nfl/player/_/id/4241463/jerry-jeudy
Brandon Johnson: htt

## Get Player Bio Info

In [5]:
import re
import urllib
from time import sleep
from bs4 import BeautifulSoup
import pandas as pd
from PlayerBioScraper import PlayerBioScraper

scraper = PlayerBioScraper()
scraper.scrape_player_info(player_links_dict)
#scraper.scrape_player_info({"Ben DiNucci": "http://www.espn.com/nfl/player/_/id/3895785/ben-dinucci"})
#scraper.scrape_player_info({"Ben DiNucci": "http://www.espn.com/nfl/player/_/id/3895785/ben-dinucci", "Dwayne Washington": "http://www.espn.com/nfl/player/_/id/3002265/dwayne-washington"})
scraper.export_to_csv()
player_bio = scraper.get_all_player_info_df()
player_bio

['2020', 'Rd 7', 'Pk 231', 'DAL)']
this is player info  {'Team': 'Broncos', 'Number': '#11', 'Position': 'QB', 'HT/WT': '6\' 2", 215 lbs', 'Birthdate': '11/24/1996 (26)', 'College': 'James Madison', 'Draft Info': '2020: Rd 7, Pk 231 (DAL)', 'Draft_Year': '2020', 'Draft_Round': '7', 'Draft_Pick_Overall': '231', 'Draft_Team': 'DAL'}
['2019', 'Rd 4', 'Pk 133', 'NE)']
this is player info  {'Team': 'Broncos', 'Number': '#4', 'Position': 'QB', 'HT/WT': '6\' 3", 215 lbs', 'Birthdate': '8/8/1996 (27)', 'College': 'Auburn', 'Draft Info': '2019: Rd 4, Pk 133 (NE)', 'Draft_Year': '2019', 'Draft_Round': '4', 'Draft_Pick_Overall': '133', 'Draft_Team': 'NE'}
['2012', 'Rd 3', 'Pk 75', 'SEA)']
this is player info  {'Team': 'Broncos', 'Number': '#3', 'Position': 'QB', 'HT/WT': '5\' 11", 215 lbs', 'Birthdate': '11/29/1988 (34)', 'College': 'Wisconsin', 'Draft Info': '2012: Rd 3, Pk 75 (SEA)', 'Draft_Year': '2012', 'Draft_Round': '3', 'Draft_Pick_Overall': '75', 'Draft_Team': 'SEA'}
['2022', 'Rd 6', 'Pk 

Unnamed: 0,Team,Number,Position,Birthdate,College,Draft_Year,Draft_Round,Draft_Pick_Overall,Draft_Team,Height,Weight,Age
Ben DiNucci,Broncos,#11,QB,11/24/1996,James Madison,2020,7,231,DAL,6' 2,215 lbs,26
Jarrett Stidham,Broncos,#4,QB,8/8/1996,Auburn,2019,4,133,NE,6' 3,215 lbs,27
Russell Wilson,Broncos,#3,QB,11/29/1988,Wisconsin,2012,3,75,SEA,5' 11,215 lbs,34
Tyler Badie,Broncos,#36,RB,2/7/2000,Missouri,2022,6,196,BAL,5' 8,197 lbs,23
Jaleel McLaughlin,Broncos,#38,RB,9/13/2000,Youngstown St,undrafted,undrafted,undrafted,undrafted,5' 7,187 lbs,23
...,...,...,...,...,...,...,...,...,...,...,...,...
Caden Sterns,Broncos,#30,S,11/2/1999,Texas,2021,5,152,DEN,6' 1,207 lbs,23
Delarrin Turner-Yell,Broncos,#32,S,12/16/1999,Oklahoma,2022,5,152,DEN,5' 11,200 lbs,23
Wil Lutz,Broncos,#16,PK,7/7/1994,Georgia State,undrafted,undrafted,undrafted,undrafted,5' 11,184 lbs,29
Riley Dixon,Broncos,#9,P,8/24/1993,Syracuse,2016,7,228,DEN,6' 4,221 lbs,30


In [144]:
player_bio

Unnamed: 0,Team,Number,Position,HT/WT,Birthdate,College,Draft Info
Ben DiNucci,Denver Broncos,#11,Quarterback,"6' 2"", 215 lbs",11/24/1996 (26),James Madison,"2020: Rd 7, Pk 231 (DAL)"
Jarrett Stidham,Denver Broncos,#4,Quarterback,"6' 3"", 215 lbs",8/8/1996 (27),Auburn,"2019: Rd 4, Pk 133 (NE)"
Russell Wilson,Denver Broncos,#3,Quarterback,"5' 11"", 215 lbs",11/29/1988 (34),Wisconsin,"2012: Rd 3, Pk 75 (SEA)"
Tyler Badie,Denver Broncos,#36,Running Back,"5' 8"", 197 lbs",2/7/2000 (23),Missouri,"2022: Rd 6, Pk 196 (BAL)"
Jaleel McLaughlin,Denver Broncos,#38,Running Back,"5' 7"", 187 lbs",9/13/2000 (23),Youngstown St,Not Available
...,...,...,...,...,...,...,...
Caden Sterns,Denver Broncos,#30,Safety,"6' 1"", 207 lbs",11/2/1999 (23),Texas,"2021: Rd 5, Pk 152 (DEN)"
Delarrin Turner-Yell,Denver Broncos,#32,Safety,"5' 11"", 200 lbs",12/16/1999 (23),Oklahoma,"2022: Rd 5, Pk 152 (DEN)"
Wil Lutz,Denver Broncos,#16,Place Kicker,"5' 11"", 184 lbs",7/7/1994 (29),Georgia State,Not Available
Riley Dixon,Denver Broncos,#9,Punter,"6' 4"", 221 lbs",8/24/1993 (30),Syracuse,"2016: Rd 7, Pk 228 (DEN)"


predict draft position, based on position, height, college (as education)

RBS and WRS gleiche stats

In [11]:


player_splits_link = "https://www.espn.com/nfl/player/splits/_/id/3116389/samaje-perine"

# Specify the path to chromedriver.exe on your system
chrome_path = 'chromedriver.exe'  # Replace with the actual path to chromedriver.exe

# Initialize ChromeService with the executable path
chrome_service = ChromeService(executable_path=chrome_path)

# Initialize Chrome WebDriver with the service
driver = webdriver.Chrome(service=chrome_service)

# Open the URL
driver.get(player_splits_link)

# Wait for JavaScript to load content (you may need to adjust the wait time)
driver.implicitly_wait(10)

# Extract the page source after JavaScript has loaded
player_splits_page = driver.page_source

# Now you can use BeautifulSoup to parse the dynamically loaded content
soup = BeautifulSoup(player_splits_page, 'html.parser')



In [12]:
print(soup)

<html data-react-helmet="lang" lang="en"><head><script src="//javascript:;"></script><script src="https://secure.espn.com/js/dcf/tags/vision/latest/vision-videojs.js"></script><script src="//javascript:;"></script><script async="" src="//javascript:;"></script><script async="" src="//javascript:;"></script><script async="" src="//javascript:;"></script><script async="" src="https://dcf.espn.com/TWDC-DTCI/prod/code/b07dcd916055909c90cdc631ee66e80b.js?conditionId0=4905157" type="text/javascript"></script><script async="" src="https://dcf.espn.com/TWDC-DTCI/prod/code/0b62e6477d4c5febd5238c72e8724d4a.js?conditionId0=4924177" type="text/javascript"></script><script async="" src="https://dcf.espn.com/TWDC-DTCI/prod/code/b2dff6cf1a5442f2eb32b8b4f227d819.js?conditionId0=4872138" type="text/javascript"></script><script async="" src="https://dcf.espn.com/TWDC-DTCI/prod/code/ec168911d3f05ac4f0c654256f91146b.js?conditionId0=4884623" type="text/javascript"></script><script async="" src="https://dcf

In [13]:
table = soup.find('div', class_='ResponsiveTable ResponsiveTable--fixed-left player-splits-table')
print(table)

<div class="ResponsiveTable ResponsiveTable--fixed-left player-splits-table"><div class="flex"><table class="Table Table--align-right Table--fixed Table--fixed-left" style="border-collapse: collapse; border-spacing: 0px;"><colgroup class="Table__Colgroup"><col class="Table__Column"/></colgroup><thead class="Table__header-group Table__THEAD"><tr class="Table__TR Table__even" style="height: auto;"><th class="Table__TH" colspan="1" title=""></th></tr></thead><tbody class="Table__TBODY"><tr class="subgroup-headers Table__TR Table__TR--sm Table__even" data-idx="0" style="height: auto;"><td class="fw-medium Table__TD"><span class="fw-medium">split</span></td></tr><tr class="Table__TR Table__TR--sm Table__even" data-idx="1" style="height: auto;"><td class="Table__TD">All Splits</td></tr><tr class="Table__TR Table__TR--sm Table__even" data-idx="2" style="height: auto;"><td class="Table__TD">Home</td></tr><tr class="Table__TR Table__TR--sm Table__even" data-idx="3" style="height: auto;"><td cla

In [14]:
table = soup.find('div', class_='Table__Scroller')
print(table)

<div class="Table__Scroller"><table class="Table Table--align-right" style="border-collapse: collapse; border-spacing: 0px;"><colgroup class="Table__Colgroup"><col class="Table__Column"/><col class="Table__Column"/><col class="Table__Column"/><col class="Table__Column"/><col class="Table__Column"/></colgroup><colgroup class="Table__Colgroup"><col class="Table__Column"/><col class="Table__Column"/><col class="Table__Column"/><col class="Table__Column"/><col class="Table__Column"/></colgroup><colgroup class="Table__Colgroup"><col class="Table__Column"/><col class="Table__Column"/></colgroup><thead class="Table__header-group Table__THEAD"><tr class="Table__TR Table__even"><th class="tc Table__TH" colspan="5" title="">rushing</th><th class="tc Table__TH" colspan="5" title="">receiving</th><th class="tc Table__TH" colspan="2" title="">fumbles</th></tr></thead><tbody class="Table__TBODY"><tr class="subgroup-headers Table__TR Table__TR--sm Table__even" data-idx="0"><td class="fw-medium Table_

In [15]:
mydivs = soup.find_all("tbody", {"class":"Table__TBODY"})

rows = mydivs[0].find_all('tr')

for row in rows:
    print(row.text)
    

rows = mydivs[1].find_all('tr')

for row in rows:
    data = row.find_all('td')
    for d in data:
        print(d.text)

split
All Splits
Home
Away
Outcome
Wins/Ties
Losses
Victory Margin
0-7
Close
4th QTR, +/-7
Last 2 Min. Half
Season Games
1-8
Month
September
October
DAY
Sunday
Other
Surface
Grass
Location
Outdoors
Weather
81+ F
Depth Chart
As Starter
As Sub
Group
vs AFC
vs NFC
vs Div
Opponent
vs CHI
vs GB
vs KC
vs MIA
vs NYJ
vs OAK
vs WAS
Half
1st
2nd
Quarter
1st
2nd
3rd
4th
Down
1st
2nd
3rd
Pt Diff
Ahead
Behind
Tied
Field Position
Own 1-20
Own 21-50
Opp 49-20
Opp 19-Goal
CAR
YDS
AVG
TD
LNG
REC
YDS
AVG
TD
LNG
FUM
LST
26
98
3.8
0
12
20
215
10.8
0
29
2
2
17
77
4.5
0
12
14
161
11.5
0
29
1
1
9
21
2.3
0
5
6
54
9.0
0
15
1
1
CAR
YDS
AVG
TD
LNG
REC
YDS
AVG
TD
LNG
FUM
LST
8
22
2.8
0
5
5
54
10.8
0
19
0
0
18
76
4.2
0
12
15
161
10.7
0
29
2
2
CAR
YDS
AVG
TD
LNG
REC
YDS
AVG
TD
LNG
FUM
LST
8
22
2.8
0
5
5
54
10.8
0
19
0
0
CAR
YDS
AVG
TD
LNG
REC
YDS
AVG
TD
LNG
FUM
LST
4
17
4.3
0
12
1
4
4.0
0
4
0
0
1
-1
-1.0
0
-1
11
79
7.2
0
15
1
1
CAR
YDS
AVG
TD
LNG
REC
YDS
AVG
TD
LNG
FUM
LST
26
98
3.8
0
12
20
215
10.8
0
29
2
2
CAR
YD

In [16]:
mydivs = soup.find_all("tbody", {"class":"Table__TBODY"})


rows = mydivs[0].find_all('tr')
col_list = []
for row in rows:
    # Dictionary Keys
    col_list.append(row.text)

rows = mydivs[1].find_all('tr')

dict = {}
for index, row in enumerate(rows):
    data = row.find_all('td')
    # Dictionary Values (as list)
    temp_list = []
    for d in data:
        # Add these to list
        temp_list.append(d.text)
    dict[col_list[index]] = temp_list



print(dict)
dict

# Create a DataFrame
df = pd.DataFrame(dict)
df

{'split': ['CAR', 'YDS', 'AVG', 'TD', 'LNG', 'REC', 'YDS', 'AVG', 'TD', 'LNG', 'FUM', 'LST'], 'All Splits': ['26', '98', '3.8', '0', '12', '20', '215', '10.8', '0', '29', '2', '2'], 'Home': ['17', '77', '4.5', '0', '12', '14', '161', '11.5', '0', '29', '1', '1'], 'Away': ['9', '21', '2.3', '0', '5', '6', '54', '9.0', '0', '15', '1', '1'], 'Outcome': ['CAR', 'YDS', 'AVG', 'TD', 'LNG', 'REC', 'YDS', 'AVG', 'TD', 'LNG', 'FUM', 'LST'], 'Wins/Ties': ['8', '22', '2.8', '0', '5', '5', '54', '10.8', '0', '19', '0', '0'], 'Losses': ['18', '76', '4.2', '0', '12', '15', '161', '10.7', '0', '29', '2', '2'], 'Victory Margin': ['CAR', 'YDS', 'AVG', 'TD', 'LNG', 'REC', 'YDS', 'AVG', 'TD', 'LNG', 'FUM', 'LST'], '0-7': ['8', '22', '2.8', '0', '5', '5', '54', '10.8', '0', '19', '0', '0'], 'Close': ['CAR', 'YDS', 'AVG', 'TD', 'LNG', 'REC', 'YDS', 'AVG', 'TD', 'LNG', 'FUM', 'LST'], '4th QTR, +/-7': ['4', '17', '4.3', '0', '12', '1', '4', '4.0', '0', '4', '0', '0'], 'Last 2 Min. Half': ['1', '-1', '-1.0', 

Unnamed: 0,split,All Splits,Home,Away,Outcome,Wins/Ties,Losses,Victory Margin,0-7,Close,...,Down,Pt Diff,Ahead,Behind,Tied,Field Position,Own 1-20,Own 21-50,Opp 49-20,Opp 19-Goal
0,CAR,26.0,17.0,9.0,CAR,8.0,18.0,CAR,8.0,CAR,...,CAR,CAR,8.0,14.0,4.0,CAR,1.0,8.0,11.0,6.0
1,YDS,98.0,77.0,21.0,YDS,22.0,76.0,YDS,22.0,YDS,...,YDS,YDS,46.0,45.0,7.0,YDS,8.0,38.0,31.0,21.0
2,AVG,3.8,4.5,2.3,AVG,2.8,4.2,AVG,2.8,AVG,...,AVG,AVG,5.8,3.2,1.8,AVG,8.0,4.8,2.8,3.5
3,TD,0.0,0.0,0.0,TD,0.0,0.0,TD,0.0,TD,...,TD,TD,0.0,0.0,0.0,TD,0.0,0.0,0.0,0.0
4,LNG,12.0,12.0,5.0,LNG,5.0,12.0,LNG,5.0,LNG,...,LNG,LNG,12.0,10.0,4.0,LNG,8.0,11.0,12.0,8.0
5,REC,20.0,14.0,6.0,REC,5.0,15.0,REC,5.0,REC,...,REC,REC,5.0,13.0,2.0,REC,0.0,12.0,6.0,2.0
6,YDS,215.0,161.0,54.0,YDS,54.0,161.0,YDS,54.0,YDS,...,YDS,YDS,76.0,124.0,15.0,YDS,0.0,129.0,74.0,12.0
7,AVG,10.8,11.5,9.0,AVG,10.8,10.7,AVG,10.8,AVG,...,AVG,AVG,15.2,9.5,7.5,AVG,0.0,10.8,12.3,6.0
8,TD,0.0,0.0,0.0,TD,0.0,0.0,TD,0.0,TD,...,TD,TD,0.0,0.0,0.0,TD,0.0,0.0,0.0,0.0
9,LNG,29.0,29.0,15.0,LNG,19.0,29.0,LNG,19.0,LNG,...,LNG,LNG,25.0,29.0,9.0,LNG,0.0,25.0,29.0,7.0


# RBs, WRs, FBs

In [17]:
import httplib2
from bs4 import BeautifulSoup
import requests
import pandas as pd


URL = "https://www.espn.com/nfl/player/splits/_/id/3116389/samaje-perine"

page = requests.get(URL)

http = httplib2.Http()
status, response = http.request(URL)

soup = BeautifulSoup(response, "html.parser")

status_code = status['status']
# We want to get a status code of '200'
print(status_code)

mydivs = soup.find_all("tbody", {"class":"Table__TBODY"})


rows = mydivs[0].find_all('tr')
col_list = []
for row in rows:
    # Dictionary Keys
    col_list.append(row.text)

rows = mydivs[1].find_all('tr')

dict = {}
for index, row in enumerate(rows):
    data = row.find_all('td')
    # Dictionary Values (as list)
    try:
        float_value = float(data[0].text)
        #if float_value.is_integer():
            #print("data[0] is an integer.")
        #else:
            #print("data[0] is a floating-point number.")

    except ValueError:
        #print("data[0] is not a valid number.")
        temp_col_name = col_list[index]
    temp_list = []
    for d in data:
        # Add these to list
        temp_list.append(d.text)
    temp_list.append(temp_col_name)
    dict[col_list[index]] = temp_list

df = pd.DataFrame.from_dict(dict)
df = df.T

index_list = df.index.tolist()

nested_list = df.iloc[:,-1:].values.tolist()
flat_list = [item for sublist in nested_list for item in sublist]


arrays = [
    flat_list,
    index_list
]

tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=["main", "sub"])
result_df = pd.DataFrame(df.iloc[:, :-1].values, index=index, columns=df.columns[:-1])

# Identify the unique 'main' categories
unique_main_categories = result_df.index.get_level_values('main').unique()

# Create a dictionary to store the tables
tables = {}

# Iterate through the unique 'main' categories and create tables
for main_category in unique_main_categories:
    sub_df = result_df.xs(main_category, level='main')
    
    # Set the first row as column names
    sub_df.columns = sub_df.iloc[0]
    
    # Drop the first row after setting it as column names
    sub_df = sub_df.iloc[1:]
    
    tables[main_category] = sub_df

# Iterate through the tables dictionary
for table_name, table_df in tables.items():
    # Access and print the current table
    print(f"Table: {table_name}")
    
    # Reset the index and rename the index level if needed
    table_df = table_df.reset_index()
    table_df = table_df.rename(columns={'sub': table_name})  # Rename the 'sub' column
    
    # Remove the name of the index
    table_df.index.name = None
    
    # Print the table with the updated column name
    print(table_df.to_string(index=False))
    
    print("\n")


200
Table: split
     split CAR YDS AVG TD LNG REC YDS  AVG TD LNG FUM LST
All Splits  26  98 3.8  0  12  20 215 10.8  0  29   2   2
      Home  17  77 4.5  0  12  14 161 11.5  0  29   1   1
      Away   9  21 2.3  0   5   6  54  9.0  0  15   1   1


Table: Outcome
  Outcome CAR YDS AVG TD LNG REC YDS  AVG TD LNG FUM LST
Wins/Ties   8  22 2.8  0   5   5  54 10.8  0  19   0   0
   Losses  18  76 4.2  0  12  15 161 10.7  0  29   2   2


Table: Victory Margin
Victory Margin CAR YDS AVG TD LNG REC YDS  AVG TD LNG FUM LST
           0-7   8  22 2.8  0   5   5  54 10.8  0  19   0   0


Table: Close
           Close CAR YDS  AVG TD LNG REC YDS AVG TD LNG FUM LST
   4th QTR, +/-7   4  17  4.3  0  12   1   4 4.0  0   4   0   0
Last 2 Min. Half   1  -1 -1.0  0  -1  11  79 7.2  0  15   1   1


Table: Season Games
Season Games CAR YDS AVG TD LNG REC YDS  AVG TD LNG FUM LST
         1-8  26  98 3.8  0  12  20 215 10.8  0  29   2   2


Table: Month
    Month CAR YDS AVG TD LNG REC YDS  AVG TD LNG FU

In [112]:
import httplib2
from bs4 import BeautifulSoup
import requests
import pandas as pd


URL = "https://www.espn.com/nfl/player/splits/_/id/4242358/matt-henningsen"

page = requests.get(URL)

http = httplib2.Http()
status, response = http.request(URL)

soup = BeautifulSoup(response, "html.parser")

status_code = status['status']
# We want to get a status code of '200'
print(status_code)

mydivs = soup.find_all("tbody", {"class":"Table__TBODY"})


rows = mydivs[0].find_all('tr')
col_list = []
for row in rows:
    # Dictionary Keys
    col_list.append(row.text)

rows = mydivs[1].find_all('tr')

dict = {}
for index, row in enumerate(rows):
    data = row.find_all('td')
    # Dictionary Values (as list)
    try:
        float_value = float(data[0].text)
        #if float_value.is_integer():
            #print("data[0] is an integer.")
        #else:
            #print("data[0] is a floating-point number.")

    except ValueError:
        #print("data[0] is not a valid number.")
        temp_col_name = col_list[index]
    temp_list = []
    for d in data:
        # Add these to list
        temp_list.append(d.text)
    temp_list.append(temp_col_name)
    dict[col_list[index]] = temp_list

df = pd.DataFrame.from_dict(dict)
df = df.T

index_list = df.index.tolist()

nested_list = df.iloc[:,-1:].values.tolist()
flat_list = [item for sublist in nested_list for item in sublist]


arrays = [
    flat_list,
    index_list
]

tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=["main", "sub"])
result_df = pd.DataFrame(df.iloc[:, :-1].values, index=index, columns=df.columns[:-1])

# Identify the unique 'main' categories
unique_main_categories = result_df.index.get_level_values('main').unique()

# Create a dictionary to store the tables
tables = {}

# Iterate through the unique 'main' categories and create tables
for main_category in unique_main_categories:
    sub_df = result_df.xs(main_category, level='main')
    
    # Set the first row as column names
    sub_df.columns = sub_df.iloc[0]
    
    # Drop the first row after setting it as column names
    sub_df = sub_df.iloc[1:]
    
    tables[main_category] = sub_df

# Iterate through the tables dictionary
for table_name, table_df in tables.items():
    # Access and print the current table
    print(f"Table: {table_name}")
    
    # Reset the index and rename the index level if needed
    table_df = table_df.reset_index()
    table_df = table_df.rename(columns={'sub': table_name})  # Rename the 'sub' column
    
    # Remove the name of the index
    table_df.index.name = None
    
    # Print the table with the updated column name
    print(table_df.to_string(index=False))
    
    print("\n")


ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'Eine vorhandene Verbindung wurde vom Remotehost geschlossen', None, 10054, None))

In [110]:
import httplib2
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Function to process player data from a given URL
def process_player_data(player_url):
    page = requests.get(player_url)

    http = httplib2.Http()
    status, response = http.request(player_url)

    soup = BeautifulSoup(response, "html.parser")

    status_code = status['status']
    # We want to get a status code of '200'
    print(status_code)

    mydivs = soup.find_all("tbody", {"class": "Table__TBODY"})

    rows = mydivs[0].find_all('tr')
    col_list = []
    for row in rows:
        # Dictionary Keys
        col_list.append(row.text)

    try:
        rows = mydivs[1].find_all('tr')
    except IndexError:
        print(f"No data found for {player_url}")
        return  # Skip processing if no data rows are found

    data_dict = {}
    for index, row in enumerate(rows):
        data = row.find_all('td')
        # Dictionary Values (as list)
        try:
            float_value = float(data[0].text)
            # if float_value.is_integer():
            # print("data[0] is an integer.")
            # else:
            # print("data[0] is a floating-point number.")

        except ValueError:
            # print("data[0] is not a valid number.")
            temp_col_name = col_list[index]
        temp_list = []
        for d in data:
            # Add these to list
            temp_list.append(d.text)
        temp_list.append(temp_col_name)
        data_dict[col_list[index]] = temp_list

    df = pd.DataFrame.from_dict(data_dict)
    df = df.T

    index_list = df.index.tolist()

    nested_list = df.iloc[:, -1:].values.tolist()
    flat_list = [item for sublist in nested_list for item in sublist]

    arrays = [
        flat_list,
        index_list
    ]

    tuples = list(zip(*arrays))
    index = pd.MultiIndex.from_tuples(tuples, names=["main", "sub"])
    result_df = pd.DataFrame(df.iloc[:, :-1].values, index=index, columns=df.columns[:-1])

    # Identify the unique 'main' categories
    unique_main_categories = result_df.index.get_level_values('main').unique()

    # Create a dictionary to store the tables
    tables = {}

    # Iterate through the unique 'main' categories and create tables
    for main_category in unique_main_categories:
        sub_df = result_df.xs(main_category, level='main')

        # Set the first row as column names
        sub_df.columns = sub_df.iloc[0]

        # Drop the first row after setting it as column names
        sub_df = sub_df.iloc[1:]

        tables[main_category] = sub_df

    # Iterate through the tables dictionary and print the tables
    for table_name, table_df in tables.items():
        print(f"Table for {player_url}: {table_name}")
        table_df = table_df.reset_index()
        table_df = table_df.rename(columns={'sub': table_name})  # Rename the 'sub' column
        table_df.index.name = None
        print(table_df.to_string(index=False))
        print("\n")

# List of player URLs to process
player_urls = [
    "https://www.espn.com/nfl/player/splits/_/id/2612151/alex-singleton",
    "https://www.espn.com/nfl/player/splits/_/id/3919548/justin-strnad",
    "https://www.espn.com/nfl/player/splits/_/id/4037216/essang-bassey",
    "https://www.espn.com/nfl/player/splits/_/id/4689674/art-green",
    "https://www.espn.com/nfl/player/splits/_/id/4240707/damarri-mathis",
    "https://www.espn.com/nfl/player/splits/_/id/4567462/jaquan-mcmillian",
    "https://www.espn.com/nfl/player/splits/_/id/2971586/fabian-moreau",
    "https://www.espn.com/nfl/player/splits/_/id/4382401/riley-moss",
    "https://www.espn.com/nfl/player/splits/_/id/3134448/tremon-smith"
]

process_player_data()

# Process each player's data
# for player_url in player_urls:
#     process_player_data(player_url)


200
Table for https://www.espn.com/nfl/player/splits/_/id/2612151/alex-singleton: split
     split TOT SOLO AST SACK STF STFYDS FF FR KB INT YDS AVG TD LNG PD
All Splits  66   39  27    0   6      3  0  0  0   0   0 0.0  0   0  1
      Home  33   18  15    0   3      2  0  0  0   0   0 0.0  0   0  1
      Away  33   21  12    0   3      1  0  0  0   0   0 0.0  0   0  0


Table for https://www.espn.com/nfl/player/splits/_/id/2612151/alex-singleton: Outcome
  Outcome TOT SOLO AST SACK STF STFYDS FF FR KB INT YDS AVG TD LNG PD
Wins/Ties  23   10  13    0   3      2  0  0  0   0   0 0.0  0   0  0
   Losses  43   29  14    0   3      1  0  0  0   0   0 0.0  0   0  1


Table for https://www.espn.com/nfl/player/splits/_/id/2612151/alex-singleton: Victory Margin
Victory Margin TOT SOLO AST SACK STF STFYDS FF FR KB INT YDS AVG TD LNG PD
           0-7  23   10  13    0   3      2  0  0  0   0   0 0.0  0   0  0


Table for https://www.espn.com/nfl/player/splits/_/id/2612151/alex-singleton: Close

ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'Eine vorhandene Verbindung wurde vom Remotehost geschlossen', None, 10054, None))

In [111]:
import httplib2
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os

# Function to sanitize player names for folder or file names
def sanitize_name(name):
    # Replace special characters with underscores
    return ''.join(c if c.isalnum() or c.isspace() else '_' for c in name)

# Function to process player data from a given URL
def process_player_data(player_name, player_url, base_folder):
    # Sanitize player name for folder creation
    sanitized_player_name = sanitize_name(player_name)
    
    # Create a folder with the player's name if it doesn't exist
    player_folder = os.path.join(base_folder, sanitized_player_name)
    if not os.path.exists(player_folder):
        os.makedirs(player_folder)
    
    # Construct the splits URL using the player's URL
    splits_url = player_url.replace("/player/", "/player/splits/")
    
    page = requests.get(splits_url)

    http = httplib2.Http()
    status, response = http.request(splits_url)

    soup = BeautifulSoup(response, "html.parser")

    status_code = status['status']
    # We want to get a status code of '200'
    print(status_code)

    mydivs = soup.find_all("tbody", {"class": "Table__TBODY"})

    rows = mydivs[0].find_all('tr')
    col_list = []
    for row in rows:
        # Dictionary Keys
        col_list.append(row.text)

    try:
        rows = mydivs[1].find_all('tr')
    except IndexError:
        print(f"No data found for {player_name} ({splits_url})")
        return  # Skip processing if no data rows are found

    data_dict = {}
    for index, row in enumerate(rows):
        data = row.find_all('td')
        
        # Check if there are enough columns in the row to process
        if len(data) < 1:
            continue  # Skip rows with insufficient data
        
        # Dictionary Values (as list)
        try:
            float_value = float(data[0].text)
            # if float_value.is_integer():
            # print("data[0] is an integer.")
            # else:
            # print("data[0] is a floating-point number.")

        except ValueError:
            # print("data[0] is not a valid number.")
            temp_col_name = col_list[index]
        temp_list = []
        for d in data:
            # Add these to list
            temp_list.append(d.text)
        temp_list.append(temp_col_name)
        data_dict[col_list[index]] = temp_list

    if not data_dict:
        print(f"No valid data found for {player_name} ({splits_url})")
        return  # Skip processing if no valid data is found

    df = pd.DataFrame.from_dict(data_dict)
    df = df.T

    index_list = df.index.tolist()

    nested_list = df.iloc[:, -1:].values.tolist()
    flat_list = [item for sublist in nested_list for item in sublist]

    arrays = [
        flat_list,
        index_list
    ]

    tuples = list(zip(*arrays))
    index = pd.MultiIndex.from_tuples(tuples, names=["main", "sub"])
    result_df = pd.DataFrame(df.iloc[:, :-1].values, index=index, columns=df.columns[:-1])

    # Identify the unique 'main' categories
    unique_main_categories = result_df.index.get_level_values('main').unique()

    # Create a dictionary to store the tables
    tables = {}

    # Iterate through the unique 'main' categories and create tables
    for main_category in unique_main_categories:
        sub_df = result_df.xs(main_category, level='main')

        # Set the first row as column names
        sub_df.columns = sub_df.iloc[0]

        # Drop the first row after setting it as column names
        sub_df = sub_df.iloc[1:]

        # Add a column with the player's name
        sub_df['Player Name'] = player_name

        tables[main_category] = sub_df

    # Iterate through the tables dictionary and save the tables in the player's folder
    for table_name, table_df in tables.items():
        # Sanitize the table name for the file name
        sanitized_table_name = sanitize_name(table_name)
        
        print(f"Saving table for {player_name}: {sanitized_table_name}")
        table_df = table_df.reset_index()
        table_df = table_df.rename(columns={'sub': sanitized_table_name})  # Rename the 'sub' column
        table_df.index.name = None
        table_df
        
        # Define the file path to save the table
        file_path = os.path.join(player_folder, f"{sanitized_table_name}.csv")
        
        # Save the table as a CSV file
        table_df.to_csv(file_path, index=False)
        print(f"Table saved at {file_path}")
        print("\n")

# # Dictionary of player names and their corresponding links
# player_links_dict = {
#     "Garett Bolles": "https://www.espn.com/nfl/player/_/id/4035662/garett-bolles",
#     "Cam Fleming": "https://www.espn.com/nfl/player/_/id/16932/cam-fleming",
#     "Demontrey Jacobs": "https://www.espn.com/nfl/player/_/id/4250935/demontrey-jacobs",
#     "Mike McGlinchey": "https://www.espn.com/nfl/player/_/id/3052885/mike-mcglinchey",
#     "Alex Palczewski": "https://www.espn.com/nfl/player/_/id/4240551/alex-palczewski",
#     "Will Sherman": "https://www.espn.com/nfl/player/_/id/4243186/will-sherman",
#     "Zach Allen": "https://www.espn.com/nfl/player/_/id/3915282/zach-allen",
#     "Matt Henningsen": "https://www.espn.com/nfl/player/_/id/4242358/matt-henningsen",
#     "Ronnie Perkins": "https://www.espn.com/nfl/player/_/id/4360274/ronnie-perkins",
#     "Elijah Garcia": "https://www.espn.com/nfl/player/_/id/4039170/elijah-garcia",
#     "Jonathan Harris": "https://www.espn.com/nfl/player/_/id/4422407/jonathan-harris",
#     "Ben DiNucci": "https://www.espn.com/nfl/player/_/id/3895785/ben-dinucci"  # Example for a player without splits
# }

# Base folder where player data will be saved
base_folder = "PlayerData"

# Process each player's data
for player_name, player_url in player_links_dict.items():
    process_player_data(player_name, player_url, base_folder)


KeyboardInterrupt: 

In [106]:
file_path

NameError: name 'file_path' is not defined

In [19]:
import httplib2
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

# Function to process player data from a given URL
def process_player_data(player_name, player_url):
    page = requests.get(player_url)

    http = httplib2.Http()
    status, response = http.request(player_url)

    soup = BeautifulSoup(response, "html.parser")

    status_code = status['status']
    # We want to get a status code of '200'
    print(status_code)

    mydivs = soup.find_all("tbody", {"class": "Table__TBODY"})

    rows = mydivs[0].find_all('tr')
    col_list = []
    for row in rows:
        # Dictionary Keys
        col_list.append(row.text)

    try:
        rows = mydivs[1].find_all('tr')
    except IndexError:
        print(f"No data found for {player_name} ({player_url})")
        return  # Skip processing if no data rows are found

    data_dict = {}
    current_table = None  # Variable to keep track of the current table
    for index, row in enumerate(rows):
        data = row.find_all('td')
        
        # Check if there are enough columns in the row to process
        if len(data) < 1:
            continue  # Skip rows with insufficient data
        
        # Check if the first column can be converted to integers
        try:
            int(data[0].text.split('-')[0])
            is_value = True
        except ValueError:
            is_value = False
        
        if is_value:
            # If it's a value, create a dictionary with column names as keys and data as values
            row_data = {col_name: data_element.text for col_name, data_element in zip(col_list, data)}

            # Convert the row_data dictionary to a DataFrame
            row_df = pd.DataFrame([row_data])

            # Append the row DataFrame to the current_table
            if current_table is None:
                current_table = row_df
            else:
                current_table = pd.concat([current_table, row_df], ignore_index=True)
        else:
            # If it's not a value, start a new table
            if current_table is not None:
                # Add a column with the player's name
                current_table['Player Name'] = player_name

                # Store the current table with a unique name
                table_name = f"Table {index}"
                data_dict[table_name] = current_table

            current_table = None

    # Store the last table if it exists
    if current_table is not None:
        # Add a column with the player's name
        current_table['Player Name'] = player_name

        # Store the current table with a unique name
        table_name = f"Table {len(data_dict)}"
        data_dict[table_name] = current_table

        print(data_dict)

    # Iterate through the tables dictionary and print the tables
    for table_name, table_df in data_dict.items():
        print(f"Table for {player_name}: {table_name}")
        table_df = table_df.reset_index(drop=True)
        print(table_df.to_string(index=False))
        print("\n")

        

# Dictionary of player names and their corresponding links
player_links_dict = {
    "Wil Lutz": "https://www.espn.com/nfl/player/splits/_/id/2985659/wil-lutz"
}

# Process each player's data
for player_name, player_url in player_links_dict.items():
    process_player_data(player_name, player_url)



ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'Eine vorhandene Verbindung wurde vom Remotehost geschlossen', None, 10054, None))

## Kicker Working

### Get Row Names

In [105]:
import httplib2
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

# Function to process player data from a given URL
page = requests.get("https://www.espn.com/nfl/player/splits/_/id/2985659/wil-lutz")

http = httplib2.Http()
status, response = http.request(player_url)

soup = BeautifulSoup(response, "html.parser")

status_code = status['status']
# We want to get a status code of '200'
print(status_code)

mydivs = soup.find_all("tbody", {"class": "Table__TBODY"})

rows = mydivs[0].find_all('tr')
col_list = []
for row in rows:
    # Dictionary Keys
    col_list.append(row.text)

try:
    rows = mydivs[1].find_all('tr')
except IndexError:
    print(f"No data found for {player_name} ({player_url})")

200


['split',
 'All Splits',
 'Home',
 'Away',
 'Outcome',
 'Wins/Ties',
 'Losses',
 'Victory Margin',
 '0-7',
 'Close',
 '4th QTR, +/-7',
 'Last 2 Min. Half',
 'Season Games',
 '1-8',
 'Month',
 'September',
 'October',
 'DAY',
 'Sunday',
 'Other',
 'Surface',
 'Grass',
 'Location',
 'Outdoors',
 'Weather',
 '81+ F',
 'Depth Chart',
 'As Sub',
 'Group',
 'vs AFC',
 'vs NFC',
 'vs Div',
 'Opponent',
 'vs CHI',
 'vs GB',
 'vs KC',
 'vs MIA',
 'vs NYJ',
 'vs OAK',
 'vs WAS',
 'Half',
 '1st',
 '2nd',
 'Quarter',
 '1st',
 '2nd',
 '3rd',
 '4th',
 'Down',
 '2nd',
 '4th',
 'Pt Diff',
 'Ahead',
 'Behind',
 'Tied',
 'Field Position',
 'Own 21-50',
 'Opp 49-20',
 'Opp 19-Goal',
 'Opp 10-Goal']

### Collect Data In One Big Table

In [None]:
data_dict = {}
current_table = None  # Variable to keep track of the current table
for index, row in enumerate(rows):
    data = row.find_all('td')
    
    # Check if there are enough columns in the row to process
    if len(data) < 1:
        continue  # Skip rows with insufficient data
    
    # Check if the first column can be converted to integers
    try:
        int(data[0].text.split('-')[0])
        is_value = True
    except ValueError:
        is_value = False
    
    if is_value:
        # If it's a value, create a dictionary with column names as keys and data as values
        row_data = {col_name: data_element.text for col_name, data_element in zip(col_list, data)}

        # Convert the row_data dictionary to a DataFrame
        row_df = pd.DataFrame([row_data])

        # Append the row DataFrame to the current_table
        if current_table is None:
            current_table = row_df
        else:
            current_table = pd.concat([current_table, row_df], ignore_index=True)
    else:
        # If it's not a value, start a new table
        if current_table is not None:
            # Add a column with the player's name
            current_table['Player Name'] = player_name

            # Store the current table with a unique name
            table_name = f"Table {index}"
            data_dict[table_name] = current_table

        current_table = None
current_table["names"] = col_list
current_table

cols = current_table.columns.tolist()
cols = ['names'] + [col for col in cols if col != 'names']
current_table = current_table[cols]
#current_table['names'] = [value.title() for value in current_table['names']]
#current_table

### Split Table based on Categories

In [104]:
# Get Ids of rows to split on
splits_categories_kicker = ["split", "Outcome", "Victory Margin", "Close", "Season Games", "Month", "DAY", "Surface", "Location", "Weather", "Depth Chart", "Group", "Opponent", "Half", "Quarter", "Down", "PT Diff", "Field Position"]
split_ids = current_table.loc[current_table['names'].isin(splits_categories_kicker)].index.to_numpy()

# Split the data into separate tables and name them according to categories
tables = {}
for i in range(len(split_ids) - 1):
    start_idx = split_ids[i]
    end_idx = split_ids[i + 1]
    table = current_table[start_idx:end_idx]
    
    # Set the first row as column names
    table.columns = table.iloc[0]
    
    # Remove the first row
    table = table.tail(-1)
    
    # Name the table based on the category
    category_name = splits_categories[i]
    tables[category_name] = table

# Add the last table
last_table = current_table[split_ids[-1]:]
last_table.columns = last_table.iloc[0]
last_table = last_table.tail(-1)

# Name the last table based on the last category
last_category_name = splits_categories[-1]
tables[last_category_name] = last_table

# Remove the index from column names for each table in the 'tables' dictionary
for category_name, table in tables.items():
    # Reset the index and drop the old index column
    table = table.reset_index(drop=True)
    # Update the table in the 'tables' dictionary
    tables[category_name] = table

    # Get the current column names
    current_columns = table.columns
    # Remove the name of the index
    current_columns.name = None
    # Update the table in the 'tables' dictionary with updated column names
    table.columns = current_columns

# To access a specific table, e.g., the table for "Outcome":
#pd.DataFrame(tables["Surface"])
