In [2]:
# importing dependencies 
import time
from io import StringIO
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


In [3]:
# setting up the Selenium WebDriver

# automatically downloads and installs the latest chromedriver
service = Service(ChromeDriverManager().install())

# configure chrome to run headless (no visible browser window)
options = webdriver.ChromeOptions()
options.add_argument("--headless")

# initialize the chrome webdriver with the specified service and options
driver = webdriver.Chrome(service=service, options=options)

# url of espn stats page to scrape
url = "https://www.espn.com/college-football/stats/team/_/view/defense"

try:
    driver.get(url)

    # fixed 5-second pause so the JavaScript-rendered table has time to load
    time.sleep(5)

    # get the page source (HTML) after JavaScript execution
    html = driver.page_source
finally:
    # always close the webdriver (IMPORTANT!!!) -- doing it in `finally` means
    # the headless chrome process is shut down even if loading the page fails
    driver.quit()

# use pandas to extract all tables from the html; the markup is wrapped in
# StringIO because passing a literal html string to read_html is deprecated
# and emits a FutureWarning
dfs = pd.read_html(StringIO(html))

# print the first few rows of every extracted table, labelled by its index
for i, df in enumerate(dfs):
    print(f"table {i}:")
    print(df.head())

# select the relevant table for the extracted data
df = dfs[0]

  dfs = pd.read_html(html)


table 0:
         Ohio State Buckeyes
0           Indiana Hoosiers
1            Texas Longhorns
2  Northern Illinois Huskies
3   Minnesota Golden Gophers
4       Tennessee Volunteers
table 1:
  Unnamed: 0_level_0 Total        Passing        Rushing        Points      
                  GP   YDS  YDS/G     YDS  YDS/G     YDS  YDS/G    PTS PTS/G
0                 16  4074  254.6    2672  167.0    1402   87.6    206  12.9
1                 13  3332  256.3    2289  176.1    1043   80.2    203  15.6
2                 16  4539  283.7    2781  173.8    1758  109.9    245  15.3
3                 13  3700  284.6    2242  172.5    1458  112.2    241  18.5
4                 13  3714  285.7    2289  176.1    1425  109.6    220  16.9


In [4]:
# table 0 is the single-column list of schools, table 1 is the numeric stats
schools_df, stats_df = dfs[0], dfs[1]

In [5]:
# read_html used the first school in the table as the column header, so the
# header label of column 0 is itself a school name; keep it for later
column_name = schools_df.columns[0]


In [22]:
# read_html treated the first school as the column header of the school
# table, so that name has to be put back as row 0.
#
# The list is rebuilt from the raw table (dfs[0]) instead of using
# DataFrame.shift(1): shift keeps the index length fixed, so moving every row
# down by one would push the last school off the end. Building from the raw
# table also makes this cell safe to re-run.
raw_schools = dfs[0]

# header label first, then every row of the only column
school_names = [raw_schools.columns[0], *raw_schools.iloc[:, 0]]

# single column named "School" with a fresh 0..n-1 index
schools_df = pd.DataFrame({'School': school_names})

# showing the new df
schools_df.head()

Unnamed: 0,School
0,Ohio State Buckeyes
1,Indiana Hoosiers
2,Texas Longhorns
3,Northern Illinois Huskies
4,Minnesota Golden Gophers


In [18]:
# flatten the two-level header that read_html produced (e.g. ('Total', 'YDS'))
# into single names such as 'Total_YDS'.
# The MultiIndex guard makes the cell safe to re-run: once the names are plain
# strings, joining them again would put an underscore between every character.
if isinstance(stats_df.columns, pd.MultiIndex):
    # work on a copy so the raw table in dfs[1] keeps its original header
    stats_df = stats_df.copy()
    stats_df.columns = ['_'.join(col) if col[0] else col[1] for col in stats_df.columns]
print(stats_df.head())

   Unnamed: 0_level_0_GP  Total_YDS  Total_YDS/G  Passing_YDS  Passing_YDS/G  \
0                     16       4074        254.6         2672          167.0   
1                     13       3332        256.3         2289          176.1   
2                     16       4539        283.7         2781          173.8   
3                     13       3700        284.6         2242          172.5   
4                     13       3714        285.7         2289          176.1   

   Rushing_YDS  Rushing_YDS/G  Points_PTS  Points_PTS/G  
0         1402           87.6         206          12.9  
1         1043           80.2         203          15.6  
2         1758          109.9         245          15.3  
3         1458          112.2         241          18.5  
4         1425          109.6         220          16.9  


In [19]:
# rich preview of the first five rows with the flattened column names
stats_df.head(n=5)

Unnamed: 0,Unnamed: 0_level_0_GP,Total_YDS,Total_YDS/G,Passing_YDS,Passing_YDS/G,Rushing_YDS,Rushing_YDS/G,Points_PTS,Points_PTS/G
0,16,4074,254.6,2672,167.0,1402,87.6,206,12.9
1,13,3332,256.3,2289,176.1,1043,80.2,203,15.6
2,16,4539,283.7,2781,173.8,1758,109.9,245,15.3
3,13,3700,284.6,2242,172.5,1458,112.2,241,18.5
4,13,3714,285.7,2289,176.1,1425,109.6,220,16.9


In [20]:
# the games-played column came out of the flattening step with a placeholder
# name; give it a readable one
stats_df = stats_df.rename({'Unnamed: 0_level_0_GP': 'Games_Played'}, axis='columns')
stats_df.head(n=5)

Unnamed: 0,Games_Played,Total_YDS,Total_YDS/G,Passing_YDS,Passing_YDS/G,Rushing_YDS,Rushing_YDS/G,Points_PTS,Points_PTS/G
0,16,4074,254.6,2672,167.0,1402,87.6,206,12.9
1,13,3332,256.3,2289,176.1,1043,80.2,203,15.6
2,16,4539,283.7,2781,173.8,1758,109.9,245,15.3
3,13,3700,284.6,2242,172.5,1458,112.2,241,18.5
4,13,3714,285.7,2289,176.1,1425,109.6,220,16.9


In [None]:
# inserting the school column as the first column of stats_df
# (DataFrame.insert raises ValueError when the column already exists, so the
# guard lets this cell be re-run without failing)
if 'School' not in stats_df.columns:
    stats_df.insert(0, 'School', schools_df['School'])

# preview only the first rows instead of dumping the whole table
stats_df.head()

In [28]:
# bind the finished table to the name used for the export; this is an alias of
# stats_df (same object), not a copy
cfb_def_2024_df = stats_df

# preview the first ten rows
cfb_def_2024_df.head(n=10)

Unnamed: 0,School,Games_Played,Total_YDS,Total_YDS/G,Passing_YDS,Passing_YDS/G,Rushing_YDS,Rushing_YDS/G,Points_PTS,Points_PTS/G
0,Ohio State Buckeyes,16,4074,254.6,2672,167.0,1402,87.6,206,12.9
1,Indiana Hoosiers,13,3332,256.3,2289,176.1,1043,80.2,203,15.6
2,Texas Longhorns,16,4539,283.7,2781,173.8,1758,109.9,245,15.3
3,Northern Illinois Huskies,13,3700,284.6,2242,172.5,1458,112.2,241,18.5
4,Minnesota Golden Gophers,13,3714,285.7,2289,176.1,1425,109.6,220,16.9
5,Tennessee Volunteers,13,3809,293.0,2461,189.3,1348,103.7,209,16.1
6,Penn State Nittany Lions,16,4716,294.8,3086,192.9,1630,101.9,264,16.5
7,Army Black Knights,14,4171,297.9,2603,185.9,1568,112.0,217,15.5
8,Ohio Bobcats,14,4264,304.6,2976,212.6,1288,92.0,254,18.1
9,Michigan Wolverines,13,3991,307.0,2812,216.3,1179,90.7,259,19.9


In [30]:
# write the table to csv without the index column; to_csv does not create
# missing folders, so make sure the output directory exists first
output_path = Path('resources/cfb_def_2024.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
cfb_def_2024_df.to_csv(output_path, index=False)