In [1]:
# importing dependencies
import time
from io import StringIO
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# setting up the Selenium WebDriver

# automatically downloads and installs the latest chromedriver
service = Service(ChromeDriverManager().install())

# configures chrome options
options = webdriver.ChromeOptions()
options.add_argument("--headless")

# url of espn stats page to scrape
url = "https://www.espn.com/college-football/stats/team"

# initialize the chrome webdriver with the specified service and options
driver = webdriver.Chrome(service=service, options=options)
try:
    driver.get(url)

    # telling scraper to wait 5 seconds to load table
    time.sleep(5)

    # get the page source (HTML) after JavaScript execution
    html = driver.page_source
finally:
    # close the webdriver (IMPORTANT!!!) -- done in a finally block so the
    # headless Chrome process is shut down even if loading the page raises
    driver.quit()

# use pandas to extract all tables from the html
# (wrapped in StringIO because passing a literal HTML string straight to
# read_html is deprecated and emits a FutureWarning)
dfs = pd.read_html(StringIO(html))

# print all extracted tables
# (loop variable is `table`, not `df`, so it does not shadow the frame below)
for i, table in enumerate(dfs):
    print(f"table {i}:") # print table index
    print(table.head()) # printing the first few rows of each table

# select the relevant table for the extracted data
df = dfs[0]
df.head()

  dfs = pd.read_html(html)


table 0:
         Miami Hurricanes
0         Ole Miss Rebels
1  North Texas Mean Green
2        New Mexico Lobos
3     Texas State Bobcats
4       Utah State Aggies
table 1:
  Unnamed: 0_level_0 Total        Passing        Rushing        Points      
                  GP   YDS  YDS/G     YDS  YDS/G     YDS  YDS/G    PTS PTS/G
0                 13  6983  537.2    4527  348.2    2456  188.9    571  43.9
1                 13  6846  526.6    4561  350.8    2285  175.8    502  38.6
2                 13  6355  488.8    4267  328.2    2088  160.6    436  33.5
3                 12  5811  484.3    2768  230.7    3043  253.6    402  33.5
4                 13  6200  476.9    3493  268.7    2707  208.2    475  36.5


Unnamed: 0,Miami Hurricanes
0,Ole Miss Rebels
1,North Texas Mean Green
2,New Mexico Lobos
3,Texas State Bobcats
4,Utah State Aggies


In [3]:
# table 0 holds the school names, table 1 holds the offense numbers
schools_df, stats_df = dfs[0], dfs[1]

In [4]:
# The school-name table has no real header row, so pandas promoted the first
# school to the column label. Rebuild the column with that label put back as
# row 0, followed by every original row.
#
# Two deliberate choices:
#   * start from dfs[0] instead of reassigning schools_df from itself, so
#     re-running this cell always gives the same result;
#   * prepend with concat instead of shift(1), because shift keeps the frame
#     length fixed and would push the last school off the end.
schools_raw = dfs[0]
first_school = schools_raw.columns[0]

# naming the new column "School"; ignore_index gives a clean 0..n index
schools_df = pd.concat(
    [pd.Series([first_school]), schools_raw.iloc[:, 0]],
    ignore_index=True,
).to_frame(name='School')

# showing the new df
schools_df.head()

Unnamed: 0,School
0,Miami Hurricanes
1,Ole Miss Rebels
2,North Texas Mean Green
3,New Mexico Lobos
4,Texas State Bobcats


In [5]:
# combining the names of the indexed columns
# The stats table has two header rows, which pandas reads as a MultiIndex of
# (group, stat) pairs; flatten each pair into "<group>_<stat>".
#
# The MultiIndex guard makes the cell safe to re-run: joining labels that are
# already flat strings would put "_" between every character. set_axis
# returns a new frame, so dfs[1] is not relabelled through the stats_df alias.
if isinstance(stats_df.columns, pd.MultiIndex):
    flat_labels = [
        '_'.join(levels) if levels[0] else levels[1]
        for levels in stats_df.columns
    ]
    stats_df = stats_df.set_axis(flat_labels, axis=1)
print(stats_df.head())

   Unnamed: 0_level_0_GP  Total_YDS  Total_YDS/G  Passing_YDS  Passing_YDS/G  \
0                     13       6983        537.2         4527          348.2   
1                     13       6846        526.6         4561          350.8   
2                     13       6355        488.8         4267          328.2   
3                     12       5811        484.3         2768          230.7   
4                     13       6200        476.9         3493          268.7   

   Rushing_YDS  Rushing_YDS/G  Points_PTS  Points_PTS/G  
0         2456          188.9         571          43.9  
1         2285          175.8         502          38.6  
2         2088          160.6         436          33.5  
3         3043          253.6         402          33.5  
4         2707          208.2         475          36.5  


In [6]:
# the games-played column came through the flattening step with the
# placeholder label 'Unnamed: 0_level_0_GP'; swap it for 'Games_Played'
stats_df = stats_df.rename(
    {'Unnamed: 0_level_0_GP': 'Games_Played'},
    axis='columns',
)
stats_df.head()

Unnamed: 0,Games_Played,Total_YDS,Total_YDS/G,Passing_YDS,Passing_YDS/G,Rushing_YDS,Rushing_YDS/G,Points_PTS,Points_PTS/G
0,13,6983,537.2,4527,348.2,2456,188.9,571,43.9
1,13,6846,526.6,4561,350.8,2285,175.8,502,38.6
2,13,6355,488.8,4267,328.2,2088,160.6,436,33.5
3,12,5811,484.3,2768,230.7,3043,253.6,402,33.5
4,13,6200,476.9,3493,268.7,2707,208.2,475,36.5


In [7]:
# inserting the 'School' column from the schools_df into the stats_df
# insert() modifies stats_df in place and raises ValueError when the column
# is already present, so skip it if this cell is run a second time
if 'School' not in stats_df.columns:
    stats_df.insert(0, 'School', schools_df['School'])
stats_df

Unnamed: 0,School,Games_Played,Total_YDS,Total_YDS/G,Passing_YDS,Passing_YDS/G,Rushing_YDS,Rushing_YDS/G,Points_PTS,Points_PTS/G
0,Miami Hurricanes,13,6983,537.2,4527,348.2,2456,188.9,571,43.9
1,Ole Miss Rebels,13,6846,526.6,4561,350.8,2285,175.8,502,38.6
2,North Texas Mean Green,13,6355,488.8,4267,328.2,2088,160.6,436,33.5
3,New Mexico Lobos,12,5811,484.3,2768,230.7,3043,253.6,402,33.5
4,Texas State Bobcats,13,6200,476.9,3493,268.7,2707,208.2,475,36.5
...,...,...,...,...,...,...,...,...,...,...
129,Northwestern Wildcats,12,3413,284.4,2231,185.9,1182,98.5,214,17.8
130,Southern Miss Golden Eagles,12,3406,283.8,2099,174.9,1307,108.9,183,15.3
131,Florida State Seminoles,12,3243,270.3,2164,180.3,1079,89.9,185,15.4
132,Kennesaw State Owls,12,3011,250.9,1636,136.3,1375,114.6,198,16.5


In [9]:
# final name for the combined offense table (same object as stats_df, not a copy)
cfb_off_2024_df = stats_df
# preview the top ten rows
cfb_off_2024_df.head(n=10)

Unnamed: 0,School,Games_Played,Total_YDS,Total_YDS/G,Passing_YDS,Passing_YDS/G,Rushing_YDS,Rushing_YDS/G,Points_PTS,Points_PTS/G
0,Miami Hurricanes,13,6983,537.2,4527,348.2,2456,188.9,571,43.9
1,Ole Miss Rebels,13,6846,526.6,4561,350.8,2285,175.8,502,38.6
2,North Texas Mean Green,13,6355,488.8,4267,328.2,2088,160.6,436,33.5
3,New Mexico Lobos,12,5811,484.3,2768,230.7,3043,253.6,402,33.5
4,Texas State Bobcats,13,6200,476.9,3493,268.7,2707,208.2,475,36.5
5,Utah State Aggies,12,5620,468.3,3229,269.1,2391,199.3,383,31.9
6,Syracuse Orange,13,6079,467.6,4810,370.0,1269,97.6,443,34.1
7,Boise State Broncos,14,6524,466.0,3159,225.6,3365,240.4,522,37.3
8,Texas Tech Red Raiders,13,6016,462.8,3857,296.7,2159,166.1,489,37.6
9,Arkansas Razorbacks,13,5970,459.2,3571,274.7,2399,184.5,402,30.9


In [10]:
# write the finished table to disk; create the output folder first so the
# export does not fail when 'resources/' is missing
output_path = Path('resources/cfb_off_2024.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
cfb_off_2024_df.to_csv(output_path, index=False)