In [281]:
import json
import re
from datetime import datetime
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [348]:
def get_html_content(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; defaults to 30 so a stalled
            connection cannot hang a long crawl forever.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.RequestException: On connection problems, on a
            timeout, or when the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors so an error page is never parsed as data.
    response.raise_for_status()
    return response.text

In [123]:
def split_on_dash(df, col):
    """Split a combined ``"Prefix: A - B"`` column into one column per field.

    The header ``col`` is expected to look like ``"Prefix: Field A - Field B"``
    and each value like ``"10 - 85"``. Header and values are both split on
    ``" - "`` so a hyphen inside a field name cannot desynchronise them.

    Args:
        df: DataFrame holding the column to split.
        col: Name of the column; must contain a ``":"`` between the prefix
            and the field list.

    Returns:
        A DataFrame indexed like ``df`` with one string column per field,
        named ``"<prefix>_<field>"`` (lower-cased, spaces inside a field
        replaced by underscores). Fields a value does not provide are NaN.

    Raises:
        ValueError: If ``col`` has no ``":"`` or a value contains more parts
            than the header names.
    """
    # Only the first ':' separates prefix from fields.
    col_prefix, separator, fields = col.lower().partition(':')
    if not separator:
        raise ValueError(f"Column {col!r} has no ':' between prefix and fields")

    fields_list = [field.strip().replace(' ', '_') for field in fields.split(' - ')]
    new_col_names = [f"{col_prefix}_{field}" for field in fields_list]

    split_cols = df[col].str.split(' - ', expand=True)
    if split_cols.shape[1] > len(new_col_names):
        raise ValueError(
            f"Column {col!r} names {len(new_col_names)} fields but its values "
            f"split into {split_cols.shape[1]} parts"
        )
    # Pad with NaN when every value is a bare placeholder (e.g. "-") so the
    # rename below cannot fail on a length mismatch.
    split_cols = split_cols.reindex(columns=range(len(new_col_names)))
    split_cols.columns = new_col_names
    return split_cols

In [174]:
def _normalize_stat_name(name):
    """Turn a statistics row label into the snake_case column name used in the CSV."""
    # NOTE(review): " / " becomes "per" with no underscores while a bare "/"
    # becomes "_per_". Kept as-is so existing column names do not change.
    return (
        name.lower()
        .replace(":", "")
        .replace(" / ", "per")
        .replace(" ", "_")
        .replace("/", "_per_")
        .replace("%", "perc")
        .replace("3rd", "third")
        .replace("4th", "fourth")
        .replace("2-", "two_")
    )


def _clean_stat_value(value, keep_text=False):
    """Convert one scraped cell: "-" -> missing, "45%" -> 0.45, "1,234" -> 1234.0.

    With ``keep_text=True`` the value is returned unchanged (used for time
    columns), except that placeholders become ``None`` and percentages are
    still converted.
    """
    if pd.isna(value) or value == '-':
        return None if keep_text else np.nan
    text = str(value)
    if '%' in text:
        return float(text.replace('%', '')) / 100
    if keep_text:
        return value
    return float(text.replace(',', ''))


def scrape_team_statistics(soup, year):
    """Extract the ``team-statistics`` table of a team page as a one-row frame.

    Args:
        soup: Parsed page; must offer ``find`` returning an element with
            ``prettify()`` (a BeautifulSoup document in this notebook).
        year: Season label stored in the ``year`` column.

    Returns:
        A single-row DataFrame with ``year``, ``team_name``, one column per
        team statistic, and the same statistics for the opposition prefixed
        with ``opponent_``. Percentages are fractions, other statistics are
        floats, and columns whose name contains ``time`` stay as text.

    Raises:
        ValueError: If the page has no ``team-statistics`` table.
    """
    table = soup.find("table", {"class": "team-statistics"})
    if table is None:
        raise ValueError(f"No 'team-statistics' table found on the page for {year}")

    # Wrap the markup in StringIO: passing literal HTML is deprecated in pandas.
    data = pd.read_html(StringIO(table.prettify()), flavor='bs4')[0]
    # Rows become columns: one row for the team, one for "Opponents".
    df = data.set_index(data.columns[0]).transpose()

    for col in [name for name in df.columns if ' - ' in name]:
        # Expanded fields go to the end, replacing the combined column.
        df = pd.concat([df.drop(columns=col), split_on_dash(df, col)], axis=1)

    df.columns = [_normalize_stat_name(name) for name in df.columns]

    is_opponent = df.index.str.lower() == 'opponents'
    team_df = (
        df[~is_opponent]
        .copy()
        .reset_index()
        .rename(columns={'index': 'team_name'})
        .set_index(pd.Index([year]))
    )
    opponents_df = df[is_opponent].copy().set_index(pd.Index([year]))
    opponents_df.columns = ['opponent_' + col for col in opponents_df.columns]
    df = (
        pd.concat([team_df, opponents_df], axis=1)
        .reset_index()
        .rename(columns={'index': 'year'})
    )

    # Element-wise cleaning is independent of the row count and of whatever
    # dtype pandas inferred for a column.
    for col in df.columns:
        if col in ('year', 'team_name'):
            continue
        df[col] = df[col].apply(_clean_stat_value, keep_text='time' in col)
    return df

In [175]:
def get_years(team_id, reference_year=2022):
    """Return the seasons listed for a team, in ascending order.

    Args:
        team_id: cfbstats.com team identifier used in the page URL.
        reference_year: Season whose team page is fetched to read the season
            list. Defaults to 2022, the value that used to be hard-coded.

    Returns:
        Sorted list of the integers found in the ``<li>`` items of the
        page's ``div#seasons`` element.

    Raises:
        ValueError: If the page has no ``div`` with id ``seasons``.
    """
    url = f"http://cfbstats.com/{reference_year}/team/{team_id}/index.html"
    soup = BeautifulSoup(get_html_content(url), "html.parser")
    seasons = soup.find('div', {'id': 'seasons'})
    if seasons is None:
        raise ValueError(f"No season list found at {url}")
    return sorted(int(item.text) for item in seasons.find_all('li'))

In [199]:
def get_teams_dict(path='cfb_team_ids.json'):
    """Load the team-name to team-id mapping from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to ``cfb_team_ids.json`` in
            the working directory, the path that used to be hard-coded.

    Returns:
        The decoded JSON content. Callers in this notebook iterate over it
        as ``{team_name: team_id}``.
    """
    # Explicit encoding: team names may contain non-ASCII characters and the
    # platform default is not guaranteed to be UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [190]:
# Scrape the statistics table of every team for every listed season.
# Partial results are written to CSV every fifth team so an interrupted
# crawl does not lose everything.
dataframes = []
teams_dict = get_teams_dict()
for i, (team_name, team_id) in enumerate(tqdm(teams_dict.items())):
    print(team_name, team_id)
    years = get_years(team_id)
    print(years)
    for year in years:
        url = f"http://cfbstats.com/{year}/team/{team_id}/index.html"
        # Log only the first and last season per team to keep the output short.
        if year in (years[0], years[-1]):
            print(f"{team_name}\t{year}\t{url}")
        html_content = get_html_content(url)
        soup = BeautifulSoup(html_content, "html.parser")
        df = scrape_team_statistics(soup, year)
        dataframes.append(df)
    # pd.concat raises on an empty list, so skip the checkpoint until at
    # least one season has been scraped.
    if i % 5 == 0 and dataframes:
        pd.concat(dataframes, ignore_index=True).to_csv('cfb_stats_team_statistics.csv', index=False)

# Combine the dataframes. The result is kept in `final_df`, which was
# previously referenced below without ever being assigned.
final_df = pd.concat(dataframes, ignore_index=True)
final_df.to_csv('cfb_stats_team_statistics.csv', index=False)
final_df.head()

In [191]:
# One-off helper cell used to build cfb_team_ids.json. The fetch and the file
# write are disabled; only the link scan below still runs.
# NOTE(review): with the fetch disabled, `soup` is whatever an earlier cell
# left in the kernel - re-enable the three lines below before trusting `results`.
# url = "http://cfbstats.com"
# html_content = get_html_content(url)
# soup = BeautifulSoup(html_content, "html.parser")
regex = r'team\/([0-9]+)\/index'
# Pair every link's text with the regex match on its href (None when the
# link does not point at a team page).
results = [
    (anchor.text, re.search(regex, anchor['href']))
    for anchor in soup.find_all('a', href=True)
]
# valid_results = {r[0]:r[1].group(1) for r in results if r[1] is not None}
# with open('cfb_team_ids.json', 'w') as f:
#     f.write(json.dumps(valid_results))

In [338]:
def _to_int_column(values):
    """Cast ``values`` to integers, using nullable ``Int64`` only if entries are missing."""
    numeric = pd.to_numeric(values)
    if numeric.isna().any():
        return numeric.astype("Int64")
    return numeric.astype(int)


def _parse_game_duration(text):
    """Parse an ``"H:MM"`` game length into a ``datetime.time``; missing stays ``None``."""
    if pd.isna(text):
        return None
    return datetime.strptime(text, "%H:%M").time()


def scrape_team_schedule(soup, year):
    """Extract the ``team-schedule`` table of a team page, one row per game.

    Args:
        soup: Parsed page; must offer ``find`` returning elements with
            ``prettify()`` / ``text`` (a BeautifulSoup document here).
        year: Season label stored in the ``year`` column.

    Returns:
        DataFrame with columns ``date``, ``opponent``, ``attendance``,
        ``home_away`` (``home`` / ``away`` / ``neutral_site``), ``win_lose``,
        ``points_scored``, ``opponent_points_scored``, ``game_duration``,
        ``game_duration_seconds``, ``year`` and ``team_name``. When the table
        holds a single row, one all-``None`` placeholder row with the same
        columns is returned instead.

    Raises:
        ValueError: If the schedule table or the team-name header is missing.
    """
    table = soup.find("table", {"class": "team-schedule"})
    if table is None:
        raise ValueError(f"No 'team-schedule' table found on the page for {year}")

    # Wrap the markup in StringIO: passing literal HTML is deprecated in pandas.
    df = pd.read_html(StringIO(table.prettify()), flavor="bs4")[0]

    if df.shape[0] == 1:
        # Same columns as the regular path so both results concatenate cleanly.
        return_cols = ['date', 'opponent', 'attendance', 'home_away', 'win_lose',
                       'points_scored', 'opponent_points_scored', 'game_duration',
                       'game_duration_seconds', 'year', 'team_name']
        return pd.DataFrame([dict.fromkeys(return_cols)], columns=return_cols)

    # Drop rows whose Date cell starts with "@". Copy so the column
    # assignments below act on an independent frame, not a view of the slice.
    df = df[df["Date"].str[0] != "@"].copy()
    df["Date"] = pd.to_datetime(df["Date"], format="%m/%d/%y")

    # The venue marker lives in the Opponent cell: "@" = away, "+" = neutral.
    # Literal matching (regex=False) is required because "+" is a regex operator.
    df["home_away"] = "home"
    df.loc[df["Opponent"].str.contains("@", regex=False, na=False), "home_away"] = "away"
    df.loc[df["Opponent"].str.contains("+", regex=False, na=False), "home_away"] = "neutral_site"
    df["Opponent"] = (
        df["Opponent"]
        .str.replace("@", "", regex=False)
        .str.replace("+", "", regex=False)
        .str.strip()
    )

    # Result is split on spaces: outcome first, "scored-conceded" second.
    split_results = df["Result"].str.split(" ", expand=True)
    df["win_lose"] = split_results.iloc[:, 0].str.strip()

    split_score = split_results.loc[:, 1].str.split("-", expand=True)
    df["points_scored"] = _to_int_column(split_score.iloc[:, 0])
    df["opponent_points_scored"] = _to_int_column(split_score.iloc[:, 1])

    durations = df["Game Time"].apply(_parse_game_duration)
    df["game_duration"] = durations
    df["game_duration_seconds"] = _to_int_column(
        durations.apply(
            lambda t: np.nan if pd.isna(t) else t.hour * 3600 + t.minute * 60
        )
    )

    df = df.drop(columns=["Result", "Game Time"])

    df["year"] = year
    team_name = soup.find("th", {"scope": "col", "class": "team-stat"})
    if team_name is None:
        raise ValueError(f"No team-name header found on the page for {year}")
    df["team_name"] = team_name.text

    df.columns = [x.lower().replace(" ", "_") for x in df.columns.tolist()]
    df['attendance'] = _to_int_column(df['attendance'])
    df['team_name'] = df['team_name'].astype(str)
    df['home_away'] = df['home_away'].astype(str)
    return df

In [383]:
def scrape_team_roster(soup, year, team_id):
    """Extract the ``team-roster`` table of a roster page, one row per player.

    Args:
        soup: Parsed roster page; must offer ``find`` / ``find_all`` (a
            BeautifulSoup document in this notebook).
        year: Season label stored in the ``year`` column.
        team_id: Team identifier stored in the ``team_id`` column.

    Returns:
        The roster table with lower-cased, underscore-separated column names
        plus ``year``, ``team_id`` and ``player_id``. ``player_id`` is the
        number taken from the player's link, or ``None`` when the name has
        no link with a matching href.

    Raises:
        ValueError: If the page has no ``team-roster`` table.
    """
    table = soup.find("table", {"class": "team-roster"})
    if table is None:
        raise ValueError(f"No 'team-roster' table found on the page for {year}")

    # Wrap the markup in StringIO: passing literal HTML is deprecated in pandas.
    df = pd.read_html(StringIO(table.prettify()), flavor='bs4')[0]
    df["year"] = year
    df["team_id"] = team_id

    # Map displayed name -> id from the player link. Whitespace is collapsed
    # on both sides so stray spaces or newlines cannot prevent a match.
    # NOTE(review): two players sharing a name on one roster get the same id
    # (the later link wins), as before.
    player_ids = {}
    for cell in soup.find_all('td', {'class': 'player-name'}):
        link = cell.find('a', href=True)
        if link is None:
            continue
        match = re.search(r'\/([0-9]+)\/index', link['href'])
        if match is not None:
            player_ids[' '.join(link.text.split())] = match.group(1)

    # .get instead of .setdefault: a lookup should not grow the mapping.
    df["player_id"] = df["Name"].apply(
        lambda name: player_ids.get(' '.join(str(name).split()))
    )

    df.columns = [x.lower().replace(' ', '_') for x in df.columns.tolist()]
    return df

In [384]:
# Fetch the 2020 roster page of team 721 to try out scrape_team_roster.
# (An earlier assignment pointing at team 164's index page was overwritten
# before use and has been dropped.)
url = "http://cfbstats.com/2020/team/721/roster.html"
html_content = get_html_content(url)
soup = BeautifulSoup(html_content, "html.parser")

In [385]:
# player_names = {x.find('a').text: re.search(r'\/([0-9]+)\/index', x.find('a')['href']).group(1) for x in soup.find_all('td', {'class': 'player-name'}) if x.find('a', href=True)}

In [386]:
# Parse the roster page currently held in `soup` and display the result.
# NOTE(review): the cell that builds `soup` requests the 2020 roster URL, yet
# the rows are labelled 2022 here - confirm which season is intended.
df = scrape_team_roster(soup, 2022, 721)
df

Unnamed: 0,no,name,pos,yr,ht,wt,hometown,last_school,year,team_id,player_id
0,39,"Anderson, Matthew",DB,SR,6-2,195,"Fort Wayne, IN",Homestead,2022,721,1106132
1,56,"Beasley, Britton",OL,SR,6-1,330,"Cordell, OK",Cordell,2022,721,
2,96,"Bein, Charles",P,FR,6-3,190,"Mission Viejo, CA",San Clemente,2022,721,1115598
3,38,"Bentley, Brendan",LS,SO,6-2,250,"Las Vegas, NV",Sierra Vista,2022,721,1115611
4,55,"Blake, Kupono",-,-,-,-,-,-,2022,721,1115616
...,...,...,...,...,...,...,...,...,...,...,...
88,19,"Wills II, Eric",-,-,-,-,-,-,2022,721,1115606
89,9W,"Wilson, Wyatt",-,-,-,-,-,-,2022,721,
90,62,"Wimmer, Hawk",OG,JR,6-4,310,"Franklin, WI",Franklin,2022,721,
91,96,"Woodring, Joey",NG,SR,5-11,275,"Katy, TX",Katy,2022,721,1097559


In [201]:
# Debug harness: fetch only the first listed season page of the first team.
# The two `break`s stop both loops after one request, leaving `soup` (and
# `url`, `year`, `team_name`, `team_id`, `years`) bound for interactive use.
dataframes = []
teams_dict = get_teams_dict()
i = 0
for team_name, team_id in tqdm(teams_dict.items()):
    print(team_name, team_id)
    years = get_years(team_id)
    print(years)
    for year in years:
        url = f"http://cfbstats.com/{year}/team/{team_id}/index.html"
        if year == years[0] or year == years[-1]:
            print(f"{team_name}\t{year}\t{url}")
        html_content = get_html_content(url)
        soup = BeautifulSoup(html_content, "html.parser")
        break
    break
    # Disabled full schedule crawl.
    # NOTE(review): it would write schedule rows to
    # 'cfb_stats_team_statistics.csv', the same file the statistics crawl
    # writes, and it reads `final_df`, which this cell never assigns.
    #     df = scrape_team_schedule(soup, year)
    #     dataframes.append(df)
    # if i % 5 ==0:
    #     pd.concat(dataframes, ignore_index=True).to_csv('cfb_stats_team_statistics.csv', index=False)
    # i += 1
# Combine the dataframes
# pd.concat(dataframes, ignore_index=True).to_csv('cfb_stats_team_statistics.csv', index=False)
# final_df.head()

  0%|                                                   | 0/131 [00:00<?, ?it/s]

Air Force 721
[2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
Air Force	2009	http://cfbstats.com/2009/team/721/index.html


  0%|                                                   | 0/131 [00:00<?, ?it/s]
