In [14]:
### Notebook for scraping college football data from sports-reference.com

# Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

import time
import random

import tqdm as tqdm


# URL of page to be scraped
BASE_URL ='https://www.sports-reference.com/cfb/years/'
END_URL = '-schedule.html'

year_list = list(range(1950, 1970))

min_dealy = 3 # site terms stay they kick if request >20 per minute

# https://www.sports-reference.com/cfb/years/1988-schedule.html

In [16]:
## Block for scraping Wikipedia for List of d1 college football teams

# URL of page to be scraped
# https://en.wikipedia.org/wiki/List_of_NCAA_Division_I_FBS_football_programs

# Extract table from wikipedia page
wiki_url = 'https://en.wikipedia.org/wiki/List_of_NCAA_Division_I_FBS_football_programs'
wiki_table = pd.read_html(wiki_url)[0]

wiki_table.head()

## save list of teams to csv
wiki_table.to_csv('TEMP/cfb_d1_teams.csv', index=False)

In [2]:
import time
def scrape_year_data(year):
    """Scrape game data for a given year from sports-reference.com."""
    # Construct URL for the given year
    url = BASE_URL + str(year) + END_URL
    
    # Try reading the table into a pandas DataFrame
    try:
        df = pd.read_html(url)
        df = pd.DataFrame(df[0])  # Convert response from list to dataframe
    except:
        return None  # Return None if scraping fails
    
    # Clean the dataframe of rows that are not games
    df = df[df['Wk'] != 'Wk']
    
    # Add a 'Year' column to the dataframe to keep track of the year
    df['Year'] = year
    
    return df

# This is the finalized scraping function.


def scrape_all_years(start_year, end_year):
    """Scrape game data for all years in the given range."""
    
    # Initialize an empty master dataframe
    master_df = pd.DataFrame()
    
    for year in range(start_year, end_year + 1):
        # Scrape data for the current year
        year_data = scrape_year_data(year)
        
        if year_data is not None:
            # Append the data to the master dataframe
            master_df = master_df.append(year_data, ignore_index=True)
        
        # Save to CSV every 10 years as a backup
        if year % 10 == 0:
            master_df.to_csv(f"TEMP\data_backup_{year}.csv", index=False)
        
        # Sleep for 3 seconds to respect rate limits
        time.sleep(3.5)
    
    # Save the entire data at the end
    master_df.to_csv("all_years_data.csv", index=False)
    
    return master_df

# The function is ready to be used. You can call it with the desired year range when running it on your machine.


In [6]:
data = scrape_all_years(1869, 1899)

  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.append(year_data, ignore_index=True)
  master_df = master_df.

In [11]:
### DATA TRANFORMATION
# Dictionary to store unique IDs for schools
school_id_dict = {}


# Reorder the columns to match the desired configuration
ordered_cols = ['HostSchoolId', 'HostScore', 'HostTeamName', 'AwaySchoolId', 'AwayScore', 
                'AwayTeamName', 'ContestNotes', 'HostResult', 'AwayResult', 'Wk', 
                'Date', 'Day', 'Winner', 'Pts', 'Loser', 'Pts.1', 'Notes', 'Year']


# Function to generate unique IDs for school names
def generate_school_id(name, school_id_dict):
    if name not in school_id_dict:
        school_id_dict[name] = len(school_id_dict) + 1
    return school_id_dict[name]

def transform_data_neutral(row, school_id_dict):
    # Check if the game is a neutral site game
    if row['Unnamed: 6'] == 'N' or (pd.notnull(row['Unnamed: 6']) and row['Unnamed: 6'] != '@'):
        row['HostTeamName'] = row['Winner']
        row['HostScore'] = row['Pts']
        row['AwayTeamName'] = row['Loser']
        row['AwayScore'] = row['Pts.1']
        row['HostResult'] = 'Neutral'
        row['AwayResult'] = 'Neutral'
    # Check if the winner was the away team
    elif row['Unnamed: 6'] == '@':
        row['HostTeamName'] = row['Loser']
        row['HostScore'] = row['Pts.1']
        row['AwayTeamName'] = row['Winner']
        row['AwayScore'] = row['Pts']
        row['HostResult'] = 'Loss'
        row['AwayResult'] = 'Win'
    else:
        row['HostTeamName'] = row['Winner']
        row['HostScore'] = row['Pts']
        row['AwayTeamName'] = row['Loser']
        row['AwayScore'] = row['Pts.1']
        row['HostResult'] = 'Win'
        row['AwayResult'] = 'Loss'
    
    # Generate unique IDs for schools
    row['HostSchoolId'] = generate_school_id(row['HostTeamName'], school_id_dict)
    row['AwaySchoolId'] = generate_school_id(row['AwayTeamName'], school_id_dict)
    
    row['ContestNotes'] = row['Notes']
    
    return row


In [13]:

# Apply the transformations to each row of the dataframe with neutral site games handling
transformed_data = data.apply(lambda row: transform_data_neutral(row, school_id_dict), axis=1)

# Drop the 'Rk' and 'Unnamed: 6' columns
transformed_data = transformed_data.drop(columns=['Rk', 'Unnamed: 6'])

# Reorder the columns to match the desired configuration
transformed_data = transformed_data[ordered_cols]
transformed_data.head(20)



Unnamed: 0,HostSchoolId,HostScore,HostTeamName,AwaySchoolId,AwayScore,AwayTeamName,ContestNotes,HostResult,AwayResult,Wk,Date,Day,Winner,Pts,Loser,Pts.1,Notes,Year
0,1,6,Rutgers,2,4,Princeton,,Win,Loss,1,"Nov 6, 1869",Sat,Rutgers,6,Princeton,4,,1869
1,2,8,Princeton,1,0,Rutgers,,Win,Loss,2,"Nov 13, 1869",Sat,Princeton,8,Rutgers,0,,1869
2,1,6,Rutgers,3,3,Columbia,,Win,Loss,1,"Nov 5, 1870",Sat,Rutgers,6,Columbia,3,,1870
3,2,6,Princeton,1,2,Rutgers,,Win,Loss,2,"Nov 12, 1870",Sat,Princeton,6,Rutgers,2,,1870
4,3,0,Columbia,1,0,Rutgers,,Loss,Win,1,"Nov 2, 1872",Sat,Rutgers,0,Columbia,0,,1872
5,1,7,Rutgers,3,5,Columbia,,Win,Loss,2,"Nov 9, 1872",Sat,Rutgers,7,Columbia,5,,1872
6,2,4,Princeton,1,1,Rutgers,,Win,Loss,3,"Nov 16, 1872",Sat,Princeton,4,Rutgers,1,,1872
7,4,3,Yale,3,0,Columbia,,Win,Loss,3,"Nov 16, 1872",Sat,Yale,3,Columbia,0,,1872
8,3,6,Columbia,5,0,Stevens,,Win,Loss,4,"Nov 23, 1872",Sat,Columbia,6,Stevens,0,,1872
9,5,6,Stevens,6,1,New York University,,Win,Loss,1,"Oct 18, 1873",Sat,Stevens,6,New York University,1,,1873


In [None]:
### Explore html structure

ex_url = 'https://www.sports-reference.com/cfb/years/1938-schedule.html'

## Try using pandas to read in the table

df = pd.read_html(ex_url)



# covert response from list to dataframe
df = pd.DataFrame(df[0])

df.head()


In [None]:
df.info()

# Clean the dataframe of rows that are not games
# drop rows where Wk is 'Wk'
df = df[df['Wk'] != 'Wk']

df.head()

In [None]:
### V1 Unfiniwshed, unused

### A script that takes the loops through all years in the range and scrapes the data
### creates a single dataframe and saves it as a csv

# # It also saves a temp csv every 10 years in case the script fails because of rate limits

# def scrape_year(year):
#     url = BASE_URL + str(year) + END_URL
#     df = pd.read_html(url)
#     df = pd.DataFrame(df[0])
#     df = df[df['Wk'] != 'Wk']
#     df['Year'] = year
#     return df

# # Create a dataframe to hold the data
# df = pd.DataFrame()

# # Loop through the years and scrape the data
# for year in year_list:
#     # Delay to stay under 20 requests per minute
#     time.sleep(random.randint(3, 5))
#     df = df.append(scrape_year(year))
#     if year % 10 == 0:
#         df.to_csv('TEMP\VS_temp.csv')
#     print(f'Year {year} complete')

# # Save the final dataframe as a csv
# df.to_csv('cfb_data.csv')

