# Web Scraping Functions

In [61]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import numpy as np
from time import sleep
import pdb

In [62]:
# URL = 'https://www.sports-reference.com/cbb/postseason/2017-ncaa.html'
# page = requests.get(URL)

# soup = bs(page.content, 'html.parser')
# results = soup.find(id='brackets')
# print(results.get_text())

###### The get_table() function gets the table for a particular year and cleans it. For example, it removes redundant columns and accounts for NaN values.

In [68]:
def get_table(year):
    """Download and clean the NCAA school-stats table for one season.

    The table is read from sports-reference.com with pandas.read_html.
    Rows that only hold header labels and columns whose name starts with
    "Unnamed" are dropped, the multi-level column header is flattened,
    counting stats are divided by games played, a boolean 'Class' column
    is added and missing values are replaced by the mean of their column.

    Parameters:
        year (int or str): the year to get the data for; it is formatted
            into the URL as-is

    Returns:
        df (pandas.DataFrame): the dataframe containing the cleaned stats
            for the given year
    """
    # construct url and get the table using pandas.read_html
    URL = 'https://www.sports-reference.com/cbb/seasons/{}-school-stats.html'.format(year)
    information = pd.read_html(URL)

    # the first table found on the page is the one we clean
    df = information[0]

    # Drop the rows whose cells hold header labels ('School' / 'Overall')
    # instead of data.  Both label sets are removed in a single call so a
    # row matching both conditions cannot trigger a KeyError on a second drop.
    index1 = df[df.iloc[:, 1] == 'School'].index
    index2 = df[df.iloc[:, 2] == 'Overall'].index
    df.drop(index1.union(index2), inplace=True)

    # Flatten the multi-level header into single strings and rename the
    # school column (assumes every column label is a tuple of strings)
    df.columns = [' '.join(tup) for tup in df.columns]
    df = df.rename(columns={'Unnamed: 1_level_0 School': 'School'})

    # Drop the remaining columns whose name starts with "Unnamed"
    unnamed = [name for name in df.columns if name[:7] == "Unnamed"]
    df.drop(columns=unnamed, inplace=True)

    # We found more data that will not help us train
    drop = ['Overall SRS', 'Conf. W', 'Conf. L', 'Home W', 'Home L', 'Away W', 'Away L']
    df.drop(columns=drop, inplace=True)

    # These are the columns we normalize by number of games played
    normalize = ['Points Tm.', 'Points Opp.', 'Totals MP', 'Totals FG', 'Totals FGA',
                 'Totals 3P', 'Totals 3PA', 'Totals FT', 'Totals FTA', 'Totals ORB',
                 'Totals TRB', 'Totals AST', 'Totals STL', 'Totals BLK', 'Totals TOV', 'Totals PF']
    # the divisor is the same for every column, so convert it only once
    games = df['Overall G'].astype('float64')
    for name in normalize:
        df[name] = df[name].astype('float64') / games

    # Identify which schools went to the NCAA tournament: the label is True
    # when the School string contains 'NCAA'
    df['Class'] = ['NCAA' in school for school in df['School']]

    # Drop the Total Minutes Played
    df.drop(['Totals MP'], axis=1, inplace=True)

    # Fill in missing data with the mean of that column.  The filled column
    # is assigned back to the frame: calling fillna(inplace=True) on the
    # Series returned by df.iloc[:, n] may act on a temporary object and
    # leave df itself unchanged.
    has_missing = df.isnull().any()
    for name in has_missing[has_missing].index:
        mean = df[name].astype('float64').mean()
        df[name] = df[name].fillna(mean)

    # return the cleaned DataFrame
    return df

###### The get_seasons_data() function gets the table for every year in a range and saves each one as its own CSV file.

In [64]:
def get_seasons_data(start, end):
    """Fetch the stats table of each season from start to end (inclusive)
    and write every one of them to its own CSV file.

    Params:
        start (int): first year to fetch
        end (int): last year to fetch

    Returns:
        nothing
    """
    season = start
    while season < end + 1:
        # fetch and clean this season's table, then store it without the index
        stats = get_table(season)
        out_name = f'NCAA_Season_Stats_{season}.csv'
        stats.to_csv(out_name, index=False)

        # wait a random whole number of seconds (1 to 9) before the next year
        pause = np.random.randint(1, 10)
        sleep(pause)
        season += 1
    

In [69]:
# we get all the data we need here
# get_seasons_data(1993,2021)

In [7]:
# This is where we could try and get data on their seeding
#info = pd.read_html('https://en.wikipedia.org/wiki/2012_NCAA_Division_I_Men%27s_Basketball_Tournament')