In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

## Load the game results table (the file that holds the home/away team links)
results = pd.read_csv('../../data/year_over_year/results_table_1901_2001_new.csv')

## Parse 'Date' into datetimes so rows can be compared against season boundaries
results = results.assign(Date=pd.to_datetime(results['Date']))

## Legacy Sub Functions

In [2]:
# Function to split "Last Team" into "Team" and "League" with edge case handling
def split_last_team(last_team):
    """Split a "Team (League)" string into its team and league parts.

    Parameters
    ----------
    last_team : str
        Raw "Last Team" cell text, e.g. ``"Some Team (LEAGUE)"``.

    Returns
    -------
    tuple
        ``(team, league)``. When no parenthesised league is present the input
        is returned unchanged as the team and the league is ``""``. Non-string
        input (``None``, ``NaN``) yields ``("", "")`` instead of raising.
    """
    # re.search raises TypeError on None/NaN, so treat those as "no data".
    if not isinstance(last_team, str):
        return "", ""

    # Greedy first group: the league is whatever sits in the last "(...)".
    match = re.search(r'(.+) \((.+)\)', last_team)
    if match:
        return match.groups()

    # No league specified: keep the team as is and leave the league blank.
    return last_team, ""

# Updated function to correctly capture the player's position and handle edge cases in "Last Team"
def parse_and_transform_roster(html_content):
    """Parse a roster page into a cleaned DataFrame.

    The table with ``id="players"`` is read row by row. Rows carrying the
    ``stats-section`` class set the position that is attached to the player
    rows following them; player rows seen before any such row are ignored.

    Parameters
    ----------
    html_content : str
        HTML of a roster page.

    Returns
    -------
    pandas.DataFrame or None
        One row per player with the 'Pos', 'NHL Draft' and 'Last Team' columns
        replaced by 'Position', 'Height_Inches', 'Draft_Year', 'NHL_Team',
        'D_Round', 'Team' and 'League'. ``None`` is returned (after printing a
        "Skipped" message) when the table is missing, when no usable rows are
        found, or when the 'Last Team' / 'NHL Draft' columns are absent.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the table with the roster
    roster_table = soup.find('table', {'id': 'players'})
    if roster_table is None:
        print("Skipped: The table doesn't exist.")
        return None

    # Extract headers; 'Position' is appended because it comes from the
    # section rows rather than from a table column.
    header_row = roster_table.find('thead').find('tr')
    headers = [header.text.strip() for header in header_row.find_all('th')]
    headers.append('Position')

    data = []
    current_position = None  # stays None until the first section row is seen

    for row in roster_table.find_all('tr'):
        if 'stats-section' in row.attrs.get('class', []):
            current_position = row.text.strip()
            continue

        cells = row.find_all('td')
        if cells and current_position:
            data.append([cell.text.strip() for cell in cells] + [current_position])

    df = pd.DataFrame(data, columns=headers)

    # Check if DataFrame is empty or if key columns are missing
    if df.empty or 'Last Team' not in df.columns or 'NHL Draft' not in df.columns:
        print("Skipped: The DataFrame is empty or missing key columns.")
        return None

    # Cleanup: keep only rows whose 'No.' is numeric. Work on an explicit copy
    # so the column edits below act on an independent frame.
    df = df[df['No.'].str.isnumeric()].copy()

    # The filter can remove every row; unpacking zip(*...) over an empty
    # column below would then raise ValueError, so bail out here instead.
    if df.empty:
        print("Skipped: The DataFrame is empty or missing key columns.")
        return None

    # Cleanup: Drop the 'Pos' column (superseded by 'Position')
    df = df.drop(columns=['Pos'])

    # Transform Height to Inches
    df['Height_Inches'] = df['Ht.'].apply(convert_to_inches)

    # Transform NHL Draft to Draft_Year, NHL_Team, and D_Round
    draft_result = df['NHL Draft'].apply(split_nhl_draft)
    df['Draft_Year'], df['NHL_Team'], df['D_Round'] = zip(*draft_result)
    df = df.drop(columns=['NHL Draft'])

    # Split "Last Team" into "Team" and "League"
    df['Team'], df['League'] = zip(*df['Last Team'].apply(split_last_team))
    df = df.drop(columns=['Last Team'])

    # Rename the trouble column Hometown\nLast Team\nNHL Draft
    df = df.rename(columns={'Hometown\nLast Team\nNHL Draft': 'Hometown'})

    # No., Wt. and Height_Inches become ints: unparseable values are coerced
    # to NaN and then replaced with 0 so the integer cast cannot fail.
    int_list = ['No.', 'Wt.', 'Height_Inches']
    df[int_list] = df[int_list].apply(pd.to_numeric, errors='coerce')

    default_value = 0
    df[int_list] = df[int_list].fillna(value=default_value)

    # Confirm that NaNs are filled
    print(df[int_list].isna().sum())  # Should output all zeros

    df[int_list] = df[int_list].astype(int)

    return df

# Function to split "NHL Draft" into "Draft_Year", "NHL_Team", and "D_Round"
def split_nhl_draft(nhl_draft):
    """Split a "YEAR-TEAM-ROUND" draft string into its three parts.

    Parameters
    ----------
    nhl_draft : str
        Raw "NHL Draft" cell text, e.g. ``"2022-WSH-7"``.

    Returns
    -------
    tuple
        ``(draft_year, nhl_team, d_round)`` as strings, or
        ``(None, None, None)`` when the value is missing, is not a string, or
        does not contain exactly three dash-separated parts.
    """
    try:
        draft_year, nhl_team, d_round = nhl_draft.split('-')
    except (AttributeError, ValueError):
        # AttributeError: None/NaN has no .split(); ValueError: wrong number
        # of parts. Both mean missing or incomplete data.
        return None, None, None
    return draft_year, nhl_team, d_round

# # Test the function
# test_values = ['2022-WSH-7', '', '2021-DET']
# [split_nhl_draft(val) for val in test_values]



# Function to convert height in "ft-in" format to total inches
def convert_to_inches(height_str):
    """Convert a "feet-inches" height string to total inches.

    Parameters
    ----------
    height_str : str
        Height such as ``"6-2"``.

    Returns
    -------
    int or None
        ``feet * 12 + inches``, or ``None`` when the value is missing, is not
        a string, or is not two dash-separated integers.
    """
    try:
        feet, inches = map(int, height_str.split('-'))
    except (AttributeError, ValueError):
        # AttributeError: None/NaN has no .split(); ValueError: non-integer
        # text or the wrong number of parts.
        return None
    return (feet * 12) + inches

# New: season-level roster scraper

In [3]:
def parse_season_rosters(season):
    """Scrape all team rosters for one season and save them to a CSV file.

    Team links are read from the results table, rebuilt into roster URLs for
    ``{season}{season+1}``, downloaded and parsed with
    ``parse_and_transform_roster``. The combined table is written to
    ``../../TEMP/roster_data_{season}.csv``.

    Parameters
    ----------
    season : int
        Year used to build the roster URL (``season`` followed by
        ``season + 1``) and stored in the 'Season' column.

    Returns
    -------
    None
        The result is written to disk. Nothing is written when the season has
        no team links or when no roster could be parsed.
    """
    request_timeout_seconds = 30  # avoid hanging forever on a stalled request

    # Load results for the season to get team links
    results = pd.read_csv('../../data/year_over_year/results_table_1901_2001_new.csv')
    results['Date'] = pd.to_datetime(results['Date'])

    # NOTE(review): this window runs from Aug (season-1) to Jul (season),
    # while the roster URL below is built for season/season+1 -- confirm
    # which season the team list is meant to come from.
    results = results[(results['Date'] > str(season-1)+'-08-01') & (results['Date'] < str(season)+'-07-31')]

    # Unique, non-missing links from both the home and away columns
    team_links = pd.concat([results['Home_Team_Link'], results['Away_Team_Link']])
    team_links = team_links.drop_duplicates().dropna()

    # Without links the positional column lookups below would fail.
    if team_links.empty:
        print(f"No team links found for season {season}; nothing to scrape.")
        return

    # Keep the team name and the school ID (4th and 5th '/'-separated parts)
    link_parts = team_links.str.split('/', expand=True)
    team = link_parts[3]
    number = link_parts[4]

    # Year string '{season}{season+1}' used by the roster URLs
    year = str(season) + str(season+1)
    team_links = 'https://www.collegehockeynews.com/reports/roster/' + team + '/' + number + '/' + year

    # Different source links can rebuild to the same roster URL; fetch each once.
    team_links = team_links.drop_duplicates().reset_index(drop=True)

    # Team name is the 6th '/'-separated part of the rebuilt URL
    team_names = team_links.str.split('/', expand=True)[5]

    roster_dfs = []
    for i, link in enumerate(team_links):
        print(f'Processing team {i+1} of {len(team_links)}')
        current_team = team_names.iloc[i]

        try:
            r = requests.get(link, timeout=request_timeout_seconds)
            r.raise_for_status()  # raises HTTPError on an unsuccessful status code
            roster_df = parse_and_transform_roster(r.text)

        except requests.RequestException as e:
            print(f"Error during the request for team {i+1} ({current_team}): {e}")
            continue

        except Exception as e:
            # Best effort: one unparseable page must not stop the other teams.
            print(f"Error processing the data for team {i+1} ({current_team}): {e}")
            continue

        # None means the page had no usable roster table
        if roster_df is None:
            print(f"Skipping team {i+1} due to missing or empty data.")
            continue

        roster_df = roster_df.reset_index(drop=True)
        roster_df['Current Team'] = current_team
        roster_df['Season'] = season
        roster_dfs.append(roster_df)

    # pd.concat raises ValueError on an empty list, which would abort a
    # multi-season run; skip the season instead.
    if not roster_dfs:
        print(f"No rosters could be parsed for season {season}; no file written.")
        return

    roster_df = pd.concat(roster_dfs, ignore_index=True)

    # Remove periods in column names. regex=False keeps '.' a literal dot
    # regardless of the pandas version's default for `regex`.
    roster_df.columns = roster_df.columns.str.replace('.', '', regex=False)

    # Convert 'Last, First' to 'First Last' and clean up non-breaking spaces
    roster_df['Player'] = (
        roster_df['Name']
        .str.split(',').str[::-1].str.join(' ')
        .str.replace(u'\xa0', u' ', regex=False)
        .str.strip()
    )
    roster_df = roster_df.drop(columns=['Name'])

    # The parsed 'Team' column holds the player's previous team
    if 'Team' in roster_df.columns:
        roster_df = roster_df.rename(columns={'Team': 'Last Team'})

    if 'Last Team' not in roster_df.columns:
        print("'Last Team' column not found in the DataFrame. Please check the data extraction process.")
    else:
        # Reorder columns without duplicate 'Team'
        roster_df = roster_df[['Current Team', 'Player', 'No', 'Position', 'Yr', 'Ht', 'Wt', 'DOB',
                               'Hometown', 'Height_Inches', 'Draft_Year', 'NHL_Team', 'D_Round',
                               'Last Team', 'League']].copy()

    ## Transform the remaining output columns and save

    # The reorder above drops 'Season', so (re)create it here
    roster_df['Season'] = season

    # Remove any dashes from the Current Team column, then rename it to Team
    roster_df['Current Team'] = roster_df['Current Team'].str.replace('-', ' ', regex=False)
    roster_df = roster_df.rename(columns={'Current Team': 'Team'})

    # Save the roster data to a CSV file
    roster_df.to_csv(f'../../TEMP/roster_data_{season}.csv', index=False)

In [5]:
## Run the roster scrape once for every season in the configured range

# First season to process and the exclusive upper bound of the range
start_year, end_year = 1902, 1970

# Each call handles one season and writes that season's CSV
for season in range(start_year, end_year):
    print(f"Processing season {season}-{season+1}")
    parse_season_rosters(season)

Processing season 1902-1903
Processing team 1 of 5
Skipped: The DataFrame is empty or missing key columns.
Skipping team 1 due to missing or empty data.
Processing team 2 of 5
No.              0
Wt.              0
Height_Inches    0
dtype: int64
Processing team 3 of 5
No.              0
Wt.              0
Height_Inches    0
dtype: int64
Processing team 4 of 5
Skipped: The DataFrame is empty or missing key columns.
Skipping team 4 due to missing or empty data.
Processing team 5 of 5
No.              0
Wt.              0
Height_Inches    0
dtype: int64
Processing season 1903-1904
Processing team 1 of 5
No.              0
Wt.              0
Height_Inches    0
dtype: int64
Processing team 2 of 5
No.              0
Wt.              0
Height_Inches    0
dtype: int64
Processing team 3 of 5
No.              0
Wt.              0
Height_Inches    0
dtype: int64
Processing team 4 of 5
Skipped: The DataFrame is empty or missing key columns.
Skipping team 4 due to missing or empty data.
Processing 