In [1]:
#### Gather, MSU's roster data for analysis

url = 'https://www.collegehockeynews.com/reports/roster/Michigan-State/32'

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

response = requests.get(url)
html_content = response.text


In [2]:
# Updated function to parse the college hockey roster HTML table and perform transformations

# Function to split "Last Team" into "Team" and "League" with edge case handling
def split_last_team(last_team):
    # Use regular expression to extract team and league
    match = re.search(r'(.+) \((.+)\)', last_team)
    if match:
        return match.groups()
    else:
        # If no league is specified, return the team as is and leave league blank
        return last_team, ""

# Updated function to correctly capture the player's position and handle edge cases in "Last Team"
def parse_and_transform_roster(html_content):
    # Initialize BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find the table with the roster
    roster_table = soup.find('table', {'id': 'players'})
    
    # Extract headers
    header_row = roster_table.find('thead').find('tr')
    headers = [header.text.strip() for header in header_row.find_all('th')]
    headers.append('Position')  # Add the Position column to headers
    
    # Initialize data list and current_position variable
    data = []
    current_position = None  # Initialize as None to later filter out irrelevant rows
    
    # Iterate through each row in the table
    for row in roster_table.find_all('tr'):
        if 'class' in row.attrs and 'stats-section' in row.attrs['class']:
            current_position = row.text.strip()
        else:
            cells = row.find_all('td')
            if cells and current_position:
                row_data = [cell.text.strip() for cell in cells]
                row_data.append(current_position)  # Add the current position to the row data
                data.append(row_data)
    
    # Create DataFrame
    df = pd.DataFrame(data, columns=headers)
    
    # Cleanup: Remove rows where 'No.' column is not numeric
    df = df[df['No.'].str.isnumeric()]
    
    # Cleanup: Drop the 'Pos' column
    df.drop(columns=['Pos'], inplace=True)
    
    # Transform Height to Inches
    df['Height_Inches'] = df['Ht.'].apply(convert_to_inches)

    # Transform NHL Draft to Draft_Year, NHL_Team, and D_Round
    df['Draft_Year'], df['NHL_Team'], df['D_Round'] = zip(*df['NHL Draft'].apply(split_nhl_draft))
    
    # Handle edge cases in "Last Team" to split into "Team" and "League"
    df['Team'], df['League'] = zip(*df['Last Team'].apply(split_last_team))
    df.drop(columns=['Last Team'], inplace=True)
    
    # Rename the trouble column Hometown\nLast Team\nNHL Draft
    df.rename(columns={'Hometown\nLast Team\nNHL Draft': 'Hometown'}, inplace=True)

    # assign data types No. Wt. and Height_Inches to int, DOB to datetime
    int_list = ['No.', 'Wt.', 'Height_Inches']
    df[int_list] = df[int_list].astype(int)

    df['DOB'] = pd.to_datetime(df['DOB'], errors='coerce')
    
    return df

# Function to split "NHL Draft" into "Draft_Year", "NHL_Team", and "D_Round"
def split_nhl_draft(nhl_draft):
    try:
        draft_year, nhl_team, d_round = nhl_draft.split('-')
        return draft_year, nhl_team, d_round
    except ValueError:
        # Handle missing or incomplete data
        return None, None, None

# # Test the function
# test_values = ['2022-WSH-7', '', '2021-DET']
# [split_nhl_draft(val) for val in test_values]



# Function to convert height in "ft-in" format to total inches
def convert_to_inches(height_str):
    try:
        feet, inches = map(int, height_str.split('-'))
        return (feet * 12) + inches
    except ValueError:
        return None
    
# Example of how to call the function with a URL (this is just a code snippet and won't run here)
# import requests
# url = "https://example.com/roster_page"

response = requests.get(url)
html_content = response.text
# df = parse_and_transform_roster(html_content)

# Use the function to parse and transform the roster using the already available HTML content
parsed_and_transformed_df = parse_and_transform_roster(html_content)
parsed_and_transformed_df.head()


Unnamed: 0,Unnamed: 1,No.,Name,Yr.,Ht.,Wt.,DOB,Hometown,NHL Draft,Position,Height_Inches,Draft_Year,NHL_Team,D_Round,Team,League
0,,9,"Basgall, Matt",So,5-9,190,2002-08-16,"Lake Forest, Ill.",,Defensemen,69,,,,Tri-City,USHL
1,,24,"Crossman, James",Sr,6-3,200,1998-11-23,"Denver, Colo.",,Defensemen,75,,,,Odessa,NAHL
2,,2,"Geary, Patrick",Fr,6-1,185,2004-02-18,"Hamburg, N.Y.",,Defensemen,73,,,,Waterloo,USHL
3,,7,"Gucciardi, David",Jr,6-1,190,2002-10-09,"Toronto, Ont.",2022-WSH-7,Defensemen,73,2022.0,WSH,7.0,Waterloo,USHL
4,,3,"Hurtig, Viktor",So,6-6,191,2002-04-28,"Falun, Sweden",,Defensemen,78,,,,Växjö Lakers HC J20,


In [3]:
df = parsed_and_transformed_df

len(df)

df.dtypes

                         object
No.                       int32
Name                     object
Yr.                      object
Ht.                      object
Wt.                       int32
DOB              datetime64[ns]
Hometown                 object
NHL Draft                object
Position                 object
Height_Inches             int32
Draft_Year               object
NHL_Team                 object
D_Round                  object
Team                     object
League                   object
dtype: object

In [4]:
## Parse the roster page with BeautifulSoup

roster_page = requests.get(roster_url_2023)
soup = BeautifulSoup(roster_page.content, 'html.parser')
roster_table = soup.find('table', {'id': 'players'})


# Initialize lists to hold the header and data
headers = []
data = []
current_position = ""  # To keep track of the current position (Forward, Defense, Goalie)


# Extract headers separately
header_row = roster_table.find('thead').find('tr')
headers = [header.text.strip() for header in header_row.find_all('th')]
headers.append('Position')  # Add the Position column to headers

# Iterate through each row in the table to extract data and keep track of position
for row in roster_table.find_all('tr'):
    if 'class' in row.attrs and 'header' in row.attrs['class']:
        current_position = row.text.strip()
    else:
        cells = row.find_all('td')
        if cells:
            row_data = [cell.text.strip() for cell in cells]
            row_data.append(current_position)
            data.append(row_data)

# Create a DataFrame from the parsed data
df = pd.DataFrame(data, columns=headers)





NameError: name 'roster_url_2023' is not defined

In [None]:

# Function to convert height in "ft-in" format to total inches
def convert_to_inches(height_str):
    try:
        feet, inches = map(int, height_str.split('-'))
        return (feet * 12) + inches
    except ValueError:
        return None

# Apply the conversion function to the 'Ht.' column and create a new column 'Height_Inches'
df['Height_Inches'] = df['Ht.'].apply(convert_to_inches)

# Display the DataFrame with the new 'Height_Inches' column



df.head()


In [None]:
roster_soup.text