In [64]:
import pandas as pd
import numpy as np

In [2]:
import requests
from bs4 import BeautifulSoup

# Scoreboard page to scrape; the path encodes sport, division and date.
url = "https://www.ncaa.com/scoreboard/soccer-men/d1/2024/09/18"

# Bind `hrefs` up front so it exists (as an empty list) even when the request
# fails, instead of being left undefined for whatever reads it afterwards.
hrefs = []

# `requests` applies no timeout by default, so an unresponsive server would
# block this cell forever; fail after 10 seconds instead.
response = requests.get(url, timeout=10)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all 'a' tags with class 'gamePod-link'. `href=True` keeps only
    # anchors that actually carry an href, so the lookup below cannot raise
    # KeyError on a malformed link.
    game_links = soup.find_all('a', class_='gamePod-link', href=True)

    # Extract the href attribute from each 'a' tag
    hrefs = [link['href'] for link in game_links]

    # Print the list of hrefs
    print(hrefs)
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


['/game/6310249', '/game/6310253', '/game/6310252', '/game/6310256', '/game/6310254', '/game/6310255', '/game/6310250', '/game/6310251']


In [20]:
import re
import requests

from bs4 import BeautifulSoup

In [72]:
# Box score page for a single game; note this rebinds `url`.
url = "https://www.ncaa.com/game/6310251/boxscore"

In [73]:
# Request headers carrying a desktop-Chrome User-Agent string.
# NOTE(review): presumably needed because the site rejects the default
# python-requests User-Agent -- confirm.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/129.0.0.0 Safari/537.36'
    ),
}

In [74]:
# Fetch the box score page using the `url` and `headers` defined above.
# `requests` has no default timeout, so bound the wait explicitly.
response = requests.get(url, headers=headers, timeout=10)

# Stop here on an HTTP error rather than silently parsing an error page.
response.raise_for_status()

# Parse the returned HTML for the cells that follow.
soup = BeautifulSoup(response.content, "html.parser")

In [142]:
import pandas as pd

# Raw box score: one player per line, tab-separated fields.
data = """Lucas Fernandez-kim	10	M	64	0	0	1	1	0	0 
Colton Franklin	8	null	48	0	0	0	0	0	0 
Cason Goodman	15	F	90	0	0	0	0	0	0 
Luke Goodman	14	D	85	0	0	1	0	0	0 
Carson Hammond	12	D	90	0	0	0	0	0	0 
Zack Lillington	23	M	90	0	0	1	0	0	0 
Cole Pond	27	D	90	0	0	1	0	0	0 
Chase Tanon	17	F	76	0	0	0	0	0	0 
Keegan Walwyn-bent	19	F	60	0	0	1	1	0	0 
Kevin Welch	13	M	54	0	0	1	1	0	0 
Mekhai Wilson	0	GK	90	0	0	0	0	0	0 
Scott Buie	9	D	27	0	0	1	1	0	0 
Andrew Dutra	7	D	28	0	0	0	0	0	0 
Gavin House	20	D	6	0	0	0	0	0	0 
Jason Hsu	25	D	43	0	0	0	0	0	0 
Rafael Matiello	18	M	5	0	0	0	0	0	0 
Zachary Neuls	28	null	35	0	0	0	0	0	0 
Ian Ngonethong	4	null	7	0	0	0	0	0	0 
Milan Uncanin	22	D	9	0	0	0	0	0	0"""


def _clean_field(field):
    """Strip surrounding whitespace and map the literal 'null' to None."""
    value = field.strip()
    return None if value == 'null' else value


# Splitting the data into rows. Most lines end with a stray space, so each
# field is stripped (otherwise 'Red Cards' would hold '0 '), and blank lines
# are skipped.
rows = [
    [_clean_field(field) for field in row.split('\t')]
    for row in data.splitlines()
    if row.strip()
]

# Defining column names
columns = ['Name', 'Jersey Number', 'Position', 'Minutes Played', 'Goals', 'Assists', 'Shots on Goal', 'Shots', 'Yellow Cards', 'Red Cards']

# Creating DataFrame
df = pd.DataFrame(rows, columns=columns)

# Every column except the name and position is a count; store them as numbers
# rather than strings. Unparseable values become NaN, matching the
# errors='coerce' policy applied to 'Minutes Played' further down.
numeric_columns = [col for col in columns if col not in ('Name', 'Position')]
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')



In [164]:
# Build one row per match segment: the goal-difference rate in that segment
# plus a 0/1 flag per player for whether they are counted as on the pitch.

# Convert 'Minutes Played' to numeric, forcing errors to NaN
df['Minutes Played'] = pd.to_numeric(df['Minutes Played'], errors='coerce')

# Thresholds used by the on-pitch heuristic below.
HALF_LENGTH = 45    # at least this many minutes => the value marks a segment boundary
MATCH_LENGTH = 90   # substitutes are assumed to enter at MATCH_LENGTH - minutes

# Segment boundaries: 0 plus every distinct minutes-played value >= HALF_LENGTH.
# NaN minutes fail the comparison and are therefore excluded.
segment_times = sorted({m for m in df['Minutes Played'] if m >= HALF_LENGTH})
segment_times.insert(0, 0)

# Goals scored (positive for home team, negative for away team)
goals = [32, 54, -84]  # 84th minute goal by away team represented as -84

# Normalize times to positive for the searchsorted
goal_times = np.abs(goals)

# Segment i covers (segment_times[i-1], segment_times[i]]; searchsorted's
# default side='left' returns exactly that i for a goal minute.
last_segment = len(segment_times) - 1
segment_index = np.searchsorted(segment_times, goal_times)

# Clamp into the valid range 1..last_segment so a goal recorded after the final
# boundary (e.g. stoppage time) or at minute 0 is credited to the last/first
# segment instead of being silently dropped.
segment_index = np.minimum(np.maximum(segment_index, 1), last_segment)

# Reapply negative sign to segments for away team goals
goal_segments = np.where(np.array(goals) < 0, -segment_index, segment_index)

# Net goals per segment: +1 for each home goal, -1 for each away goal.
# np.add.at accumulates correctly when several goals share a segment.
net_goals = np.zeros(len(segment_times), dtype=int)
np.add.at(net_goals, segment_index, np.where(np.array(goals) > 0, 1, -1))

# Prepare a list to store the results (note: rebinds `data`)
data = []

# Loop through segments and calculate the goal difference
for i in range(1, len(segment_times)):
    segment_time_start = segment_times[i - 1]
    segment_time_end = segment_times[i]

    # Net goals divided by the segment length, i.e. a per-minute rate.
    goal_diff = net_goals[i] / (segment_time_end - segment_time_start)

    # Create a player status dictionary for this segment
    player_status = {}

    for name, minutes in zip(df['Name'], df['Minutes Played']):
        if minutes > HALF_LENGTH:
            # Treated as a starter who left at `minutes`: on the pitch for
            # every segment that ends no later than that.
            on_pitch = minutes >= segment_time_end
        else:
            # Treated as a substitute who entered at MATCH_LENGTH - minutes.
            # The "- 1" delays entry by one boundary; a player with exactly
            # HALF_LENGTH minutes adds a boundary above yet lands in this branch.
            on_pitch = minutes - 1 > MATCH_LENGTH - segment_time_end
        player_status[name] = int(on_pitch)

    # Append the segment, goal difference, and player status to the list
    data.append({
        'Segment Time': segment_time_end,
        'Goal Difference': goal_diff,
        **player_status  # Merge player status into the dictionary
    })

# Convert the list of dictionaries to a DataFrame
data_df = pd.DataFrame(data)

# Display the resulting DataFrame


In [167]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge

# Regression target: the 'Goal Difference' column flattened to a 1D array.
# NOTE(review): despite the `_per_90` name, the values come straight from
# data_df['Goal Difference'] with no rescaling here -- confirm the intended unit.
goal_diff_per_90 = data_df['Goal Difference'].to_numpy().ravel()

# Design matrix X: everything in data_df except the target and the segment
# label, i.e. the per-player on/off indicator columns.
X = data_df.drop(columns=['Goal Difference', 'Segment Time'])

# Regularized Adjusted Plus-Minus (RAPM): ridge regression of the target on
# the player indicators, solved via SVD.
ridge_model = Ridge(solver='svd', alpha=1.0).fit(X, goal_diff_per_90)

# One coefficient per player column; these are the adjusted plus-minus values.
player_ids = X.columns
player_contributions = ridge_model.coef_

# Pair each player with their coefficient in a tidy two-column frame.
adjusted_plus_minus = pd.DataFrame(
    {
        'player': player_ids,
        'plus_minus': player_contributions,
    }
)

# Output the result
print(adjusted_plus_minus)

                 player    plus_minus
0   Lucas Fernandez-kim  7.894613e-03
1       Colton Franklin -2.701676e-02
2         Cason Goodman -5.596618e-19
3          Luke Goodman -1.818958e-02
4        Carson Hammond -4.087902e-18
5       Zack Lillington -6.710738e-19
6             Cole Pond  0.000000e+00
7           Chase Tanon  2.016323e-02
8    Keegan Walwyn-bent  1.141522e-02
9           Kevin Welch  3.776628e-02
10        Mekhai Wilson  0.000000e+00
11           Scott Buie -7.894613e-03
12         Andrew Dutra -1.141522e-02
13          Gavin House  1.818958e-02
14            Jason Hsu  2.701676e-02
15      Rafael Matiello  1.818958e-02
16        Zachary Neuls -3.776628e-02
17       Ian Ngonethong -2.016323e-02
18        Milan Uncanin -2.016323e-02


In [168]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge

# Same ridge fit as the previous cell, printed with fixed six-decimal floats.

# Target vector: 'Goal Difference' as a flat 1D NumPy array.
goal_diff_per_90 = data_df['Goal Difference'].to_numpy().ravel()

# Design matrix X: per-player on/off columns (target and segment label removed).
X = data_df.drop(columns=['Goal Difference', 'Segment Time'])

# Ridge regression (RAPM) with the SVD solver; `fit` returns the fitted model.
ridge_model = Ridge(solver='svd', alpha=1.0).fit(X, goal_diff_per_90)

# Coefficients are the adjusted plus-minus values, one per column of X.
player_ids = X.columns
player_contributions = ridge_model.coef_
adjusted_plus_minus = pd.DataFrame(
    {
        'player': player_ids,
        'plus_minus': player_contributions,
    }
)

# Global pandas option: render floats with six decimals instead of scientific
# notation. This stays in effect for every later display in the kernel.
pd.set_option('display.float_format', '{:.6f}'.format)

# Output the result
print(adjusted_plus_minus)


                 player  plus_minus
0   Lucas Fernandez-kim    0.007895
1       Colton Franklin   -0.027017
2         Cason Goodman   -0.000000
3          Luke Goodman   -0.018190
4        Carson Hammond   -0.000000
5       Zack Lillington   -0.000000
6             Cole Pond    0.000000
7           Chase Tanon    0.020163
8    Keegan Walwyn-bent    0.011415
9           Kevin Welch    0.037766
10        Mekhai Wilson    0.000000
11           Scott Buie   -0.007895
12         Andrew Dutra   -0.011415
13          Gavin House    0.018190
14            Jason Hsu    0.027017
15      Rafael Matiello    0.018190
16        Zachary Neuls   -0.037766
17       Ian Ngonethong   -0.020163
18        Milan Uncanin   -0.020163


In [169]:
data_df

Unnamed: 0,Segment Time,Goal Difference,Lucas Fernandez-kim,Colton Franklin,Cason Goodman,Luke Goodman,Carson Hammond,Zack Lillington,Cole Pond,Chase Tanon,...,Kevin Welch,Mekhai Wilson,Scott Buie,Andrew Dutra,Gavin House,Jason Hsu,Rafael Matiello,Zachary Neuls,Ian Ngonethong,Milan Uncanin
0,48,0.020833,1,1,1,1,1,1,1,1,...,1,1,0,0,0,0,0,0,0,0
1,54,0.166667,1,0,1,1,1,1,1,1,...,1,1,0,0,0,1,0,0,0,0
2,60,0.0,1,0,1,1,1,1,1,1,...,0,1,0,0,0,1,0,1,0,0
3,64,0.0,1,0,1,1,1,1,1,1,...,0,1,0,1,0,1,0,1,0,0
4,76,0.0,0,0,1,1,1,1,1,1,...,0,1,1,1,0,1,0,1,0,0
5,85,-0.111111,0,0,1,1,1,1,1,0,...,0,1,1,1,0,1,0,1,1,1
6,90,0.0,0,0,1,0,1,1,1,0,...,0,1,1,1,1,1,1,1,1,1
