In [5]:
import pandas as pd
import numpy as np
import json
import os

In [3]:
import requests
from bs4 import BeautifulSoup

# URL of the page you want to scrape
url = "https://www.ncaa.com/scoreboard/soccer-men/d1/2024/09/18"

# Send a request to fetch the HTML content of the page
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all 'a' tags with class 'gamePod-link'
    game_links = soup.find_all('a', class_='gamePod-link')

    # Extract the href attribute from each 'a' tag
    hrefs = [link['href'] for link in game_links]

    # Print the list of hrefs
    print(hrefs)
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


['/game/6310249', '/game/6310253', '/game/6310252', '/game/6310256', '/game/6310254', '/game/6310255', '/game/6310250', '/game/6310251']


In [91]:
def clean_data(data):
    home_id = str(data['meta']['teams'][0]['id'])  # Convert to string
    away_id = str(data['meta']['teams'][1]['id'])  # Convert to string

    players_list = []

    for team in data['teams']:
        team_id = str(team['teamId'])  # Convert to string for comparison
        
        if team_id == home_id:
            team_type = 'Home'
        elif team_id == away_id:
            team_type = 'Away'
        else:
            print(f"Unexpected Team ID: {team_id}")
            continue

        for player in team['playerStats']:
            full_name = f"{player['firstName']} {player['lastName']}"
            minutes_played = player['minutesPlayed']

            players_list.append({
                'Name': full_name, 
                'Minutes Played': minutes_played, 
                'Team': team_type
            })

    return pd.DataFrame(players_list)

In [49]:
def create_matrix(df, goal_times):
    # Convert 'Minutes Played' to numeric, forcing errors to NaN
    df['Minutes Played'] = pd.to_numeric(df['Minutes Played'], errors='coerce')

    # Segment times up to 45 minutes and subtract from 90, then append 90 and insert 0
    segment_times = sorted(set([x for x in df['Minutes Played'] if x >= 45]))
    segment_times.insert(0, 0)

    # Normalize times to positive for the searchsorted
    goal_times = np.abs(goal_times)

    # Find the segment for each goal using searchsorted
    goal_segments = np.searchsorted(segment_times, goal_times)

    # Reapply negative sign to segments for away team goals
    goal_segments = np.where(np.array(goal_times) < 0, -goal_segments, goal_segments)

    # Prepare a list to store the results
    data = []

    # Loop through segments and calculate the goal difference
    for i in range(1, len(segment_times)):
        segment_time_start = segment_times[i-1]
        segment_time_end = segment_times[i]

        # Check if a goal occurred in this segment
        goal_diff = 0
        for j, goal_seg in enumerate(goal_segments):
            if abs(goal_seg) == i:  # If the goal falls in this segment
                if goal_times[j] > 0:  # Home goal
                    goal_diff += 1 / (segment_time_end - segment_time_start)
                else:  # Away goal
                    goal_diff -= 1 / (segment_time_end - segment_time_start)

        # Create a player status dictionary for this segment
        player_status = {}

        for name, minutes in zip(df['Name'], df['Minutes Played']):
            # Check if the player was on the field at this segment time
            if minutes > 45:
                if minutes >= segment_time_end:
                    player_status[name] = 1
                else:
                    player_status[name] = 0
            else:
                if minutes-1 > 90 - segment_time_end:
                    player_status[name] = 1
                else:
                    player_status[name] = 0


        # Append the segment, goal difference, and player status to the list
        data.append({
            'Segment Time': segment_time_end,
            'Goal Difference': goal_diff,
            **player_status  # Merge player status into the dictionary
        })

    # Convert the list of dictionaries to a DataFrame
    return pd.DataFrame(data)

In [31]:
def time_to_whole_minutes(time_str):
    minutes, _ = map(int, time_str.split(':'))  # Ignore seconds, only take minutes
    return minutes

In [43]:
def goal_segments(goals):
    home_team_id = None
    away_team_id = None

    for team in goals['meta']['teams']:
        if team['homeTeam'] == 'true':
            home_team_id = team['id']
        else:
            away_team_id = team['id']

    # Extract goal times and assign positive or negative value
    goal_minutes = []

    for period in goals['periods']:
        for event in period['summary']:
            if event['scoreType'] == 'GOAL':
                time_in_minutes = time_to_whole_minutes(event['time'])

                # Check if it's a home or away goal
                if event['teamId'] == home_team_id:
                    goal_minutes.append(time_in_minutes)  # Home team goal is positive
                elif event['teamId'] == away_team_id:
                    goal_minutes.append(-time_in_minutes)  # Away team goal is negative

    return list(set(goal_minutes))

In [96]:
def rapm_evaluate(matrix): 
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import Ridge

    # Convert the 'Goal Difference' column to a 1D NumPy array
    goal_diff_per_90 = matrix['Goal Difference'].to_numpy().reshape(-1)

    # Design matrix X: Player involvement matrix (on/off the pitch)
    X = matrix.drop(['Goal Difference', 'Segment Time'], axis=1)

    # Regularized Adjusted Plus-Minus (RAPM) using Ridge Regression with specified solver
    ridge_model = Ridge(alpha=1.0, solver='svd')  # Try 'svd', or 'sag'
    ridge_model.fit(X, goal_diff_per_90)

    # The coefficients represent the adjusted plus-minus values for each player
    player_contributions = ridge_model.coef_

    # Store the adjusted plus-minus values for players
    player_ids = X.columns
    adjusted_plus_minus = pd.DataFrame({'player': player_ids, 'plus_minus': player_contributions})

    # Set display options for pandas to show more decimal places
    pd.set_option('display.float_format', '{:.6f}'.format)

    # Output the result
    return adjusted_plus_minus


In [114]:
teams_data = []

for game_id in hrefs:
    result = os.popen(
    f'curl https://data.ncaa.com/casablanca/{game_id}/boxscore.json'
    ).read()

    data = json.loads(result)
    
    result = os.popen(
    f'curl https://data.ncaa.com/casablanca/{game_id}/scoringSummary.json'
    ).read()
    
    # get the team names
    for team in data['meta']['teams']:
        if team['homeTeam'] == 'true':
            home_team = team['shortName']
        else:
            away_team = team['shortName']
    
    goals = json.loads(result)    
    
    data = clean_data(data)
    
    #rate the home team
    home_data = data[data['Team'] == 'Home']

    goal_times = goal_segments(goals)
    
    matrix = create_matrix(home_data, goal_times)
        
    ratings = rapm_evaluate(matrix)

    teams_data.append({
        home_team : ratings.to_dict(orient='records')
    })

    #rate the away team
    
    away_data = data[data['Team'] == 'Away']

    goal_times = goal_segments(goals)
    
    matrix = create_matrix(away_data, goal_times)
    
    ratings = rapm_evaluate(matrix)
    
    teams_data.append({
        away_team : ratings.to_dict(orient='records')
    })
    
    
    
    
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Minutes Played'] = pd.to_numeric(df['Minutes Played'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Minutes Played'] = pd.to_numeric(df['Minutes Played'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Minutes Played'] = pd.to_numeric(df['Minutes Play

In [115]:
teams_data

[{'St. Bonaventure': [{'player': 'Gabriel Addo',
    'plus_minus': -4.560117679651692e-18},
   {'player': 'Rashid Aroza', 'plus_minus': -0.02283195679796696},
   {'player': 'Keegan Dawson', 'plus_minus': -0.007994493858534535},
   {'player': 'Bo Drath', 'plus_minus': -2.5517457705394782e-18},
   {'player': 'Xaver Ehrlich', 'plus_minus': 3.6571219601989494e-18},
   {'player': 'Mattia Ferretti', 'plus_minus': -3.1792777189302775e-19},
   {'player': 'Luc Finelli', 'plus_minus': -0.007954786107581538},
   {'player': 'Andres Javitt', 'plus_minus': -0.0011515247776365885},
   {'player': 'Dean Mercer', 'plus_minus': 0.004539919525624741},
   {'player': 'Nicolas Pucci', 'plus_minus': 0.0},
   {'player': 'Mo Tall', 'plus_minus': 0.014771283354510802},
   {'player': 'Umechi Akuazaoku', 'plus_minus': 0.007954786107581538},
   {'player': 'Kenzo Brito', 'plus_minus': 0.007954786107581538},
   {'player': 'Daniel Helle', 'plus_minus': -0.014771283354510802},
   {'player': 'Callum Shillington', 'plus_