In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import json
import os
import requests
from bs4 import BeautifulSoup
import re
import unidecode

In [21]:
# URL of the page you want to scrape
def get_game_ids(day, division, year=2024, timeout=30):
    """Scrape one day's NCAA men's soccer scoreboard page and return its game IDs.

    Args:
        day: Date path segment appended to the URL, e.g. ``'08/22'``.
        division: Division path segment, e.g. ``'d1'``.
        year: Season year used in the scoreboard URL (defaults to 2024).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list[str]: One ID per ``a.gamePod-link`` anchor found on the page.
        An empty list is returned when the request fails, the status code
        is not 200, or the page has no game links.
    """
    url = f"https://www.ncaa.com/scoreboard/soccer-men/{division}/{year}/{day}"
    print(url)

    # A timeout keeps one stalled connection from hanging the whole scrape.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        print(f"Failed to retrieve the webpage: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    game_ids = []
    for link in soup.find_all('a', class_='gamePod-link'):
        # The ID is the third '/'-separated segment of the href (index 2);
        # anchors without an href or with a shorter path are skipped.
        segments = link.get('href', '').split('/')
        if len(segments) > 2 and segments[2]:
            game_ids.append(segments[2])

    return game_ids


In [3]:
def clean_data(data):
    """Flatten one boxscore payload into a list of per-player stat records.

    Args:
        data: Parsed boxscore JSON. Reads ``data['meta']['teams'][0..1]``
            (``id``, ``shortName``) and ``data['teams']`` (``teamId``,
            ``playerStats``).

    Returns:
        list[dict]: One dict per player with the keys ``Name``, ``Position``,
        ``Minutes Played``, ``Goals``, ``Assists``, ``Shots``,
        ``Shots On Target`` and ``Team``.

    Raises:
        KeyError / IndexError: If an expected field is missing.
        ValueError: If a numeric stat cannot be converted with ``int``.
    """
    meta_teams = data['meta']['teams']

    # IDs are compared as strings because the two sections of the payload may
    # encode them differently; the first listed team wins on a duplicate ID.
    name_by_team_id = {}
    for meta_team in (meta_teams[0], meta_teams[1]):
        name_by_team_id.setdefault(str(meta_team['id']), meta_team['shortName'])

    players_list = []

    for team in data['teams']:
        team_id = str(team['teamId'])
        if team_id not in name_by_team_id:
            print(f"Unexpected Team ID: {team_id}")
            continue
        team_name = name_by_team_id[team_id]

        for player in team['playerStats']:
            full_name = clean_name(f"{player['firstName']} {player['lastName']}")

            # Only the fields that end up in the record are read; card counts
            # were previously looked up but never used.
            players_list.append({
                'Name': full_name,
                'Position': player['position'],
                'Minutes Played': int(player['minutesPlayed']),
                'Goals': int(player['goals']),
                'Assists': int(player['assists']),
                'Shots': int(player['shots']),
                'Shots On Target': int(player['shotsOnGoal']),
                'Team': team_name
            })

    return players_list


In [4]:
def time_to_whole_minutes(time_str):
    """Return the minutes part of an ``MM:SS`` string as an int.

    The seconds part is validated as an integer and then discarded. A string
    that does not split into exactly two integer fields raises ``ValueError``.
    """
    minute_part, second_part = time_str.split(':')
    int(second_part)  # validation only; the value is not used
    return int(minute_part)

In [5]:
def collect_data(game_ids, timeout=30):
    """Download and clean the boxscore of every game in ``game_ids``.

    Each boxscore is fetched with ``requests`` first; if that fails, ``curl``
    is tried as a best-effort fallback. Games whose data cannot be fetched,
    lack a ``'meta'`` key, or cannot be cleaned are reported via ``print``
    and skipped.

    Args:
        game_ids: Iterable of game IDs to fetch.
        timeout: Seconds allowed for each HTTP request / curl invocation.

    Returns:
        list[list[dict]]: One list of player records (as returned by
        ``clean_data``) per successfully processed game.
    """
    import subprocess  # local import: only the curl fallback needs it

    players_data = []

    for game_id in game_ids:
        url = f'https://data.ncaa.com/casablanca/game/{game_id}/boxscore.json'
        data = None

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as request_error:
            # ValueError covers a non-JSON body on requests versions whose
            # decode error is not a RequestException.
            print(f"Error fetching data via requests for game ID {game_id}: {request_error}")

            try:
                # Argument-list form: the scraped game ID never reaches a shell.
                completed = subprocess.run(
                    ['curl', '-s', url],
                    capture_output=True,
                    text=True,
                    timeout=timeout,
                )
                data = json.loads(completed.stdout)
            except (OSError, subprocess.SubprocessError, ValueError) as curl_error:
                print(f"Error fetching data for game ID {game_id} using curl: {curl_error}")
                continue  # Skip to the next game ID

        if not isinstance(data, dict) or 'meta' not in data:
            print(f"Error: 'meta' key not found in data for game ID {game_id}")
            continue

        # One malformed payload should not abort the whole batch.
        try:
            players_data.append(clean_data(data))
        except (KeyError, IndexError, TypeError, ValueError) as clean_error:
            print(f"Error cleaning data for game ID {game_id}: {clean_error!r}")

    return players_data


In [6]:
def categorize_event(event):
    """Map a play-by-play text line to a coarse event label.

    The markers are checked in order and the first one contained in ``event``
    decides the label; text matching none of them is labelled ``'Other'``.
    """
    ordered_markers = (
        ('Goal by', 'Goal'),
        ('Shot by', 'Shot'),
        ('Foul on', 'Foul'),
        ('Corner kick', 'Corner Kick'),
        ('Offside', 'Offside'),
    )
    for marker, label in ordered_markers:
        if marker in event:
            return label
    return 'Other'

In [7]:
def extract_player(event):
    """Return the first name-like substring of ``event``, or ``None``.

    A name is two capitalised words (an upper-case letter followed by
    lower-case letters), optionally separated by a comma and/or whitespace,
    so both "Lastname, Firstname" and "Firstname Lastname" are recognised.
    """
    first_hit = re.search(r'\b[A-Z][a-z]+,?\s*[A-Z][a-z]+', event)
    if first_hit is None:
        return None
    return first_hit.group(0)

In [8]:
def collect_fouls_won(game_ids, timeout=30):
    """Count foul events per player and team from play-by-play data.

    Each game's play-by-play JSON is fetched with ``requests`` first, with
    ``curl`` as a best-effort fallback. Every play whose text contains "foul"
    (case-insensitive) is attributed to the first name found in that text by
    ``extract_player`` (normalised with ``clean_name``) and to the team whose
    text field carried it.

    Args:
        game_ids: Iterable of game IDs to fetch.
        timeout: Seconds allowed for each HTTP request / curl invocation.

    Returns:
        pandas.DataFrame: Columns ``Name``, ``Team``, ``Fouls`` with one row
        per (name, team) per game. Empty, with the same columns, when no foul
        was found. ``Name`` is ``''`` for plays with no recognisable name.
    """
    import subprocess  # local import: only the curl fallback needs it

    foul_data = []
    for game_id in game_ids:
        url = f'https://data.ncaa.com/casablanca/game/{game_id}/pbp.json'
        data = None

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as request_error:
            # ValueError covers a non-JSON body on requests versions whose
            # decode error is not a RequestException.
            print(f"Error fetching data via requests for game ID {game_id}: {request_error}")

            try:
                # Argument-list form: the scraped game ID never reaches a shell.
                completed = subprocess.run(
                    ['curl', '-s', url],
                    capture_output=True,
                    text=True,
                    timeout=timeout,
                )
                data = json.loads(completed.stdout)
            except (OSError, subprocess.SubprocessError, ValueError) as curl_error:
                print(f"Error fetching data for game ID {game_id} using curl: {curl_error}")
                continue  # Skip to the next game ID

        # Validate the JSON structure
        if not isinstance(data, dict) or 'meta' not in data or 'periods' not in data:
            print(f"Invalid data for game ID {game_id}")
            continue

        # Team names are taken by position: index 0 is used for homeText
        # plays, index 1 for visitorText plays.
        home = data['meta']['teams'][0]['shortName']
        away = data['meta']['teams'][1]['shortName']

        plays = []
        for period in data['periods']:
            for play in period['playStats']:
                if play['visitorText']:
                    plays.append({'Event': play['visitorText'], 'Team': away})
                else:
                    # Missing/None text becomes '' so the string operations
                    # below never see a non-string.
                    plays.append({'Event': play['homeText'] or '', 'Team': home})

        # Without plays there is no 'Event' column to work with.
        if not plays:
            continue

        plays_df = pd.DataFrame(plays)
        is_foul = plays_df['Event'].str.contains('Foul', case=False, na=False)
        foul_plays = plays_df[is_foul].copy()
        if foul_plays.empty:
            continue

        foul_plays['Name'] = foul_plays['Event'].apply(extract_player).apply(clean_name)
        foul_summary = (
            foul_plays.groupby(['Name', 'Team']).size().reset_index(name='Fouls')
        )
        foul_data.append(foul_summary)

    # Combine all games' foul summaries into a single DataFrame
    if foul_data:
        return pd.concat(foul_data, ignore_index=True)
    return pd.DataFrame(columns=['Name', 'Team', 'Fouls'])

In [9]:
from unidecode import unidecode

def clean_name(name):
    """Normalise a player name to ASCII, title-cased "First Last" form.

    Falsy input (``None``, ``''``) yields ``''``. A name containing ``', '``
    is split on it and the parts are joined in reverse order with spaces,
    turning "Last, First" into "First Last".
    """
    if not name:
        return ""
    if ', ' in name:
        pieces = name.split(', ')
        pieces.reverse()
        name = ' '.join(pieces)
    # Transliterate to ASCII first, then trim and title-case.
    return unidecode(name).strip().title()

In [13]:
from rapidfuzz import process, fuzz
from unidecode import unidecode

# Preprocess names to simplify matching
def preprocess_name(name):
    """Return ``name`` transliterated to ASCII, stripped and lower-cased.

    Falsy input (``None``, ``''``) yields ``''``.
    """
    return unidecode(name).strip().lower() if name else ""

# Enhanced standardize_names function
def standardize_names(names, similarity_threshold=85):
    """Collapse near-duplicate names onto one canonical spelling.

    Names are preprocessed with ``preprocess_name`` and visited in order of
    first appearance. A name whose best ``fuzz.token_set_ratio`` score against
    the names seen so far is strictly greater than ``similarity_threshold``
    takes that earlier name's canonical form; otherwise it becomes canonical
    itself.

    Visiting in first-appearance order (rather than iterating a ``set``) keeps
    the result reproducible: set order for strings depends on hash
    randomisation and can change which spelling is chosen between runs.

    Args:
        names: Iterable of raw names.
        similarity_threshold: Score (0-100) a match must exceed to be merged.

    Returns:
        list[str]: Canonical preprocessed name for each input, in input order.
    """
    processed_names = [preprocess_name(name) for name in names]

    canonical_by_name = {}
    # dict.fromkeys de-duplicates while preserving first-seen order.
    for name in dict.fromkeys(processed_names):
        # extractOne returns None when there are no choices yet.
        match = process.extractOne(
            name, list(canonical_by_name), scorer=fuzz.token_set_ratio
        )
        if match and match[1] > similarity_threshold:
            canonical_by_name[name] = canonical_by_name[match[0]]
        else:
            canonical_by_name[name] = name

    return [canonical_by_name[processed] for processed in processed_names]

In [14]:
def create_name_mapping(*name_lists, similarity_threshold=90):
    """Build one preprocessed-name -> canonical-name mapping for several lists.

    All names are preprocessed with ``preprocess_name`` and visited in order
    of first appearance across ``name_lists``. A name whose best
    ``fuzz.WRatio`` score against the names mapped so far is strictly greater
    than ``similarity_threshold`` takes that earlier name's canonical form;
    otherwise it maps to itself.

    Visiting in first-appearance order (rather than iterating a ``set``) keeps
    the mapping reproducible: set order for strings depends on hash
    randomisation and can change which spelling is chosen between runs.

    Args:
        *name_lists: Any number of iterables of raw names.
        similarity_threshold: Score (0-100) a match must exceed to be merged.

    Returns:
        dict[str, str]: Maps every preprocessed name to its canonical form.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    ordered_unique_names = dict.fromkeys(
        preprocess_name(name) for names in name_lists for name in names
    )

    standardized_names = {}
    for name in ordered_unique_names:
        # Match against a snapshot of the names mapped so far; extractOne
        # returns None when there are no choices yet.
        match = process.extractOne(name, list(standardized_names), scorer=fuzz.WRatio)
        if match and match[1] > similarity_threshold:
            standardized_names[name] = standardized_names[match[0]]
        else:
            standardized_names[name] = name

    return standardized_names

# Apply the global mapping
def apply_name_mapping(names, name_mapping):
    """Translate each name through ``name_mapping`` after preprocessing it.

    Returns a list in the same order as ``names``. Raises ``KeyError`` when a
    preprocessed name is not a key of ``name_mapping``.
    """
    mapped_names = []
    for raw_name in names:
        lookup_key = preprocess_name(raw_name)
        mapped_names.append(name_mapping[lookup_key])
    return mapped_names


In [15]:
# Inclusive window of calendar days to scrape.
start_date = '2024-08-22'
end_date = '2024-12-16'

# One timestamp per day in the window.
date_range = pd.date_range(start=start_date, end=end_date)

# 'MM/DD' strings, one per day, in chronological order.
time_range = [day.strftime('%m/%d') for day in date_range]

In [22]:
# Scrape every day in `time_range`, then aggregate per-player stats and foul
# counts into `player_stats`.
# NOTE(review): this cell makes one scoreboard request per day plus a boxscore
# and a play-by-play request per game, so a full run is slow; consider caching
# the raw results to disk.

# Per-day accumulators; both names are rebound to concatenated DataFrames
# after the loop.
dfs = []
fouls = []
division = 'd1'

# Iterate over the date range to collect game data
for day in time_range:
    game_ids = get_game_ids(day, division)
    # Collect player data and fouls
    players_data = collect_data(game_ids)
    fouls_won = collect_fouls_won(game_ids)

    # collect_data returns one list of player dicts per game; flatten to one
    # list of player dicts for the day.
    flattened_data = [player for game in players_data for player in game]
    
    # Create DataFrames for player data and fouls
    df = pd.DataFrame(flattened_data)
    fouls.append(fouls_won)
    dfs.append(df)
    print(day, "Done!")

# Concatenate all the collected data
dfs = pd.concat(dfs, ignore_index=True)
fouls = pd.concat(fouls, ignore_index=True)

# Clean and standardize names
dfs['Name'] = dfs['Name'].apply(clean_name)
fouls['Name'] = fouls['Name'].apply(clean_name)

# Build one mapping from the names of both frames so the same player gets the
# same canonical spelling in each.
all_names = list(dfs['Name']) + list(fouls['Name'])
name_mapping = create_name_mapping(all_names)

# Standardize names using the unified mapping
dfs['Name'] = apply_name_mapping(dfs['Name'], name_mapping)
fouls['Name'] = apply_name_mapping(fouls['Name'], name_mapping)

# Aggregate each (Name, Team) pair across all games.
# NOTE(review): sum() is also applied to the string 'Position' column —
# presumably this concatenates the per-game position strings; confirm.
final_df = dfs.groupby(['Name', 'Team'], as_index=False).sum()
fouls = fouls.groupby(['Name', 'Team'], as_index=False).sum()

# Filter out invalid names
player_stats = final_df[final_df['Name'].notnull() & (final_df['Name'].str.strip() != '')]

# Left merge keeps every player row; players with no matching foul row get
# NaN in 'Fouls', which becomes 0 in 'Fouls Won' below.
player_stats = pd.merge(player_stats, fouls, on=['Name', 'Team'], how='left', suffixes=('', '_fouls'))
player_stats['Fouls Won'] = player_stats['Fouls']
player_stats['Fouls Won'] = player_stats['Fouls Won'].fillna(0)

# Keep only the reporting columns (this drops the merged 'Fouls' column).
player_stats = player_stats[['Name', 'Team', 'Position', 'Minutes Played', 'Goals', 'Assists', 'Shots', 'Shots On Target', 'Fouls Won']]


https://www.ncaa.com/scoreboard/soccer-men/d1/2024/08/22
[]
08/22 Done!
https://www.ncaa.com/scoreboard/soccer-men/d1/2024/08/23
[]
08/23 Done!
https://www.ncaa.com/scoreboard/soccer-men/d1/2024/08/24
[]
08/24 Done!
https://www.ncaa.com/scoreboard/soccer-men/d1/2024/08/25
[]
08/25 Done!
https://www.ncaa.com/scoreboard/soccer-men/d1/2024/08/26
[]
08/26 Done!
https://www.ncaa.com/scoreboard/soccer-men/d1/2024/08/27
[]
08/27 Done!
https://www.ncaa.com/scoreboard/soccer-men/d1/2024/08/28
[]
08/28 Done!
https://www.ncaa.com/scoreboard/soccer-men/d1/2024/08/29
[]
08/29 Done!
https://www.ncaa.com/scoreboard/soccer-men/d1/2024/08/30
[]
08/30 Done!
https://www.ncaa.com/scoreboard/soccer-men/d1/2024/08/31
[]
08/31 Done!
https://www.ncaa.com/scoreboard/soccer-men/d1/2024/09/01
[]
09/01 Done!
https://www.ncaa.com/scoreboard/soccer-men/d1/2024/09/02
[]
09/02 Done!
https://www.ncaa.com/scoreboard/soccer-men/d1/2024/09/03


KeyboardInterrupt: 

In [17]:
# Work on a copy so the cells below can modify `df` without altering `player_stats`.
df = player_stats.copy()

NameError: name 'player_stats' is not defined

In [None]:
from collections import Counter

In [None]:
# Update fouls based on subset matches
def update_fouls_for_subset_names(df, fouls):
    """Back-fill ``'Fouls Won'`` for players whose foul rows use a partial name.

    For every row of ``df`` whose ``'Fouls Won'`` is 0, foul rows of the same
    team are searched for a name that contains the player's name or is
    contained in it; the ``'Fouls'`` of all such rows are summed into
    ``'Fouls Won'``.

    Empty or non-string names are ignored on both sides. An empty string is a
    substring of every name, so without this guard fouls that could not be
    attributed to anyone would be credited to every zero-foul player on the
    team.

    Args:
        df: Player stats with columns ``Name``, ``Team`` and ``Fouls Won``.
        fouls: Foul counts with columns ``Name``, ``Team`` and ``Fouls``.

    Returns:
        pandas.DataFrame: A modified copy of ``df``; the input is not mutated.
    """
    df = df.copy()

    def _usable(name):
        return isinstance(name, str) and name.strip() != ''

    # astype(bool) keeps the mask a valid boolean indexer when `fouls` is empty.
    named_fouls = fouls[fouls['Name'].map(_usable).astype(bool)]

    # Group once per team instead of re-filtering the whole frame per player.
    fouls_by_team = {team: group for team, group in named_fouls.groupby('Team')}

    for idx, row in df.iterrows():
        if row['Fouls Won'] != 0:
            continue

        player_name = row['Name']
        if not _usable(player_name):
            continue

        team_fouls = fouls_by_team.get(row['Team'])
        if team_fouls is None:
            continue

        is_match = team_fouls['Name'].map(
            lambda other: player_name in other or other in player_name
        )
        if is_match.any():
            df.at[idx, 'Fouls Won'] = team_fouls.loc[is_match, 'Fouls'].sum()

    return df

# Fill in 'Fouls Won' for players that still show 0 by matching partial names
# within the same team against the aggregated `fouls` frame.
df = update_fouls_for_subset_names(df, fouls)


In [None]:
# Function to calculate dominant position
def dominant_position(pos):
    """Return the most frequent position mentioned in a position string.

    ``pos`` may hold several position labels run together (for example
    ``"MMD"`` or ``"DefenderMidfielderMidfielder"``). The string is scanned
    case-insensitively for the full words Midfielder/Defender/Forward/
    Goalkeeper and the single-letter codes M/D/F/G. Full words are matched
    before single letters, so the letters inside a word are not counted again
    (otherwise "Midfielder" would add two to Defender and one to Forward).

    Args:
        pos: Position string; any non-string or empty value is "Unknown".

    Returns:
        str: ``'Midfielder'``, ``'Defender'``, ``'Forward'``, ``'Goalkeeper'``
        or ``'Unknown'``. Ties go to the earliest position in that order.
    """
    # The type check also covers NaN/None, which are not strings.
    if not isinstance(pos, str) or len(pos) == 0:
        return "Unknown"

    # Define position mapping and their keywords
    position_keywords = {
        'Midfielder': ['M', 'Midfielder'],
        'Defender': ['D', 'Defender'],
        'Forward': ['F', 'Forward'],
        'Goalkeeper': ['G', 'Goalkeeper']
    }

    position_by_keyword = {
        keyword.upper(): position
        for position, keywords in position_keywords.items()
        for keyword in keywords
    }

    # Longest keywords first so the alternation prefers a full word over its
    # initial letter at the same starting point.
    alternation = '|'.join(
        re.escape(keyword)
        for keyword in sorted(position_by_keyword, key=len, reverse=True)
    )

    position_counts = {position: 0 for position in position_keywords}
    for token in re.findall(alternation, pos.upper()):
        position_counts[position_by_keyword[token]] += 1

    # Handle cases where no valid position-related keywords exist
    if all(count == 0 for count in position_counts.values()):
        return "Unknown"

    # max() returns the first key with the highest count, preserving the
    # dictionary order as the tie-break.
    return max(position_counts, key=position_counts.get)

# Collapse each player's Position string to a single label.
df['Position'] = df['Position'].apply(dominant_position)
# Title-case the names for display (apply_name_mapping returns the
# lower-cased forms produced by preprocess_name).
df['Name'] = df['Name'].apply(lambda x: x.title())


In [None]:
# Remove the row limit so displayed DataFrames are not truncated.
pd.set_option('display.max_rows', None)

In [None]:
#df.to_csv('d2_player_stats.csv')