In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import json
import os
import requests
from bs4 import BeautifulSoup

In [2]:
def _team_name_and_score(team_item):
    """Return ``(name, score)`` text for one team entry, or ``None`` if either span is missing."""
    name_tag = team_item.find('span', class_='gamePod-game-team-name')
    score_tag = team_item.find('span', class_='gamePod-game-team-score')
    if name_tag is None or score_tag is None:
        return None
    return name_tag.text.strip(), score_tag.text.strip()


def get_scores(day, timeout=30):
    """Scrape the final scores listed on the D1 men's soccer scoreboard for one day.

    Parameters
    ----------
    day : str
        Date fragment inserted verbatim into the scoreboard URL (e.g. ``'08/22'``).
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    list of dict
        One dict per finished game with keys ``home_team``, ``home_team_score``,
        ``away_team`` and ``away_team_score``. Scores are the stripped text found
        on the page (strings, not ints). An empty list is returned when the page
        cannot be fetched.
    """
    url = f"https://www.ncaa.com/scoreboard/soccer-men/d1/2024/{day}/all-conf"

    # A timeout keeps one stalled connection from hanging the whole season loop;
    # connection problems are reported and treated like any other failed fetch.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching the page: {exc}")
        return []

    if response.status_code != 200:
        print("Error fetching the page.")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Only game pods whose class attribute marks them as final are considered
    games = soup.find_all('div', class_='gamePod gamePod-type-game status-final')

    scores = []
    for game in games:
        teams = game.find_all('li')
        if len(teams) < 2:  # need both sides to record a result
            continue

        # The first entry is treated as the away side, the second as the home side
        away = _team_name_and_score(teams[0])
        home = _team_name_and_score(teams[1])
        if away is None or home is None:
            # Pod without a name/score span: skip it instead of crashing on None.text
            continue

        scores.append({
            'home_team': home[0],
            'home_team_score': home[1],
            'away_team': away[0],
            'away_team_score': away[1],
        })

    return scores


In [3]:
# Season window to scrape (both end dates are included)
start_date = '2024-08-22'
end_date = '2024-12-16'

# One timestamp per calendar day in the window
date_range = pd.date_range(start=start_date, end=end_date)

# get_scores() drops the day straight into the URL, so format it as MM/DD
time_range = date_range.strftime('%m/%d').tolist()

# Scrape every day, then stack the per-day tables into a single frame
daily_score_frames = [pd.DataFrame(get_scores(day)) for day in time_range]
dfs = pd.concat(daily_score_frames, ignore_index=True)

In [4]:
# Rankings page passed to scrape_conferences() below; its table supplies the School/Conference columns
url = 'https://www.ncaa.com/rankings/soccer-men/d1/ncaa-mens-soccer-rpi'

import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_conferences(url, timeout=30):
    """Fetch a page and return the header texts and body rows of its first ``<table>``.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    tuple of (list of str, list of list of str)
        ``(headers, rows)``: the stripped text of every ``<th>`` in the table and,
        for each ``<tr>`` after the first, the stripped text of its ``<td>`` cells.
        Rows without any ``<td>`` cell are skipped. ``([], [])`` is returned when
        the page cannot be fetched, contains no table, or parsing fails.
    """
    try:
        # Without a timeout a stalled server would block the notebook indefinitely
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            print("Error fetching the page.")
            return [], []

        soup = BeautifulSoup(response.text, 'html.parser')

        stats_table = soup.find('table')
        if stats_table is None:
            print("Statistics table not found.")
            return [], []

        headers = [header.text.strip() for header in stats_table.find_all('th')]

        rows = []
        for row in stats_table.find_all('tr')[1:]:  # skip the header row
            cells = [cell.text.strip() for cell in row.find_all('td')]
            if cells:  # rows without <td> cells would only add empty records
                rows.append(cells)

        return headers, rows

    # raise_for_status() is never called, so no separate HTTPError branch is needed;
    # RequestException covers timeouts and connection failures.
    except requests.RequestException as e:
        print(f'Request exception: {e}')
        return [], []
    except Exception as e:
        print(f'An error occurred: {e}')
        return [], []

# Scrape the rankings table and keep only the school -> conference mapping
headers, rows = scrape_conferences(url)

if headers and rows:
    conferences = pd.DataFrame(rows, columns=headers)
else:
    print("No data was scraped.")
    # Fall back to an empty lookup so this cell (and the merges that use it)
    # still run instead of failing with a NameError on `conferences`.
    conferences = pd.DataFrame(columns=['School', 'Conference'])

# Downstream cells join on a column called 'Team'
conferences = conferences[['School', 'Conference']].rename(columns={'School': 'Team'})

In [5]:
# Look up the conference of each side in turn (home first, then away).
# A left join keeps every game; teams absent from the lookup get a missing value.
for side in ('home', 'away'):
    dfs = (
        dfs.merge(conferences, left_on=f'{side}_team', right_on='Team', how='left')
           .rename(columns={'Conference': f'{side}_team_conference'})
           .drop(columns='Team')
    )

In [6]:
# Replace every missing value (e.g. a conference lookup that found no match) with a placeholder label
results = dfs.fillna(value="Not D1")

In [7]:
# Scores were scraped as text; cast both score columns to integers
for score_column in ('home_team_score', 'away_team_score'):
    results[score_column] = results[score_column].astype(int)

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import os

def get_game_ids(day, division, timeout=30):
    """Collect the game ids linked from one day's men's soccer scoreboard page.

    Parameters
    ----------
    day : str
        Date fragment inserted verbatim into the scoreboard URL (e.g. ``'08/22'``).
    division : str
        Division fragment inserted verbatim into the URL (e.g. ``'d1'``).
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    list of str
        The third ``/``-separated piece of each ``gamePod-link`` href (for an href
        such as ``/game/6309785`` that is ``'6309785'``). Links with no href or
        with too few path pieces are skipped. Empty when the page cannot be fetched.
    """
    url = f"https://www.ncaa.com/scoreboard/soccer-men/{division}/2024/{day}"

    # A timeout keeps one stalled connection from hanging the whole season loop
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the webpage: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    game_ids = []
    for link in soup.find_all('a', class_='gamePod-link'):
        href = link.get('href')
        parts = href.split('/') if href else []
        # Guard against anchors without an href or with an unexpected path shape
        if len(parts) > 2 and parts[2]:
            game_ids.append(parts[2])

    return game_ids

def collect_team_shots(game_ids, timeout=30):
    """Download each game's box score and return shots for/against per team.

    Parameters
    ----------
    game_ids : iterable of str
        Game ids inserted into the ``boxscore.json`` URL.
    timeout : float, optional
        Seconds allowed for each HTTP request (and for the ``curl`` fallback).

    Returns
    -------
    list of dict
        Two dicts per successfully parsed game (one per team) with keys ``team``,
        ``shots_for`` and ``shots_against``. Games whose box score cannot be
        fetched or does not have the expected layout are reported and skipped.
    """
    # Local import so this function does not depend on the notebook's import cell
    import subprocess

    game_data = []

    for game_id in game_ids:
        url = f'https://data.ncaa.com/casablanca/game/{game_id}/boxscore.json'
        data = None

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for game ID {game_id}: {e}")
            try:
                # Fallback download through curl. The game id comes from scraped
                # HTML, so pass an argument list (no shell) rather than building
                # a command string that a crafted id could inject into.
                completed = subprocess.run(
                    ['curl', '-s', url],
                    capture_output=True,
                    text=True,
                    timeout=timeout,
                )
                data = json.loads(completed.stdout)
            except Exception as curl_error:
                print(f"Error fetching data for game ID {game_id} using curl: {curl_error}")
                continue

        if not isinstance(data, dict) or 'meta' not in data:
            print(f"Error: 'meta' key not found in data for game ID {game_id}")
            continue

        try:
            # The first meta entry is treated as the home side, the second as away
            home_team_data = data['meta']['teams'][0]
            away_team_data = data['meta']['teams'][1]

            # Ids are compared as strings because the two sections may not use the same type
            home_id = str(home_team_data['id'])
            away_id = str(away_team_data['id'])
            home_name = home_team_data['shortName']
            away_name = away_team_data['shortName']

            home_shots = 0
            away_shots = 0

            for team in data['teams']:
                team_id = str(team['teamId'])
                total_shots = int(team['playerTotals']['shots'])

                if team_id == home_id:
                    home_shots = total_shots
                elif team_id == away_id:
                    away_shots = total_shots
        except (KeyError, IndexError, TypeError, ValueError) as layout_error:
            # One malformed box score must not abort the remaining games
            print(f"Error: unexpected boxscore layout for game ID {game_id}: {layout_error!r}")
            continue

        game_data.append({
            'team': home_name,
            'shots_for': home_shots,
            'shots_against': away_shots
        })

        game_data.append({
            'team': away_name,
            'shots_for': away_shots,
            'shots_against': home_shots
        })

    return game_data


# Collect one shots record per team per game across the whole season.
# The records go into their own list instead of rebinding `dfs`, so the score
# table built in the earlier cells is not clobbered and those cells stay re-runnable.
division = 'd1'
shot_records = []
for day in time_range:
    game_ids = get_game_ids(day, division)
    shot_records.extend(collect_team_shots(game_ids))
    print(day, "Done!")

# Build the frame once; naming the columns keeps 'team' present even if no
# box score could be collected, so the later groupby still works.
final_df = pd.DataFrame(shot_records, columns=['team', 'shots_for', 'shots_against'])

08/22 Done!
08/23 Done!
08/24 Done!
08/25 Done!
08/26 Done!
08/27 Done!
08/28 Done!
Error fetching data for game ID 6309785: 404 Client Error: Not Found for url: https://data.ncaa.com/casablanca/game/6309785/boxscore.json
Error: 'meta' key not found in data for game ID 6309785
08/29 Done!
08/30 Done!
08/31 Done!
09/01 Done!
09/02 Done!
09/03 Done!
09/04 Done!
09/05 Done!
09/06 Done!
09/07 Done!
09/08 Done!
09/09 Done!
09/10 Done!
09/11 Done!
09/12 Done!
09/13 Done!
09/14 Done!
09/15 Done!
09/16 Done!
09/17 Done!
09/18 Done!
09/19 Done!
09/20 Done!
09/21 Done!
09/22 Done!
09/23 Done!
09/24 Done!
09/25 Done!
Error fetching data for game ID 6310401: 404 Client Error: Not Found for url: https://data.ncaa.com/casablanca/game/6310401/boxscore.json
Error: 'meta' key not found in data for game ID 6310401
09/26 Done!
09/27 Done!
09/28 Done!
09/29 Done!
Error fetching data for game ID 6310430: 404 Client Error: Not Found for url: https://data.ncaa.com/casablanca/game/6310430/boxscore.json
Error:

In [9]:
# Write the merged score table to disk without the index column.
# NOTE(review): the file name says "d2" although the scores come from the d1 scoreboard URL — confirm the intended name.
results.to_csv('d2_ncaa_mens_scores_2024.csv', index=False)

In [None]:
# Distinct teams that appear as the home side in at least one game
teams = pd.unique(results['home_team'])

In [None]:
# Display the Team/Conference lookup table for a visual check
conferences

In [None]:
import pandas as pd

# Build the standings table from the match results with one grouped aggregation
# instead of updating the table row by row for every game.

# Each game contributes two records: one from the home side's point of view
# and one from the away side's (GF = goals for, GA = goals against).
home_view = results[['home_team', 'home_team_score', 'away_team_score']].rename(
    columns={'home_team': 'Team', 'home_team_score': 'GF', 'away_team_score': 'GA'}
)
away_view = results[['away_team', 'away_team_score', 'home_team_score']].rename(
    columns={'away_team': 'Team', 'away_team_score': 'GF', 'home_team_score': 'GA'}
)
team_games = pd.concat([home_view, away_view], ignore_index=True)
team_games['Wins'] = (team_games['GF'] > team_games['GA']).astype('int64')
team_games['Draws'] = (team_games['GF'] == team_games['GA']).astype('int64')
team_games['Losses'] = (team_games['GF'] < team_games['GA']).astype('int64')

# Season totals per team
team_totals = team_games.groupby('Team')[['Wins', 'Draws', 'Losses', 'GF', 'GA']].sum()

# Initialize standings: one row per team in the conference lookup
standings = pd.DataFrame(conferences, columns=['Team', 'Conference'])
standings['GP'] = 0
for column in ['Wins', 'Draws', 'Losses']:
    # Teams without any recorded game keep a total of 0
    standings[column] = standings['Team'].map(team_totals[column]).fillna(0).astype('int64')
standings['Points'] = 3 * standings['Wins'] + standings['Draws']  # 3 per win, 1 per draw
for column in ['GF', 'GA']:
    standings[column] = standings['Team'].map(team_totals[column]).fillna(0).astype('int64')

# Calculate games played
standings['GP'] = standings[['Wins', 'Draws', 'Losses']].sum(axis=1)

In [None]:
# Season totals per team: add up the remaining columns over all of a team's games
final_grouped_df = final_df.groupby('team', as_index=False).agg('sum')

In [None]:
# Keep only the teams listed in `teams` (those that hosted at least one game)
is_listed_team = final_grouped_df['team'].isin(teams)
final_grouped_df = final_grouped_df.loc[is_listed_team]
final_grouped_df

In [None]:
# Preview the first ten rows of the per-team shot totals
final_grouped_df.head(10)

In [None]:
# Bring the season shot totals into the standings table.
# Rebinding (instead of inplace=True) keeps the rename free of side effects on shared frames.
final_grouped_df = final_grouped_df.rename(columns={'team': 'Team'})
shot_totals = final_grouped_df[['Team', 'shots_for', 'shots_against']].rename(
    columns={'shots_for': 'Shots', 'shots_against': 'Shots Against'}
)

# Drop shot columns left over from a previous run of this cell so that
# re-running it does not append duplicate 'Shots' columns.
standings = (
    standings
    .drop(columns=['Shots', 'Shots Against'], errors='ignore')
    .merge(shot_totals, on='Team', how='left')
)

In [None]:
# TSR: the share of all shots in a team's games that the team itself took
shots_taken = standings['Shots']
shots_faced = standings['Shots Against']
standings['TSR'] = shots_taken / (shots_taken + shots_faced)

In [None]:
# Top five teams by attacking index.
# The 'ATT' column is only stored on `standings` by a later cell, so derive it
# here with the same formula; otherwise this cell fails on a fresh kernel.
goals_per_game = standings['GF'] / standings['GP']
standings.assign(ATT=goals_per_game / goals_per_game.mean()).sort_values(by='ATT', ascending=False).head(5)

In [None]:
# Attacking index: goals scored per game, relative to the mean of that rate across teams
goals_for_rate = standings['GF'] / standings['GP']
standings['ATT'] = goals_for_rate / goals_for_rate.mean()

In [None]:
# Defensive index: goals conceded per game, relative to the mean of that rate across teams
goals_against_rate = standings['GA'] / standings['GP']
standings['DEF'] = goals_against_rate / goals_against_rate.mean()