In [5]:
# Provided team stats for all Sweet Sixteen teams
team_stats = {
    'North Carolina Tar Heels': {'scoring_defense': 68.5, 'scoring_offense': 82.0, 'assists': 500, 'fg_pct': 0.472, 'three_pt_pct': None, 'rebounds': 39.2, 'turnovers': 11.1, 'seed': 8, 'offensive_efficiency': 1.117},
    'Iowa State Cyclones': {'scoring_defense': 61.2, 'scoring_offense': 75.0, 'assists': 400, 'fg_pct': 0.461, 'three_pt_pct': None, 'rebounds': 36.8, 'turnovers': 10.8, 'seed': 6, 'offensive_efficiency': 1.095},
    'NC State Wolfpack': {'scoring_defense': 70.0, 'scoring_offense': 77.0, 'assists': 450, 'fg_pct': 0.455, 'three_pt_pct': 0.340, 'rebounds': 37.5, 'turnovers': 12.0, 'seed': 11, 'offensive_efficiency': 1.078},
    'Gonzaga Bulldogs': {'scoring_defense': 69.0, 'scoring_offense': 85.0, 'assists': 571, 'fg_pct': 0.490, 'three_pt_pct': 0.375, 'rebounds': 38.0, 'turnovers': 10.5, 'seed': 3, 'offensive_efficiency': 1.181},
    'Arizona Wildcats': {'scoring_defense': 64.9, 'scoring_offense': 87.6, 'assists': 651, 'fg_pct': 0.549, 'three_pt_pct': 0.373, 'rebounds': None, 'turnovers': None, 'seed': 2, 'offensive_efficiency': 1.146},
    'Illinois Fighting Illini': {'scoring_defense': 65.9, 'scoring_offense': 84.6, 'assists': 450, 'fg_pct': None, 'three_pt_pct': None, 'rebounds': None, 'turnovers': None, 'seed': 4, 'offensive_efficiency': 1.170},
    'Tennessee Volunteers': {'scoring_defense': 67.0, 'scoring_offense': 79.1, 'assists': 567, 'fg_pct': None, 'three_pt_pct': None, 'rebounds': None, 'turnovers': None, 'seed': 9, 'offensive_efficiency': 1.102},
    'Purdue Boilermakers': {'scoring_defense': 67.1, 'scoring_offense': 83.9, 'assists': 659, 'fg_pct': 0.562, 'three_pt_pct': 0.392, 'rebounds': 2054.0, 'turnovers': None, 'seed': 5, 'offensive_efficiency': 1.185},
    'Marquette Golden Eagles': {'scoring_defense': 68.0, 'scoring_offense': 80.0, 'assists': 450, 'fg_pct': 0.555, 'three_pt_pct': None, 'rebounds': 2196.0, 'turnovers': None, 'seed': 19, 'offensive_efficiency': 1.106},
    'Creighton Bluejays': {'scoring_defense': 64.0, 'scoring_offense': 80.6, 'assists': 575, 'fg_pct': 0.574, 'three_pt_pct': 0.365, 'rebounds': 2043.0, 'turnovers': None, 'seed': 6, 'offensive_efficiency': 1.142},
    'Duke Blue Devils': {'scoring_defense': 66.5, 'scoring_offense': 79.8, 'assists': 526, 'fg_pct': None, 'three_pt_pct': 0.380, 'rebounds': 2004.0, 'turnovers': None, 'seed': 22, 'offensive_efficiency': 1.159},
    'Clemson Tigers': {'scoring_defense': 65.9, 'scoring_offense': 74.0, 'assists': 450, 'fg_pct': None, 'three_pt_pct': None, 'rebounds': None, 'turnovers': None, 'seed': None, 'offensive_efficiency': 1.111},
    'Alabama Crimson Tide': {'scoring_defense': 68.0, 'scoring_offense': 82.0, 'assists': 545, 'fg_pct': 0.563, 'three_pt_pct': 0.366, 'rebounds': 2198.0, 'turnovers': None, 'seed': 12, 'offensive_efficiency': 1.183},
    'San Diego State Aztecs': {'scoring_defense': 66.2, 'scoring_offense': 75.0, 'assists': 450, 'fg_pct': None, 'three_pt_pct': None, 'rebounds': None, 'turnovers': None, 'seed': None, 'offensive_efficiency': None},
    'Houston Cougars': {'scoring_defense': 57.7, 'scoring_offense': 90.7, 'assists': 573, 'fg_pct': None, 'three_pt_pct': None, 'rebounds': None, 'turnovers': None, 'seed': None, 'offensive_efficiency': 1.114},
    'University of Connecticut Huskies': {'scoring_defense': 63.9, 'scoring_offense': 81.6, 'assists': 672, 'fg_pct': 0.572, 'three_pt_pct': None, 'rebounds': 2118.0, 'turnovers': None, 'seed': 36, 'offensive_efficiency': 1.198},
}

In [50]:
sweet_16_matchups = [
    ('University of Connecticut Huskies', 'San Diego State Aztecs'),
    ('Iowa State Cyclones', 'Illinois Fighting Illini'),
    ('North Carolina Tar Heels', 'Alabama Crimson Tide'),
    ('Arizona Wildcats', 'Clemson Tigers'),
    ('Houston Cougars', 'Duke Blue Devils'),
    ('Marquette Golden Eagles', 'NC State Wolfpack'),
    ('Purdue Boilermakers', 'Gonzaga Bulldogs'),
    ('Tennessee Volunteers', 'Creighton Bluejays'),
]

In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_team_stats(url, stat_name):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', class_='tr-table datatable scrollable')
    rows = table.find_all('tr')[1:]  # Exclude the header row

    team_stats = {}
    for row in rows:
        cells = row.find_all('td')
        team_name = cells[1].text.strip()
        stat_value = cells[2].text.strip()
        if stat_name in ['TRB%', 'OFF EFF']:
            stat_value = float(stat_value.strip('%')) / 100
        else:
            stat_value = float(stat_value)
        team_stats[team_name] = stat_value

    return team_stats

def get_sweet_sixteen_stats():
    # URLs for the desired stats
    three_pointers_url = 'https://www.teamrankings.com/ncaa-basketball/stat/three-pointers-made-per-game'
    assist_turnover_ratio_url = 'https://www.teamrankings.com/ncaa-basketball/stat/assist--per--turnover-ratio'
    total_rebounding_percentage_url = 'https://www.teamrankings.com/ncaa-basketball/stat/total-rebounding-percentage'
    defensive_efficiency_url = 'https://www.teamrankings.com/ncaa-basketball/stat/defensive-efficiency'
    offensive_efficiency_url = 'https://www.teamrankings.com/ncaa-basketball/stat/offensive-efficiency'

    # Scrape the stats for each URL
    three_pointers = scrape_team_stats(three_pointers_url, '3PT')
    assist_turnover_ratio = scrape_team_stats(assist_turnover_ratio_url, 'AST/TO')
    total_rebounding_percentage = scrape_team_stats(total_rebounding_percentage_url, 'TRB%')
    defensive_efficiency = scrape_team_stats(defensive_efficiency_url, 'DEF EFF')
    offensive_efficiency = scrape_team_stats(offensive_efficiency_url, 'OFF EFF')

    # Create a DataFrame with the scraped stats
    sweet_sixteen_df = pd.DataFrame.from_dict(three_pointers, orient='index', columns=['3PT'])
    sweet_sixteen_df['AST/TO'] = sweet_sixteen_df.index.map(assist_turnover_ratio)
    sweet_sixteen_df['TRB%'] = sweet_sixteen_df.index.map(total_rebounding_percentage)
    sweet_sixteen_df['DEF EFF'] = sweet_sixteen_df.index.map(defensive_efficiency)
    sweet_sixteen_df['OFF EFF'] = sweet_sixteen_df.index.map(offensive_efficiency)

    # Filter the DataFrame to include only the Sweet Sixteen teams
    sweet_sixteen_teams = [
        'Alabama', 'Arizona', 'Clemson', 'Creighton', 'Duke',
        'Gonzaga', 'Houston', 'Illinois', 'Iowa St', 'Marquette',
        'NC State', 'N Carolina', 'Purdue', 'Tennessee',
        'San Diego St', 'Connecticut'
    ]
    sweet_sixteen_df = sweet_sixteen_df.loc[sweet_sixteen_df.index.isin(sweet_sixteen_teams)]

    # Add missing teams to the DataFrame
    missing_teams = set(sweet_sixteen_teams) - set(sweet_sixteen_df.index)
    for team in missing_teams:
        sweet_sixteen_df.loc[team] = [None] * len(sweet_sixteen_df.columns)

    return sweet_sixteen_df

# Get the Sweet Sixteen stats DataFrame
sweet_sixteen_stats_df = get_sweet_sixteen_stats()

# Create a new dictionary with the scraped stats
team_stats = {}
for index, row in sweet_sixteen_stats_df.iterrows():
    team_name = index
    team_stats[team_name] = {
        '3PT': row['3PT'],
        'AST/TO': row['AST/TO'],
        'TRB%': row['TRB%'],
        'DEF EFF': row['DEF EFF'],
        'OFF EFF': row['OFF EFF']
    }

# Print the team_stats dictionary
for team, stats in team_stats.items():
    print(f"{team}: {stats}")

Alabama: {'3PT': 11.0, 'AST/TO': 1.342, 'TRB%': 0.528, 'DEF EFF': 1.054, 'OFF EFF': 0.01183}
Creighton: {'3PT': 10.7, 'AST/TO': 1.584, 'TRB%': 0.5329999999999999, 'DEF EFF': 0.985, 'OFF EFF': 0.01142}
Marquette: {'3PT': 9.0, 'AST/TO': 1.608, 'TRB%': 0.478, 'DEF EFF': 0.984, 'OFF EFF': 0.01106}
Connecticut: {'3PT': 8.6, 'AST/TO': 1.915, 'TRB%': 0.562, 'DEF EFF': 0.938, 'OFF EFF': 0.01198}
Tennessee: {'3PT': 8.6, 'AST/TO': 1.588, 'TRB%': 0.529, 'DEF EFF': 0.934, 'OFF EFF': 0.01102}
Duke: {'3PT': 8.4, 'AST/TO': 1.614, 'TRB%': 0.5379999999999999, 'DEF EFF': 0.966, 'OFF EFF': 0.01159}
Purdue: {'3PT': 8.4, 'AST/TO': 1.656, 'TRB%': 0.5820000000000001, 'DEF EFF': 0.981, 'OFF EFF': 0.011850000000000001}
Illinois: {'3PT': 8.4, 'AST/TO': 1.264, 'TRB%': 0.557, 'DEF EFF': 1.015, 'OFF EFF': 0.011699999999999999}
Clemson: {'3PT': 8.2, 'AST/TO': 1.452, 'TRB%': 0.526, 'DEF EFF': 1.016, 'OFF EFF': 0.01111}
N Carolina: {'3PT': 8.0, 'AST/TO': 1.422, 'TRB%': 0.55, 'DEF EFF': 0.956, 'OFF EFF': 0.01117}
Ariz

In [19]:
# explore the data
print(sweet_sixteen_stats_df.head(16))
print(sweet_sixteen_stats_df.info())
print(sweet_sixteen_stats_df.describe())
print(sweet_sixteen_stats_df.isnull().sum())
print(sweet_sixteen_stats_df['3PT'].value_counts())
print(sweet_sixteen_stats_df['AST/TO'].value_counts())
print(sweet_sixteen_stats_df['TRB%'].value_counts())
print(sweet_sixteen_stats_df['DEF EFF'].value_counts())
print(sweet_sixteen_stats_df['OFF EFF'].value_counts())
print(sweet_sixteen_stats_df['3PT'].mean())
print(sweet_sixteen_stats_df['AST/TO'].mean())
print(sweet_sixteen_stats_df['TRB%'].mean())
print(sweet_sixteen_stats_df['DEF EFF'].mean())
print(sweet_sixteen_stats_df['OFF EFF'].mean())
print(sweet_sixteen_stats_df['3PT'].median())



    



               3PT  AST/TO   TRB%  DEF EFF  OFF EFF
Alabama       11.0   1.342  0.528    1.054  0.01183
Creighton     10.7   1.584  0.533    0.985  0.01142
Marquette      9.0   1.608  0.478    0.984  0.01106
Connecticut    8.6   1.915  0.562    0.938  0.01198
Tennessee      8.6   1.588  0.529    0.934  0.01102
Duke           8.4   1.614  0.538    0.966  0.01159
Purdue         8.4   1.656  0.582    0.981  0.01185
Illinois       8.4   1.264  0.557    1.015  0.01170
Clemson        8.2   1.452  0.526    1.016  0.01111
N Carolina     8.0   1.422  0.550    0.956  0.01117
Arizona        8.0   1.546  0.567    0.941  0.01146
Houston        7.9   1.481  0.525    0.867  0.01114
NC State       7.1   1.370  0.495    1.022  0.01078
San Diego St   7.0   1.184  0.528    0.959  0.01065
Gonzaga        7.0   1.686  0.547    0.974  0.01181
Iowa St        6.6   1.523  0.505    0.888  0.01095
<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, Alabama to Iowa St
Data columns (total 5 columns):
 #   Col

In [21]:
# Clean data
sweet_sixteen_stats_df['3PT'].fillna(sweet_sixteen_stats_df['3PT'].mean(), inplace=True)
sweet_sixteen_stats_df['AST/TO'].fillna(sweet_sixteen_stats_df['AST/TO'].mean(), inplace=True)
sweet_sixteen_stats_df['TRB%'].fillna(sweet_sixteen_stats_df['TRB%'].mean(), inplace=True)
sweet_sixteen_stats_df['DEF EFF'].fillna(sweet_sixteen_stats_df['DEF EFF'].mean(), inplace=True)
sweet_sixteen_stats_df['OFF EFF'].fillna(sweet_sixteen_stats_df['OFF EFF'].mean(), inplace=True)

# Print the cleaned DataFrame
print(sweet_sixteen_stats_df.head(16))



               3PT  AST/TO   TRB%  DEF EFF  OFF EFF
Alabama       11.0   1.342  0.528    1.054  0.01183
Creighton     10.7   1.584  0.533    0.985  0.01142
Marquette      9.0   1.608  0.478    0.984  0.01106
Connecticut    8.6   1.915  0.562    0.938  0.01198
Tennessee      8.6   1.588  0.529    0.934  0.01102
Duke           8.4   1.614  0.538    0.966  0.01159
Purdue         8.4   1.656  0.582    0.981  0.01185
Illinois       8.4   1.264  0.557    1.015  0.01170
Clemson        8.2   1.452  0.526    1.016  0.01111
N Carolina     8.0   1.422  0.550    0.956  0.01117
Arizona        8.0   1.546  0.567    0.941  0.01146
Houston        7.9   1.481  0.525    0.867  0.01114
NC State       7.1   1.370  0.495    1.022  0.01078
San Diego St   7.0   1.184  0.528    0.959  0.01065
Gonzaga        7.0   1.686  0.547    0.974  0.01181
Iowa St        6.6   1.523  0.505    0.888  0.01095


In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_team_stats(url, stat_name):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', class_='tr-table datatable scrollable')
    rows = table.find_all('tr')[1:]  # Exclude the header row

    team_stats = {}
    for row in rows:
        cells = row.find_all('td')
        team_name = cells[1].text.strip()
        stat_value = cells[2].text.strip()
        if stat_name in ['TRB%', 'OFF EFF']:
            stat_value = float(stat_value.strip('%')) / 100
        else:
            stat_value = float(stat_value)
        team_stats[team_name] = stat_value

    return team_stats

def get_sweet_sixteen_stats():
    # URLs for the desired stats
    three_pointers_url = 'https://www.teamrankings.com/ncaa-basketball/stat/three-pointers-made-per-game'
    assist_turnover_ratio_url = 'https://www.teamrankings.com/ncaa-basketball/stat/assist--per--turnover-ratio'
    total_rebounding_percentage_url = 'https://www.teamrankings.com/ncaa-basketball/stat/total-rebounding-percentage'
    defensive_efficiency_url = 'https://www.teamrankings.com/ncaa-basketball/stat/defensive-efficiency'
    offensive_efficiency_url = 'https://www.teamrankings.com/ncaa-basketball/stat/offensive-efficiency'

    # Scrape the stats for each URL
    three_pointers = scrape_team_stats(three_pointers_url, '3PT')
    assist_turnover_ratio = scrape_team_stats(assist_turnover_ratio_url, 'AST/TO')
    total_rebounding_percentage = scrape_team_stats(total_rebounding_percentage_url, 'TRB%')
    defensive_efficiency = scrape_team_stats(defensive_efficiency_url, 'DEF EFF')
    offensive_efficiency = scrape_team_stats(offensive_efficiency_url, 'OFF EFF')

    # Create a DataFrame with the scraped stats
    sweet_sixteen_df = pd.DataFrame.from_dict(three_pointers, orient='index', columns=['3PT'])
    sweet_sixteen_df['AST/TO'] = sweet_sixteen_df.index.map(assist_turnover_ratio)
    sweet_sixteen_df['TRB%'] = sweet_sixteen_df.index.map(total_rebounding_percentage)
    sweet_sixteen_df['DEF EFF'] = sweet_sixteen_df.index.map(defensive_efficiency)
    sweet_sixteen_df['OFF EFF'] = sweet_sixteen_df.index.map(offensive_efficiency)

    # Filter the DataFrame to include only the Sweet Sixteen teams
    sweet_sixteen_teams = [
        'Alabama', 'Arizona', 'Clemson', 'Creighton', 'Duke',
        'Gonzaga', 'Houston', 'Illinois', 'Iowa St', 'Marquette',
        'NC State', 'N Carolina', 'Purdue', 'Tennessee',
        'San Diego St', 'Connecticut'
    ]
    sweet_sixteen_df = sweet_sixteen_df.loc[sweet_sixteen_df.index.isin(sweet_sixteen_teams)]

    return sweet_sixteen_df

# Get the Sweet Sixteen stats DataFrame
sweet_sixteen_stats_df = get_sweet_sixteen_stats()

# Print the DataFrame
print(sweet_sixteen_stats_df)

# Print the DataFrame information
print(sweet_sixteen_stats_df.info())

# Generate descriptive statistics
print(sweet_sixteen_stats_df.describe())

# Count the occurrences of each value in each column
for column in sweet_sixteen_stats_df.columns:
    print(sweet_sixteen_stats_df[column].value_counts())

# Calculate the mean of each column
for column in sweet_sixteen_stats_df.columns:
    mean = sweet_sixteen_stats_df[column].mean()
    print(f"Mean of {column}: {mean}")

# Calculate the median of each column
for column in sweet_sixteen_stats_df.columns:
    median = sweet_sixteen_stats_df[column].median()
    print(f"Median of {column}: {median}")
    

               3PT  AST/TO   TRB%  DEF EFF  OFF EFF
Alabama       11.0   1.342  0.528    1.054  0.01183
Creighton     10.7   1.584  0.533    0.985  0.01142
Marquette      9.0   1.608  0.478    0.984  0.01106
Connecticut    8.6   1.915  0.562    0.938  0.01198
Tennessee      8.6   1.588  0.529    0.934  0.01102
Duke           8.4   1.614  0.538    0.966  0.01159
Purdue         8.4   1.656  0.582    0.981  0.01185
Illinois       8.4   1.264  0.557    1.015  0.01170
Clemson        8.2   1.452  0.526    1.016  0.01111
N Carolina     8.0   1.422  0.550    0.956  0.01117
Arizona        8.0   1.546  0.567    0.941  0.01146
Houston        7.9   1.481  0.525    0.867  0.01114
NC State       7.1   1.370  0.495    1.022  0.01078
San Diego St   7.0   1.184  0.528    0.959  0.01065
Gonzaga        7.0   1.686  0.547    0.974  0.01181
Iowa St        6.6   1.523  0.505    0.888  0.01095
<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, Alabama to Iowa St
Data columns (total 5 columns):
 #   Col

In [25]:
#describe data
print(sweet_sixteen_stats_df.head(16))
print(sweet_sixteen_stats_df.info())
print(sweet_sixteen_stats_df.describe())
print(sweet_sixteen_stats_df.isnull().sum())
print(sweet_sixteen_stats_df['3PT'].value_counts())
print(sweet_sixteen_stats_df['AST/TO'].value_counts())
print(sweet_sixteen_stats_df['TRB%'].value_counts())
print(sweet_sixteen_stats_df['DEF EFF'].value_counts())
print(sweet_sixteen_stats_df['OFF EFF'].value_counts())
print(sweet_sixteen_stats_df['3PT'].mean())
print(sweet_sixteen_stats_df['AST/TO'].mean())
print(sweet_sixteen_stats_df['TRB%'].mean())
print(sweet_sixteen_stats_df['DEF EFF'].mean())
print(sweet_sixteen_stats_df['OFF EFF'].mean())
print(sweet_sixteen_stats_df['3PT'].median())


               3PT  AST/TO   TRB%  DEF EFF  OFF EFF
Alabama       11.0   1.342  0.528    1.054  0.01183
Creighton     10.7   1.584  0.533    0.985  0.01142
Marquette      9.0   1.608  0.478    0.984  0.01106
Connecticut    8.6   1.915  0.562    0.938  0.01198
Tennessee      8.6   1.588  0.529    0.934  0.01102
Duke           8.4   1.614  0.538    0.966  0.01159
Purdue         8.4   1.656  0.582    0.981  0.01185
Illinois       8.4   1.264  0.557    1.015  0.01170
Clemson        8.2   1.452  0.526    1.016  0.01111
N Carolina     8.0   1.422  0.550    0.956  0.01117
Arizona        8.0   1.546  0.567    0.941  0.01146
Houston        7.9   1.481  0.525    0.867  0.01114
NC State       7.1   1.370  0.495    1.022  0.01078
San Diego St   7.0   1.184  0.528    0.959  0.01065
Gonzaga        7.0   1.686  0.547    0.974  0.01181
Iowa St        6.6   1.523  0.505    0.888  0.01095
<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, Alabama to Iowa St
Data columns (total 5 columns):
 #   Col

In [26]:
# upadte team stats df
sweet_sixteen_stats_df['3PT'].fillna(sweet_sixteen_stats_df['3PT'].mean(), inplace=True)
sweet_sixteen_stats_df['AST/TO'].fillna(sweet_sixteen_stats_df['AST/TO'].mean(), inplace=True)
sweet_sixteen_stats_df['TRB%'].fillna(sweet_sixteen_stats_df['TRB%'].mean(), inplace=True)
sweet_sixteen_stats_df['DEF EFF'].fillna(sweet_sixteen_stats_df['DEF EFF'].mean(), inplace=True)
sweet_sixteen_stats_df['OFF EFF'].fillna(sweet_sixteen_stats_df['OFF EFF'].mean(), inplace=True)

# Print the cleaned DataFrame
print(sweet_sixteen_stats_df.head(16))


               3PT  AST/TO   TRB%  DEF EFF  OFF EFF
Alabama       11.0   1.342  0.528    1.054  0.01183
Creighton     10.7   1.584  0.533    0.985  0.01142
Marquette      9.0   1.608  0.478    0.984  0.01106
Connecticut    8.6   1.915  0.562    0.938  0.01198
Tennessee      8.6   1.588  0.529    0.934  0.01102
Duke           8.4   1.614  0.538    0.966  0.01159
Purdue         8.4   1.656  0.582    0.981  0.01185
Illinois       8.4   1.264  0.557    1.015  0.01170
Clemson        8.2   1.452  0.526    1.016  0.01111
N Carolina     8.0   1.422  0.550    0.956  0.01117
Arizona        8.0   1.546  0.567    0.941  0.01146
Houston        7.9   1.481  0.525    0.867  0.01114
NC State       7.1   1.370  0.495    1.022  0.01078
San Diego St   7.0   1.184  0.528    0.959  0.01065
Gonzaga        7.0   1.686  0.547    0.974  0.01181
Iowa St        6.6   1.523  0.505    0.888  0.01095


In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

from sklearn.preprocessing import LabelEncoder

# Assuming 'team_stats_df' is your DataFrame containing the team statistics
team_stats_df = pd.DataFrame({
    'Team': ['Alabama', 'Creighton', 'Marquette', 'Connecticut', 'Tennessee', 'Duke', 'Purdue', 'Illinois', 'Clemson', 'N Carolina', 'Arizona', 'Houston', 'NC State', 'San Diego St', 'Gonzaga', 'Iowa St'],
    '3PT': [11.0, 10.7, 9.0, 8.6, 8.6, 8.4, 8.4, 8.4, 8.2, 8.0, 8.0, 7.9, 7.1, 7.0, 7.0, 6.6],
    'AST/TO': [1.342, 1.584, 1.608, 1.915, 1.588, 1.614, 1.656, 1.264, 1.452, 1.422, 1.546, 1.481, 1.370, 1.184, 1.686, 1.523],
    'TRB%': [0.528, 0.533, 0.478, 0.562, 0.529, 0.538, 0.582, 0.557, 0.526, 0.550, 0.567, 0.525, 0.495, 0.528, 0.547, 0.505],
    'DEF EFF': [1.054, 0.985, 0.984, 0.938, 0.934, 0.966, 0.981, 1.015, 1.016, 0.956, 0.941, 0.867, 1.022, 0.959, 0.974, 0.888],
    'OFF EFF': [0.01183, 0.01142, 0.01106, 0.01198, 0.01102, 0.01159, 0.01185, 0.01170, 0.01111, 0.01117, 0.01146, 0.01114, 0.01078, 0.01065, 0.01181, 0.01095]
})

# Encoding Team Names
le = LabelEncoder()
team_stats_df['Team'] = le.fit_transform(team_stats_df['Team'])

# Preparing the dataset
# Normally, you would create features and labels here, but since we don't have matchup outcomes,
# let's assume a scenario where we randomly assign winners just for the sake of demonstration.
np.random.seed(42)  # For reproducibility
team_stats_df['Winner'] = np.random.randint(2, size=len(team_stats_df))

X = team_stats_df.drop('Winner', axis=1)
y = team_stats_df['Winner']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Predict on the testing set
y_pred = clf.predict(X_test)

# Evaluate the classifier
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Simulate a game (Here you'd replace this logic with your actual simulation based on model predictions)
def simulate_game(team1, team2, model=clf):
    # Extract team stats
    team1_stats = team_stats_df.loc[team_stats_df['Team'] == le.transform([team1])[0]].drop('Winner', axis=1)
    team2_stats = team_stats_df.loc[team_stats_df['Team'] == le.transform([team2])[0]].drop('Winner', axis=1)
    
    # Here you would combine team1_stats and team2_stats in the way your model expects



Accuracy: 0.25


In [60]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Feature Engineering: Creating differential features
team_stats_df['3PT_diff'] = team_stats_df['3PT'] - team_stats_df['3PT'].mean()
team_stats_df['AST/TO_diff'] = team_stats_df['AST/TO'] - team_stats_df['AST/TO'].mean()
# Continue for other features...

X = team_stats_df[['3PT_diff', 'AST/TO_diff']]  # Example feature set
y = team_stats_df['Winner']

# Try a simpler model: Logistic Regression
simple_model = LogisticRegression(random_state=42)

# Using cross-validation to evaluate the model
cv_scores = cross_val_score(simple_model, X, y, cv=5)  # 5-fold cross-validation

print(f"CV Accuracy: {np.mean(cv_scores):.2f} +/- {np.std(cv_scores):.2f}")


CV Accuracy: 0.75 +/- 0.13




In [61]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(clf, param_grid, cv=stratified_cv, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)


NameError: name 'stratified_cv' is not defined

In [62]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Assuming 'clf' is your initialized RandomForestClassifier and 'X_train', 'y_train' are your training data and labels

# Define stratified cross-validation
stratified_cv = StratifiedKFold(n_splits=5)

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV with the classifier, parameter grid, and stratified CV
grid_search = GridSearchCV(clf, param_grid, cv=stratified_cv, scoring='accuracy')

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)




Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best CV accuracy: 0.9333333333333332


In [67]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Assuming X and y are your features and labels respectively
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42, k_neighbors=1)  # Only if you have at least 2 samples in the minority class
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Now, y_train_smote has a balanced class distribution
print(f"Before SMOTE, counts of label '1': {sum(y_train == 1)}")
print(f"Before SMOTE, counts of label '0': {sum(y_train == 0)}\n")

print(f"After SMOTE, counts of label '1': {sum(y_train_smote == 1)}")
print(f"After SMOTE, counts of label '0': {sum(y_train_smote == 0)}")

# Train RandomForestClassifier
clf = RandomForestClassifier(n_estimators=50, class_weight='balanced', random_state=42)
clf.fit(X_train_smote, y_train_smote)

# Predict on the testing set
y_pred = clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 2, n_samples_fit = 1, n_samples = 1

In [64]:
from sklearn.ensemble import RandomForestClassifier

# Assuming 'X_train', 'y_train' are your training data and labels
# Initialize the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Predict on the testing set
y_pred = clf.predict(X_test)

# Evaluate the classifier
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")




Accuracy: 0.25


In [29]:
import random
from collections import Counter
import math

# Updated team stats with the new scraped data
team_stats = {
    'North Carolina Tar Heels': {'scoring_defense': 68.5, 'scoring_offense': 82.0, 'assists': 500, 'fg_pct': 0.472, 'three_pt_pct': None, 'rebounds': 39.2, 'turnovers': 11.1, 'seed': 8, 'offensive_efficiency': 1.117, '3PT': 8.0, 'AST/TO': 1.422, 'TRB%': 0.550, 'DEF EFF': 0.956, 'OFF EFF': 0.01117},
    'Iowa State Cyclones': {'scoring_defense': 61.2, 'scoring_offense': 75.0, 'assists': 400, 'fg_pct': 0.461, 'three_pt_pct': None, 'rebounds': 36.8, 'turnovers': 10.8, 'seed': 6, 'offensive_efficiency': 1.095, '3PT': 6.6, 'AST/TO': 1.523, 'TRB%': 0.505, 'DEF EFF': 0.888, 'OFF EFF': 0.01095},
    'NC State Wolfpack': {'scoring_defense': 70.0, 'scoring_offense': 77.0, 'assists': 450, 'fg_pct': 0.455, 'three_pt_pct': 0.340, 'rebounds': 37.5, 'turnovers': 12.0, 'seed': 11, 'offensive_efficiency': 1.078, '3PT': 7.1, 'AST/TO': 1.370, 'TRB%': 0.495, 'DEF EFF': 1.022, 'OFF EFF': 0.01078},
    'Gonzaga Bulldogs': {'scoring_defense': 69.0, 'scoring_offense': 85.0, 'assists': 571, 'fg_pct': 0.490, 'three_pt_pct': 0.375, 'rebounds': 38.0, 'turnovers': 10.5, 'seed': 3, 'offensive_efficiency': 1.181, '3PT': 7.0, 'AST/TO': 1.686, 'TRB%': 0.547, 'DEF EFF': 0.974, 'OFF EFF': 0.01181},
    'Arizona Wildcats': {'scoring_defense': 64.9, 'scoring_offense': 87.6, 'assists': 651, 'fg_pct': 0.549, 'three_pt_pct': 0.373, 'rebounds': None, 'turnovers': None, 'seed': 2, 'offensive_efficiency': 1.146, '3PT': 8.0, 'AST/TO': 1.546, 'TRB%': 0.567, 'DEF EFF': 0.941, 'OFF EFF': 0.01146},
    'Illinois Fighting Illini': {'scoring_defense': 65.9, 'scoring_offense': 84.6, 'assists': 450, 'fg_pct': None, 'three_pt_pct': None, 'rebounds': None, 'turnovers': None, 'seed': 4, 'offensive_efficiency': 1.170, '3PT': 8.4, 'AST/TO': 1.264, 'TRB%': 0.557, 'DEF EFF': 1.015, 'OFF EFF': 0.01170},
    'Tennessee Volunteers': {'scoring_defense': 67.0, 'scoring_offense': 79.1, 'assists': 567, 'fg_pct': None, 'three_pt_pct': None, 'rebounds': None, 'turnovers': None, 'seed': 9, 'offensive_efficiency': 1.102, '3PT': 8.6, 'AST/TO': 1.588, 'TRB%': 0.529, 'DEF EFF': 0.934, 'OFF EFF': 0.01102},
    'Purdue Boilermakers': {'scoring_defense': 67.1, 'scoring_offense': 83.9, 'assists': 659, 'fg_pct': 0.562, 'three_pt_pct': 0.392, 'rebounds': 2054.0, 'turnovers': None, 'seed': 5, 'offensive_efficiency': 1.185, '3PT': 8.4, 'AST/TO': 1.656, 'TRB%': 0.582, 'DEF EFF': 0.981, 'OFF EFF': 0.01185},
    'Marquette Golden Eagles': {'scoring_defense': 68.0, 'scoring_offense': 80.0, 'assists': 450, 'fg_pct': 0.555, 'three_pt_pct': None, 'rebounds': 2196.0, 'turnovers': None, 'seed': 19, 'offensive_efficiency': 1.106, '3PT': 9.0, 'AST/TO': 1.608, 'TRB%': 0.478, 'DEF EFF': 0.984, 'OFF EFF': 0.01106},
    'Creighton Bluejays': {'scoring_defense': 64.0, 'scoring_offense': 80.6, 'assists': 575, 'fg_pct': 0.574, 'three_pt_pct': 0.365, 'rebounds': 2043.0, 'turnovers': None, 'seed': 6, 'offensive_efficiency': 1.142, '3PT': 10.7, 'AST/TO': 1.584, 'TRB%': 0.533, 'DEF EFF': 0.985, 'OFF EFF': 0.01142},
    'Duke Blue Devils': {'scoring_defense': 66.5, 'scoring_offense': 79.8, 'assists': 526, 'fg_pct': None, 'three_pt_pct': 0.380, 'rebounds': 2004.0, 'turnovers': None, 'seed': 22, 'offensive_efficiency': 1.159, '3PT': 8.4, 'AST/TO': 1.614, 'TRB%': 0.538, 'DEF EFF': 0.966, 'OFF EFF': 0.01159},
    'Clemson Tigers': {'scoring_defense': 65.9, 'scoring_offense': 74.0, 'assists': 450, 'fg_pct': None, 'three_pt_pct': None, 'rebounds': None, 'turnovers': None, 'seed': None, 'offensive_efficiency': 1.111, '3PT': 8.2, 'AST/TO': 1.452, 'TRB%': 0.526, 'DEF EFF': 1.016, 'OFF EFF': 0.01111},
    'Alabama Crimson Tide': {'scoring_defense': 68.0, 'scoring_offense': 82.0, 'assists': 545, 'fg_pct': 0.563, 'three_pt_pct': 0.366, 'rebounds': 2198.0, 'turnovers': None, 'seed': 12, 'offensive_efficiency': 1.183, '3PT': 11.0, 'AST/TO': 1.342, 'TRB%': 0.528, 'DEF EFF': 1.054, 'OFF EFF': 0.01183},
    'San Diego State Aztecs': {'scoring_defense': 66.2, 'scoring_offense': 75.0, 'assists': 450, 'fg_pct': None, 'three_pt_pct': None, 'rebounds': None, 'turnovers': None, 'seed': None, 'offensive_efficiency': None, '3PT': 7.0, 'AST/TO': 1.184, 'TRB%': 0.528, 'DEF EFF': 0.959, 'OFF EFF': 0.01065},
    'Houston Cougars': {'scoring_defense': 57.7, 'scoring_offense': 90.7, 'assists': 573, 'fg_pct': None, 'three_pt_pct': None, 'rebounds': None, 'turnovers': None, 'seed': None, 'offensive_efficiency': 1.114, '3PT': 7.9, 'AST/TO': 1.481, 'TRB%': 0.525, 'DEF EFF': 0.867, 'OFF EFF': 0.01114},
    'University of Connecticut Huskies': {'scoring_defense': 63.9, 'scoring_offense': 81.6, 'assists': 672, 'fg_pct': 0.572, 'three_pt_pct': None, 'rebounds': 2118.0, 'turnovers': None, 'seed': 36, 'offensive_efficiency': 1.198, '3PT': 8.6, 'AST/TO': 1.915, 'TRB%': 0.562, 'DEF EFF': 0.938, 'OFF EFF': 0.01198},
}

# Defined Sweet Sixteen matchups based on actual tournament pairings
sweet_16_matchups = [
    ('University of Connecticut Huskies', 'San Diego State Aztecs'),
    ('Iowa State Cyclones', 'Illinois Fighting Illini'),
    ('North Carolina Tar Heels', 'Alabama Crimson Tide'),
    ('Arizona Wildcats', 'Clemson Tigers'),
    ('Houston Cougars', 'Duke Blue Devils'),
    ('Marquette Golden Eagles', 'NC State Wolfpack'),
    ('Purdue Boilermakers', 'Gonzaga Bulldogs'),
    ('Tennessee Volunteers', 'Creighton Bluejays'),
]

# Adding a momentum factor to the team_stats dictionary based on recent performance
for team in team_stats:
    recent_wins = random.randint(4, 8)  # Simulating recent wins
    recent_losses = random.randint(1, 4)  # Simulating recent losses
    momentum = (recent_wins - recent_losses) / (recent_wins + recent_losses)
    team_stats[team]['momentum'] = max(0.9, min(1.1, 1 + momentum))  # Keeping momentum between 0.9 and 1.1

def calculate_strength(team_name, round_factor=1.0):
    team = team_stats[team_name]
    off_def_balance = (team['scoring_offense'] + team['assists'] / 100) - team['scoring_defense']
    shooting_efficiency = (team.get('fg_pct', 0) + team.get('three_pt_pct', 0)) / 2 if team.get('fg_pct') and team.get('three_pt_pct') else 0
    rebounding_factor = team.get('TRB%', 0)
    turnover_factor = team.get('AST/TO', 0)
    three_pt_factor = team.get('3PT', 0) / 15  # Adjust the scaling factor as needed
    def_eff_factor = team.get('DEF EFF', 0)
    off_eff_factor = team.get('OFF EFF', 0) * 15  # Increase the weight of offensive efficiency further
    seed_factor = (17 - team['seed']) / 16 if team.get('seed') else 0
    offensive_efficiency_factor = team['offensive_efficiency'] / 1.5 if team.get('offensive_efficiency') else 0

    x_factor = random.uniform(-round_factor, round_factor)  # X-factor now depends on the round factor
    momentum = team['momentum']

    strength = off_def_balance * 0.25 + shooting_efficiency * 0.1 + rebounding_factor * 0.15 + turnover_factor * 0.15 + three_pt_factor * 0.05 + def_eff_factor * 0.1 + off_eff_factor * 0.15 + seed_factor * 0.05 + x_factor + momentum
    return strength

def simulate_game(team1, team2, round_factor=1.0):
    strength1 = calculate_strength(team1, round_factor)
    strength2 = calculate_strength(team2, round_factor)
    if strength1 > strength2:
        margin = strength1 - strength2
        team_stats[team1]['momentum'] += margin / 200
        return team1
    else:
        margin = strength2 - strength1
        team_stats[team2]['momentum'] += margin / 200
        return team2

def simulate_round(matchups, round_factor=1.0):
    winners = []
    for matchup in matchups:
        team1, team2 = matchup
        winner = simulate_game(team1, team2, round_factor=round_factor)
        winners.append(winner)
    return winners

def run_tournament_simulation(num_simulations=100000):
    final_four_counter = Counter()
    champion_counter = Counter()

    for _ in range(num_simulations):
        for team in team_stats:
            team_stats[team]['momentum'] = random.uniform(0.9, 1.1)

        # Simulating each round with increasing x-factor variability
        sweet_sixteen_winners = simulate_round(sweet_16_matchups, round_factor=0.8)
        elite_eight_winners = simulate_round([(sweet_sixteen_winners[i], sweet_sixteen_winners[i + 1]) for i in range(0, len(sweet_sixteen_winners), 2)], round_factor=1.2)
        final_four_winners = simulate_round([(elite_eight_winners[i], elite_eight_winners[i + 1]) for i in range(0, len(elite_eight_winners), 2)], round_factor=1.6)
        championship_game = [(final_four_winners[0], final_four_winners[1])]
        champion = simulate_round(championship_game, round_factor=2.0)[0]

        final_four_counter.update(final_four_winners)
        champion_counter[champion] += 1

    # Display results
    print("Final Four Appearances:")
    for team, appearances in final_four_counter.most_common():
        print(f"{team}: {appearances}")
    print("\nChampionship Wins:")
    for team, wins in champion_counter.most_common():
        print(f"{team}: {wins}")

run_tournament_simulation()

Final Four Appearances:
Houston Cougars: 100000
Arizona Wildcats: 84227
University of Connecticut Huskies: 11197
Illinois Fighting Illini: 4572
Iowa State Cyclones: 3
Alabama Crimson Tide: 1

Championship Wins:
Houston Cougars: 92314
Arizona Wildcats: 7623
University of Connecticut Huskies: 62
Illinois Fighting Illini: 1


In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [44]:
# Assuming `team_stats` is your dictionary with updated stats
# and `sweet_16_matchups` is a list of matchups

simulated_games = []
for matchup in sweet_16_matchups:
    team1, team2 = matchup
    team1_stats = team_stats[team1]
    team2_stats = team_stats[team2]
    game_dict = {
        'team1_3PT': team1_stats['3PT'],
        'team1_AST_TO': team1_stats['AST/TO'],
        'team1_TRB%': team1_stats['TRB%'],
        'team1_DEF_EFF': team1_stats['DEF EFF'],
        'team1_OFF_EFF': team1_stats['OFF EFF'],
        # Repeat for team2
        'team2_3PT': team2_stats['3PT'],
        # Add more stats...
        'winner': 1 if simulate_game(team1, team2) == team1 else 0
    }
    simulated_games.append(game_dict)

# Convert to DataFrame, encode categorical data, and proceed with model training as before



The predicted winner between North Carolina Tar Heels and Arizona Wildcats is: Arizona Wildcats


In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Get all unique team names
all_teams = list(team_stats.keys())

# Fit the LabelEncoder on all team names
label_encoder = LabelEncoder()
label_encoder.fit(all_teams)

# Prepare the data
simulated_games = []
for matchup in sweet_16_matchups:
    team1, team2 = matchup
    winner = simulate_game(team1, team2)
    team1_stats = team_stats[team1]
    team2_stats = team_stats[team2]
    game_dict = {
        'team1': team1,
        'team1_3PT': team1_stats['3PT'],
        'team1_AST/TO': team1_stats['AST/TO'],
        # Add more team1 stats...
        'team2': team2,
        'team2_3PT': team2_stats['3PT'],
        'team2_AST/TO': team2_stats['AST/TO'],
        # Add more team2 stats...
        'winner': 1 if winner == team1 else 0
    }
    simulated_games.append(game_dict)

data = pd.DataFrame(simulated_games)

# Encode team names
data['team1'] = label_encoder.transform(data['team1'])
data['team2'] = label_encoder.transform(data['team2'])

# Feature engineering
data['pt_diff'] = data['team1_3PT'] - data['team2_3PT']
data['ast_to_diff'] = data['team1_AST/TO'] - data['team2_AST/TO']
# Add more feature engineering as needed

# Split the data
X = data.drop('winner', axis=1)
y = data['winner']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose a model (Random Forest)
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Function to preprocess new game data and make predictions
def predict_game(team1, team2):
    team1_stats = team_stats[team1]
    team2_stats = team_stats[team2]
    game_data = {
        'team1': label_encoder.transform([team1])[0],
        'team1_3PT': team1_stats['3PT'],
        'team1_AST/TO': team1_stats['AST/TO'],
        # Add more team1 stats...
        'team2': label_encoder.transform([team2])[0],
        'team2_3PT': team2_stats['3PT'],
        'team2_AST/TO': team2_stats['AST/TO'],
        # Add more team2 stats...
    }
    # Add feature engineering for new data
    game_data['pt_diff'] = game_data['team1_3PT'] - game_data['team2_3PT']
    game_data['ast_to_diff'] = game_data['team1_AST/TO'] - game_data['team2_AST/TO']
    # Create a DataFrame and make a prediction
    game_df = pd.DataFrame([game_data])
    prediction = model.predict(game_df)
    winner_index = prediction[0]
    winner = label_encoder.inverse_transform([winner_index])[0]
    return winner

# Use the predict_game function to make predictions
team1 = 'North Carolina Tar Heels'
team2 = 'Arizona Wildcats'
winner = predict_game(team1, team2)
print(f"The predicted winner between {team1} and {team2} is: {winner}")

# Predict the winnder of the tournament
def predict_tournament_winner():
    final_four_teams = simulate_round(sweet_16_matchups)
    championship_teams = simulate_round([(final_four_teams[i], final_four_teams[i + 1]) for i in range(0, len(final_four_teams), 2)])
    champion = simulate_round([(championship_teams[0], championship_teams[1])])[0]
    return champion

predicted_champion = predict_tournament_winner()
print(f"The predicted champion of the tournament is: {predicted_champion}")



The predicted winner between North Carolina Tar Heels and Arizona Wildcats is: Arizona Wildcats
The predicted champion of the tournament is: Arizona Wildcats


In [52]:
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.5


In [53]:
# tune the model
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")


Fitting 3 folds for each of 36 candidates, totalling 108 fits




[CV] END max_depth=None, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=150; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=150; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_split=2, n_estimators=150; total time=   0.1s
[CV] END m

In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Get all unique team names
all_teams = list(team_stats.keys())

# Fit the LabelEncoder on all team names
label_encoder = LabelEncoder()
label_encoder.fit(all_teams)

# Prepare the data
simulated_games = []
for matchup in sweet_16_matchups:
    team1, team2 = matchup
    winner = simulate_game(team1, team2)
    team1_stats = team_stats[team1]
    team2_stats = team_stats[team2]
    game_dict = {
        'team1': team1,
        'team1_3PT': team1_stats['3PT'],
        'team1_AST/TO': team1_stats['AST/TO'],
        # Add more team1 stats...
        'team2': team2,
        'team2_3PT': team2_stats['3PT'],
        'team2_AST/TO': team2_stats['AST/TO'],
        # Add more team2 stats...
        'winner': 1 if winner == team1 else 0
    }
    simulated_games.append(game_dict)

data = pd.DataFrame(simulated_games)

# Encode team names
data['team1'] = label_encoder.transform(data['team1'])
data['team2'] = label_encoder.transform(data['team2'])

# Feature engineering
data['pt_diff'] = data['team1_3PT'] - data['team2_3PT']
data['ast_to_diff'] = data['team1_AST/TO'] - data['team2_AST/TO']
# Add more feature engineering as needed

# Split the data
X = data.drop('winner', axis=1)
y = data['winner']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define the scoring metric (e.g., ROC AUC)
scoring = 'roc_auc'

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'class_weight': ['balanced']  # Add class weights to handle imbalance
}

# Create the GridSearchCV object with Stratified K-Fold cross-validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           scoring=scoring,
                           cv=cv,
                           n_jobs=-1,
                           verbose=2)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-Score: {f1:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END class_weight=balanced, max_depth=None, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=5, n_estimators=50; total tim

Traceback (most recent call last):
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sk

[CV] END class_weight=balanced, max_depth=None, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=10, n_estimators=50; total time=   0.0s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=10, n_estimators=50; total time=   0.0s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=2, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=5, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=5, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10,

Traceback (most recent call last):
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
           

[CV] END class_weight=balanced, max_depth=10, min_samples_split=2, n_estimators=150; total time=   0.2s
[CV] END class_weight=balanced, max_depth=None, min_samples_split=10, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10, min_samples_split=2, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10, min_samples_split=10, n_estimators=50; total time=   0.0s
[CV] END class_weight=balanced, max_depth=10, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10, min_samples_split=5, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10, min_samples_split=5, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=20, min_samples_spl

Traceback (most recent call last):
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
           

[CV] END class_weight=balanced, max_depth=10, min_samples_split=10, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=20, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END class_weight=balanced, max_depth=20, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=20, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=20, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END class_weight=balanced, max_depth=20, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END class_weight=balanced, max_depth=20, min_samples_split=2, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=20, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END class_weight=balanced, max_depth=20, min_samples_split

Traceback (most recent call last):
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
           

[CV] END class_weight=balanced, max_depth=20, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END class_weight=balanced, max_depth=30, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=30, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=20, min_samples_split=5, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=30, min_samples_split=2, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=20, min_samples_split=10, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=30, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] END class_weight=balanced, max_depth=30, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] END class_weight=balanced, max_depth=30, min_samples_split=2, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=30, min_samples_split=2,

Traceback (most recent call last):
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/katadhin1/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
           

In [48]:
def run_tournament_simulation(num_simulations=100000):
    final_four_counter = Counter()
    champion_counter = Counter()

    for _ in range(num_simulations):
        # Reset momentum for each simulation
        for team in team_stats:
            team_stats[team]['momentum'] = random.uniform(0.9, 1.1)

        # Simulate Sweet Sixteen round
        sweet_sixteen_winners = []
        for matchup in sweet_16_matchups:
            team1, team2 = matchup
            winner = predict_game(team1, team2)
            sweet_sixteen_winners.append(winner)

        # Simulate Elite Eight round
        elite_eight_winners = []
        for i in range(0, len(sweet_sixteen_winners), 2):
            team1 = sweet_sixteen_winners[i]
            team2 = sweet_sixteen_winners[i + 1]
            winner = predict_game(team1, team2)
            elite_eight_winners.append(winner)

        # Simulate Final Four round
        final_four_winners = []
        for i in range(0, len(elite_eight_winners), 2):
            team1 = elite_eight_winners[i]
            team2 = elite_eight_winners[i + 1]
            winner = predict_game(team1, team2)
            final_four_winners.append(winner)

        # Simulate Championship game
        championship_game = [final_four_winners[0], final_four_winners[1]]
        champion = predict_game(*championship_game)

        final_four_counter.update(final_four_winners)
        champion_counter[champion] += 1

    # Display results
    print("Final Four Appearances:")
    for team, appearances in final_four_counter.most_common():
        print(f"{team}: {appearances}")
    print("\nChampionship Wins:")
    for team, wins in champion_counter.most_common():
        print(f"{team}: {wins}")

run_tournament_simulation()




Final Four Appearances:
Arizona Wildcats: 200000

Championship Wins:
Arizona Wildcats: 100000
