# 📓 01_player_sampling_and_stats.ipynb
**Purpose:** Load player data, inspect it, and prepare for performance modeling.

In [None]:
import pickle
import random
import numpy as np
import sys
from pathlib import Path
import importlib
import pandas as pd

# Ensure correct path
sys.path.append(str(Path().resolve().parent / "src"))

# Reload golf_classes FIRST
import golf_classes
importlib.reload(golf_classes)

# THEN import the class definitions
from golf_classes import Player, PlayerRoundInfo, Tournament, Round, MMTeam, CTeam

# Reload other modules AFTER class objects are clean
import golf_utils
importlib.reload(golf_utils)
import golf_scoring
importlib.reload(golf_scoring)

# THEN import your functions
from golf_utils import get_player_by_name


from golf_scoring import compute_real_stableford, simulate_round,  stableford_points, calibrated_simulate_round, hole_pars

from golf_utils import save_pickle, load_pickle, compute_all_sandbag_factors, rebind_team_players, score_randomness_test, sandbag_report, print_top_sandbaggers


In [None]:
# Reload other modules AFTER class objects are clean


# THEN import your functions
from golf_utils import get_player_by_name, get_close_player_matches

In [None]:
# Empty skeletons for the two pickled stores, so the four registries exist
# before anything is loaded.
# NOTE(review): the stores were labelled calcutta_data_players.pkl /
# calcutta_data_teams.pkl, while the load cell reads golf_player_data.pkl /
# golf_team_data.pkl — confirm which file names are current.

# Player-side store
player_data = dict(
    players={},      # str -> Player
    tournaments={},  # str -> Tournament
)

# Team-side store
team_data = dict(
    mm_teams={},  # str -> MMTeam
    c_teams={},   # str -> CTeam
)

# Notebook-level aliases pointing at the same dict objects as the stores
players, tournaments = player_data["players"], player_data["tournaments"]
mm_teams, c_teams = team_data["mm_teams"], team_data["c_teams"]

In [None]:

from pathlib import Path

# Data lives in <parent of the current working directory>/Data
data_dir = Path(".").resolve().parent.joinpath("Data")

# Pickle files for the player/tournament store and the team store
golf_player_data_file = data_dir.joinpath("golf_player_data.pkl")
golf_team_data_file = data_dir.joinpath("golf_team_data.pkl")

In [None]:
def reset_all_data():
    """Empty the four notebook-level registries in place and print a notice.

    The containers are cleared rather than rebound, so every other reference
    to the same objects sees them emptied too.
    """
    for registry in (players, tournaments, mm_teams, c_teams):
        registry.clear()
    print("🧹 All data has been reset.")

In [None]:
import os
# Sanity check: prints True when the player pickle path exists, False otherwise
print(os.path.exists(golf_player_data_file))


In [None]:
# Start from empty registries before loading anything from disk
reset_all_data()

# Load both stores; use empty skeletons when load_pickle returns a falsy value
player_data = load_pickle(golf_player_data_file) or dict(players={}, tournaments={})
team_data = load_pickle(golf_team_data_file) or dict(mm_teams={}, c_teams={})

# Point the notebook-level aliases at the freshly loaded registries
players, tournaments = player_data["players"], player_data["tournaments"]
mm_teams, c_teams = team_data["mm_teams"], team_data["c_teams"]

rebind_team_players(players, mm_teams, c_teams)

# Per-player sandbag factors; the two counts are echoed below
passCount, errorCount = compute_all_sandbag_factors(players, 2, .5)
print(f"players : with SB Factors {passCount}  : without SB factors {errorCount}")

# Refresh each MM team's aggregate and tally teams whose first two players
# both carry a sandbag factor
team_with_sbf = team_without_sbf = 0
for team in mm_teams.values():
    team.compute_aggregate_sand_bag_factor()
    if (
        team.players[0].sand_bag_factor is not None
        and team.players[1].sand_bag_factor is not None
    ):
        team_with_sbf += 1
    else:
        team_without_sbf += 1

print(f"Loaded {len(mm_teams)} mm teams : with sbf {team_with_sbf}, without sbf {team_without_sbf} and {len(c_teams)} c teams")
print(f"Loaded {len(players)} players and {len(tournaments)} tournaments")

In [None]:
# --- Inspect Example Player ---
# Take the first player, whether `players` is a dict or a plain iterable
if isinstance(players, dict):
    example_player = next(iter(players.values()))
else:
    example_player = next(iter(players))

print(f"Example Player: {example_player.name}")
print(f"Total Rounds Recorded: {len(example_player.rounds)}")

# Show selected fields for at most the first five rounds
for i, round_info in enumerate(example_player.rounds[:5]):
    print(f" Round {i+1}:")
    for label, attr in (
        ("Tournament", "tournament_name"),
        ("Round Number", "round_number"),
        ("Handicap", "handicap"),
        ("Tee", "tee"),
        ("Net Score", "net"),
        ("Total Gross Score", "total"),
    ):
        print(f"  {label}: {getattr(round_info, attr)}")

In [None]:
# --- Statistics ---

# Work on the player objects whether `players` is a dict or a plain collection
player_list = players.values() if isinstance(players, dict) else players

num_players = len(player_list)
rounds_per_player = [len(player.rounds) for player in player_list]

# Every round's net score (unfiltered) across all players
all_net_scores = [
    round_info.net
    for player in player_list
    for round_info in player.rounds
]

# Every handicap that pandas does not consider null
all_handicaps = [
    round_info.handicap
    for player in player_list
    for round_info in player.rounds
    if pd.notnull(round_info.handicap)
]

print("\nStatistics:")
print(f"Total Players: {num_players}")
print(f"Average Rounds per Player: {np.mean(rounds_per_player):.2f}")
print(f"Min Rounds per Player: {np.min(rounds_per_player)}")
print(f"Max Rounds per Player: {np.max(rounds_per_player)}")

# Keep only int/float values (None is rejected by the isinstance test)
valid_net_scores = [s for s in all_net_scores if isinstance(s, (int, float))]
if valid_net_scores:
    print("\nNet Score Statistics:")
    print(f"  Average Net Score: {np.mean(valid_net_scores):.2f}")
    print(f"  Net Score Range: {np.min(valid_net_scores)} to {np.max(valid_net_scores)}")

valid_handicaps = [h for h in all_handicaps if isinstance(h, (int, float))]
if not valid_handicaps:
    print("\n⚠️ Warning: No valid handicaps found.")
else:
    print("\nHandicap Statistics:")
    print(f"  Average Handicap: {np.mean(valid_handicaps):.2f}")
    print(f"  Handicap Range: {np.min(valid_handicaps)} to {np.max(valid_handicaps)}")


In [None]:

import matplotlib.pyplot as plt
# --- Plot Net Score Distribution ---
plt.hist(valid_net_scores, bins=30, edgecolor='black')
plt.title('Distribution of Net Scores (All Players)')
plt.xlabel('Net Score')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# --- Plot Handicap Distribution ---
# Bin edges come from the same numeric list that is plotted, so a stray
# non-numeric handicap cannot break min()/max(); an empty list is skipped
# instead of raising.
if valid_handicaps:
    handicap_bins = range(int(min(valid_handicaps)) - 1, int(max(valid_handicaps)) + 2)
    plt.hist(valid_handicaps, bins=handicap_bins, edgecolor='black')
    plt.title('Distribution of Handicaps (All Players)')
    plt.xlabel('Handicap')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
else:
    print("⚠️ No valid handicaps to plot.")

In [None]:
# --- Optional: Plot rounds per player ---
# Histogram of how many rounds each player has; bins are 2 wide, starting at 0
# and extending a little past the largest round count.
plt.hist(rounds_per_player, bins=range(0, max(rounds_per_player)+5, 2), edgecolor='black')
plt.title('Distribution of Rounds per Player')
plt.xlabel('Number of Rounds')
plt.ylabel('Number of Players')
plt.grid(True)
plt.show()

In [None]:
# --- Ready for Sampling Model ---

def sample_score_simple(player):
    """Return one entry of ``player.historical_scores`` chosen uniformly at random."""
    history = player.historical_scores
    return random.choice(history)

def sample_score_weighted(player, decay_factor=0.9):
    """Draw one score from ``player.historical_scores``, weighted by position.

    The last entry has weight 1; each step towards the front of the list
    multiplies the weight by ``decay_factor``.
    """
    history = player.historical_scores
    last = len(history) - 1
    weights = [decay_factor ** (last - idx) for idx, _ in enumerate(history)]
    (picked,) = random.choices(history, weights=weights, k=1)
    return picked

In [None]:
# --- Test Sampling ---
# Draw one sample with each sampler (simple first, then weighted) for the example player
print("\n🧪 Sampling Test:")
for label, sampler in (("Simple", sample_score_simple), ("Weighted", sample_score_weighted)):
    print(f"{label} Sample for {example_player.name}: {sampler(example_player)}")

In [None]:
# --- Save player names (optional) ---
# Collect the name of every player, whether `players` is a dict or a plain iterable
if isinstance(players, dict):
    player_names = [p.name for p in players.values()]
else:
    player_names = [p.name for p in players]

In [None]:
def inspect_player_net_stats(player_name):
    """Print completed-round net score averages for one player.

    Rounds whose tournament name is "individual" (case-insensitive) are
    summarised separately from all other rounds. Only rounds marked completed
    with an int/float net score are counted. When ``players`` has no truthy
    entry for ``player_name`` a not-found message is printed and the function
    returns without further output.

    Args:
        player_name: Key to look up in the notebook-level ``players`` mapping.
    """
    import numpy as np

    player = players.get(player_name)
    if not player:
        print(f"❌ Player '{player_name}' not found.")
        return

    def _usable(rnd):
        # Completed round carrying a numeric net score
        return rnd.completed and isinstance(rnd.net, (int, float))

    indiv_nets, tour_nets = [], []
    for rnd in player.rounds:
        is_individual = rnd.tournament_name.lower() == "individual"
        if _usable(rnd):
            (indiv_nets if is_individual else tour_nets).append(rnd.net)

    print(f"\n📋 Net Score Summary for {player.name}")
    print(f"  ✅ Completed Individual Rounds: {len(indiv_nets)}")
    print(f"  ✅ Completed Tournament Rounds: {len(tour_nets)}")

    avg_indiv = np.mean(indiv_nets) if indiv_nets else None
    if avg_indiv is None:
        print("  🧍 No individual net scores found.")
    else:
        print(f"  🧍 Avg Individual Net: {avg_indiv:.2f}")

    avg_tour = np.mean(tour_nets) if tour_nets else None
    if avg_tour is None:
        print("  🏌️ No tournament net scores found.")
    else:
        print(f"  🏌️ Avg Tournament Net: {avg_tour:.2f}")

    # The difference is only meaningful when both averages exist
    if not (avg_indiv is None or avg_tour is None):
        print(f"  🔁 Difference (Individual - Tournament): {avg_indiv - avg_tour:.2f}")

In [None]:
# Print the individual-vs-tournament net score summary for one named player
inspect_player_net_stats("Mike Weiss")

In [None]:
def compare_net_stats_individual_vs_tournament(top_n=20, min_rounds=2):
    """Print a table of players ranked by (individual avg net - tournament avg net).

    A round counts when it is marked completed and its net score is an
    int/float. Rounds are split on ``tournament_flag`` being equal to True
    or to False; any other flag value is ignored. A player appears only with
    at least ``min_rounds`` rounds of each kind.

    Args:
        top_n: Maximum number of rows to print.
        min_rounds: Minimum individual AND tournament rounds required.
    """
    import numpy as np

    def _completed_nets(rounds, flag_value):
        # Equality (not truthiness) is deliberate: a flag of None matches neither side
        return [
            rnd.net for rnd in rounds
            if rnd.tournament_flag == flag_value and rnd.completed and isinstance(rnd.net, (int, float))
        ]

    player_diffs = []
    for member in players.values():
        indiv_nets = _completed_nets(member.rounds, False)
        tour_nets = _completed_nets(member.rounds, True)
        if len(indiv_nets) < min_rounds or len(tour_nets) < min_rounds:
            continue

        avg_indiv = np.mean(indiv_nets)
        avg_tour = np.mean(tour_nets)
        player_diffs.append(
            (member.name, avg_tour, avg_indiv, avg_indiv - avg_tour, len(tour_nets), len(indiv_nets))
        )

    # Largest signed difference (individual - tournament) first
    ranking = sorted(player_diffs, key=lambda row: row[3], reverse=True)

    header = "{:<5} {:<25} {:>15} {:>15} {:>12} {:>10} {:>10}".format(
        "Rank", "Player", "Avg Tournament", "Avg Individual", "Difference", "Tour Rds", "Indiv Rds"
    )
    print(f"\n📊 Top {top_n} Players with Largest Net Score Differences (Min {min_rounds} Rounds Each):\n")
    print(header)
    print("-" * len(header))

    row_format = "{:<5} {:<25} {:>15.2f} {:>15.2f} {:>12.2f} {:>10} {:>10}"
    for rank, row in enumerate(ranking[:top_n], start=1):
        print(row_format.format(rank, *row))

In [None]:
# --- Example Usage ---
# Echo registry sizes, then print the top-50 comparison table
# (players need at least 3 rounds of each kind to be listed)
print(f"players: {len(players)}, teams: {len(mm_teams)}, tournaments: {len(tournaments)}")

total_players = len(players)
compare_net_stats_individual_vs_tournament(top_n=50, min_rounds=3)

In [None]:
def calculate_and_print_player_net_stats(top_n=20, min_rounds=5):
    """Print players ordered by mean net score over completed rounds, lowest first.

    A round counts when its net score is an int/float and its ``completed``
    attribute is truthy (a missing attribute counts as not completed).

    Args:
        top_n: Maximum number of rows to print.
        min_rounds: Minimum number of counted rounds a player must have.
    """
    import numpy as np

    roster = players.values() if isinstance(players, dict) else players

    player_stats = []
    for member in roster:
        completed_nets = [
            rnd.net for rnd in member.rounds
            if isinstance(rnd.net, (int, float)) and getattr(rnd, "completed", False)
        ]
        if len(completed_nets) < min_rounds:
            continue
        player_stats.append(
            (member.name, np.mean(completed_nets), np.std(completed_nets), len(completed_nets))
        )

    # Ascending by average net score
    ranking = sorted(player_stats, key=lambda entry: entry[1])

    header = "{:<5} {:<25} {:>15} {:>12} {:>10}".format("Rank", "Player Name", "Avg Net Score", "Std Dev", "Rounds")
    print(f"\n🏌️ Top {top_n} Players by Average Net Score (Minimum {min_rounds} Completed Rounds):\n")
    print(header)
    print("-" * len(header))

    row_format = "{:<5} {:<25} {:>15.2f} {:>12.2f} {:>10}"
    for position, entry in enumerate(ranking[:top_n], start=1):
        print(row_format.format(position, *entry))

In [None]:
def print_cteam_sbf_summary(cteam):
    """Print a tree view of one CTeam's sandbag factors.

    For every MM team in ``cteam.mm_teams`` the output lists each player's
    ``sand_bag_factor`` and the team's ``aggregate_sand_bag_factor``, followed
    by the CTeam's own aggregate. Factors are shown with two decimals, or as
    the text "None" when the value is None.
    """
    def _fmt(value):
        return "None" if value is None else f"{value:.2f}"

    print(f"\n🟦 CTeam: {cteam.name}")
    for position, mm_team in enumerate(cteam.mm_teams, 1):
        print(f"  ├─ MMTeam {position}: {mm_team.name}")
        for member in mm_team.players:
            member_text = _fmt(member.sand_bag_factor)
            print(f"  │   ├─ Player: {member.name}, SBF: {member_text}")
        mm_text = _fmt(mm_team.aggregate_sand_bag_factor)
        print(f"  │   └─ MMTeam Aggregate SBF: {mm_text}")
    cteam_text = _fmt(cteam.aggregate_sand_bag_factor)
    print(f"  └─ CTeam Aggregate SBF (best 2 MMTeams): {cteam_text}")


In [None]:

# --- Example Usage ---
# Echo registry sizes, then print the 50 lowest average net scores
# among players with at least 4 counted rounds
print(f"players: {len(players)}, teams: {len(mm_teams)}, tournaments: {len(tournaments)}")
calculate_and_print_player_net_stats(top_n=50, min_rounds=4)

In [None]:
def rank_cteams_by_sandbag_factor(c_teams, top_n=10):
    """
    Refresh sandbag factors and print the CTeams with the lowest aggregate values.

    Args:
        c_teams (dict[str, CTeam]): Mapping of CTeam names to CTeam objects.
        top_n (int): Maximum number of teams to print, lowest
            aggregate_sand_bag_factor first.
    """
    all_cteams = list(c_teams.values())

    # Refresh every MMTeam aggregate before any CTeam aggregate is computed
    for cteam in all_cteams:
        for mm_team in cteam.mm_teams:
            mm_team.compute_aggregate_sand_bag_factor()

    # Refresh each CTeam and report those left without a factor
    for cteam in all_cteams:
        cteam.compute_aggregate_score()
        if cteam.aggregate_sand_bag_factor is None:
            print(f"team: {cteam.name} team sb : None")

    # Only teams with a factor can be ranked; lowest value comes first
    ranked = sorted(
        (cteam for cteam in all_cteams if cteam.aggregate_sand_bag_factor is not None),
        key=lambda cteam: cteam.aggregate_sand_bag_factor,
    )

    print(f"\n🏆 Top {min(top_n, len(ranked))} CTeams by Sandbag Factor:")
    for cteam in ranked[:top_n]:
        print_cteam_sbf_summary(cteam)


In [None]:
def rank_mmteams_by_sandbag_factor(c_teams, top_n=10):
    """
    Compute and rank MMTeams by their aggregate sand_bag_factor.

    The MMTeams are gathered from every CTeam in ``c_teams``; an MMTeam object
    that appears under more than one CTeam is ranked only once.

    Args:
        c_teams (dict[str, CTeam]): Mapping of CTeam names to CTeam objects.
        top_n (int): Number of MMTeams to print (sorted by lowest
            aggregate_sand_bag_factor).
    """
    # Step 1: Collect each distinct MMTeam and refresh its aggregate factor
    seen_ids = set()
    mm_team_pool = []
    for cteam in c_teams.values():
        for mm_team in cteam.mm_teams:
            if id(mm_team) in seen_ids:
                continue
            seen_ids.add(id(mm_team))
            mm_team.compute_aggregate_sand_bag_factor()
            mm_team_pool.append(mm_team)

    # Step 2: Report MMTeams that cannot be ranked
    for mm_team in mm_team_pool:
        if mm_team.aggregate_sand_bag_factor is None:
            print(f"team: {mm_team.name} team sb : None")

    # Step 3: Rank the rest, lowest factor first
    ranked = sorted(
        (m for m in mm_team_pool if m.aggregate_sand_bag_factor is not None),
        key=lambda m: m.aggregate_sand_bag_factor,
    )

    # Step 4: Display each MMTeam with its players' individual factors
    print(f"\n🏆 Top {min(top_n, len(ranked))} MMTeams by Sandbag Factor:")
    for rank, mm_team in enumerate(ranked[:top_n], start=1):
        print(f"\n🟩 {rank}. MMTeam: {mm_team.name}")
        for player in mm_team.players:
            sbf = player.sand_bag_factor
            sbf_str = f"{sbf:.2f}" if sbf is not None else "None"
            print(f"  ├─ Player: {player.name}, SBF: {sbf_str}")
        print(f"  └─ MMTeam Aggregate SBF: {mm_team.aggregate_sand_bag_factor:.2f}")


In [None]:
# Echo registry sizes, then print the 20 CTeams with the lowest aggregate sandbag factor
print(f"Loaded {len(mm_teams)} mm teams and {len(c_teams)} c teams")
print(f"Loaded {len(players)} players and {len(tournaments)} tournaments")

rank_cteams_by_sandbag_factor(c_teams, top_n=20)


In [None]:
# Count how many players, MM teams and C teams currently carry a sandbag factor

valid_players = [member for member in players.values() if member.sand_bag_factor is not None]
print(f"✅ {len(valid_players)} players have valid sand_bag_factor")

valid_mm_teams = [team for team in mm_teams.values() if team.aggregate_sand_bag_factor is not None]
print(f"✅ {len(valid_mm_teams)} MMTeams have valid aggregate_sand_bag_factor")

valid_c_teams = [team for team in c_teams.values() if team.aggregate_sand_bag_factor is not None]
print(f"✅ {len(valid_c_teams)} CTeams have valid aggregate_sand_bag_factor")




In [None]:
# List MM teams where at least one of the two players has no sand_bag_factor
for name, mm in mm_teams.items():
    p1, p2 = mm.players
    if any(member.sand_bag_factor is None for member in (p1, p2)):
        print(f"❌ {name}: {p1.name}={p1.sand_bag_factor}, {p2.name}={p2.sand_bag_factor}")


In [None]:
# Recompute both players' factors for every MM team, then list the teams that
# still have a player without one.
# NOTE(review): compute_sand_bag_factor() is called with its default arguments
# here, while the load cell used compute_all_sandbag_factors(players, 2, .5) —
# confirm the defaults match, since this overwrites the earlier values.
for name, mm in mm_teams.items():
    p1, p2 = mm.players
    p1.compute_sand_bag_factor()
    p2.compute_sand_bag_factor()
    if any(member.sand_bag_factor is None for member in (p1, p2)):
        print(f"❌ {name}: {p1.name}={p1.sand_bag_factor}, {p2.name}={p2.sand_bag_factor}")

In [None]:
# Run the score randomness test for one player and, when the "p_mwu" value is
# below 0.05, print the sandbag report as well
player_name = "Robby Tonkin"
player = players.get(player_name)
if player:
    result = score_randomness_test(player)
    print(result)
    # A missing "p_mwu" entry is treated as 1, i.e. no report
    p_mwu = result.get("p_mwu", 1)
    if p_mwu < 0.05:
        print(f"{sandbag_report(result,player_name)}")
else:
    print(f"❌ Player '{player_name}' not found.")

In [None]:

# Print the sandbagger listing from golf_utils for n=30 and alpha=0.75
# NOTE(review): the meaning of `alpha` is defined in golf_utils, not visible here
print_top_sandbaggers(players, n=30, alpha=0.75)

In [None]:
# Jupyter Notebook Cell: Debug a Single Player with Detailed Averages

# 1) Inputs: who to inspect and the thresholds to apply
player_name = "Mike Weiss"    # any key of the `players` dict
min_rounds = 5
scale_factor = 0.5

# 2) Look the player up (an unknown name raises KeyError)
player = players[player_name]

# 3) Net scores split by tournament flag; rounds without a net score are ignored
t_scores = [rnd.net for rnd in player.rounds if rnd.tournament_flag and rnd.net is not None]
c_scores = [rnd.net for rnd in player.rounds if not rnd.tournament_flag and rnd.net is not None]

# 4) Plain averages; NaN stands in when a list is empty
if t_scores:
    mean_tourn = sum(t_scores) / len(t_scores)
else:
    mean_tourn = float('nan')
if c_scores:
    mean_casual = sum(c_scores) / len(c_scores)
else:
    mean_casual = float('nan')

# 5) Raw delta, scaled down when either side has fewer than min_rounds rounds
raw_delta = mean_tourn - mean_casual
did_scale = min(len(t_scores), len(c_scores)) < min_rounds
applied_scale = scale_factor if did_scale else 1.0
final_delta = raw_delta * applied_scale

# 6) Report every intermediate value
print(f"🔍 Debugging {player.name}")
print(f"  • Tournament rounds : {len(t_scores)}   average score = {mean_tourn:.2f}")
print(f"  • Casual rounds     : {len(c_scores)}   average score = {mean_casual:.2f}")
print(f"  • Raw delta (tour - casual)           = {raw_delta:.2f}")
print(f"  • Meets min_rounds?                    {(not did_scale)}  (min_rounds = {min_rounds})")
print(f"  • Scale factor applied                = {applied_scale}")
print(f"  • Final sand_bag_factor               = {final_delta:.2f}")

# 7) Store the result on the Player object (this overwrites any earlier value)
player.sand_bag_factor = final_delta

In [None]:
# Look up the Player object (an unknown name raises KeyError)
player = players["Mike Weiss"]

# Recompute via the Player's own debug-printing method with explicit thresholds
player.compute_sand_bag_factor_print(min_rounds=5, scale_factor=0.5)

# Show the value now stored on the player
print(player.name, "→ sand_bag_factor =", player.sand_bag_factor)

In [None]:
# Debug one player using completed rounds with an int/float net score only
player_name = "Mike Weiss"
min_rounds = 2       # threshold below which the delta is scaled
scale_factor = 0.5   # multiplier used when the threshold is not met

player = players[player_name]

# Tournament vs. casual net scores, split on the truthiness of tournament_flag
t_scores = [
    rnd.net for rnd in player.rounds
    if rnd.tournament_flag and rnd.completed and isinstance(rnd.net, (int, float))
]
c_scores = [
    rnd.net for rnd in player.rounds
    if not rnd.tournament_flag and rnd.completed and isinstance(rnd.net, (int, float))
]

# Plain averages; NaN stands in when a list is empty
if t_scores:
    mean_tourn = sum(t_scores) / len(t_scores)
else:
    mean_tourn = float('nan')
if c_scores:
    mean_casual = sum(c_scores) / len(c_scores)
else:
    mean_casual = float('nan')

raw_delta = mean_tourn - mean_casual
did_scale = min(len(t_scores), len(c_scores)) < min_rounds
applied_scale = scale_factor if did_scale else 1.0
final_delta = raw_delta * applied_scale

print(f"🔍 Debugging {player.name}")
print(f"  • Tournament rounds : {len(t_scores)}   average = {mean_tourn:.2f}")
print(f"  • Casual rounds     : {len(c_scores)}   average = {mean_casual:.2f}")
print(f"  • Raw (tour – casual)= {raw_delta:.2f}")
print(f"  • Meets min_rounds?   {(not did_scale)}  (min_rounds={min_rounds})")
print(f"  • Applied scale?      {applied_scale}")
print(f"  • sand_bag_factor     = {final_delta:.2f}")


In [None]:
# Jupyter Notebook Cell: Match Names, Show sand_bag_factor, and Include Net‐Stats Comparison

import pandas as pd
import numpy as np
import importlib
#importlib.reload(ace_tools)
#import ace_tools as tools

# 1. Names to look up in `players`. Entries are matched by exact string
#    equality against the dictionary keys further down in this cell.
names = [
"Eric Browne", "Jeff Eulberg", "Jim Foody", "Nicholas Oki", "Jeff Johnson",
"David McAughan", "Tony Johnson", "James Bennett", "Blake Knox", "Brad Thoreson",
"Rick Barnett", "Greg Richards", "Jon Reingold", "Matt Tabler", "Patrick Angelel",
"Mark Laura", "Kevin Oakes", "Joe Lynch", "Mark Bath", "Tom Peters",
"Brandon Odom", "Johnathon Laura", "Lance Rounds", "Colin Mooney", "Damon Nasman",
"Alexander Oki", "Augie Johnson", "Craig Bevan", "Mike Crosetto", "Steven McKean",
"Kent Fisher", "Gary Furukawa", "Ike Lee", "James Ryder", "Bruce Burks",
"Will Nelson", "Jim Watson", "John Andrews", "Jim Ridgeway", "Joe Razore",
"Michael Angiulo", "Omar Mawjee", "John Harnish", "Brock Meitl", "John Culver",
"Wyatt Hay", "Tim O'Keefe", "Alexander Kruse", "Tom Weeks", "Chris Taylor",
"Ray Szylko", "Matt Yerbic", "Jack Bumgardner", "Dave West", "Michael Heijer",
"Ben Johnson", "Mike Weiss", "Steven Petrie", "Bradley Scott", "Yusuf Mehdi",
"Robin Easton", "John Bentz", "Bob Whitsitt", "Jason Kono", "Peter Woodward",
"Nick Rensch", "Jason Furr", "Cam Warren", "Al Clise", "Hal Wright",
"Justin Madison", "Todd Parsons", "Bill Reller", "Bijal Shah", "William Richter",
"Timothy Parker", "Brad Pratt", "Jimmy Lake Jr.", "Kumar Mehta", "Scott Whittlesey",
"Jeff Roberts", "Mitch Mounger", "Andrew Moore", "Blake Bentz", "Sean Whitsitt",
"Isaac Hammer", "Matt Nieman", "Christian Gerron", 
"Ray Sato", "Larry Meitl", "Tracy Quickstad", "Craig Suhrbier", "Greg McNabb",
"Steve Savard", "Rod Olson", "Karlen Rothenbueler", "Pat Chun", "Robert Bach",
"Peter Faricy", "Jj Sato", "Mark Pigott", "John Murray", "Tim Tasker",
"Carl Lovsted", "Robin Sloane", "Jack Savard", "Steve Poore", "Ryan Evans",
"Cole Thompson", "Jamie Nordstrom", "Paul Nicely", "Pete Bryant", "Michael Crowson",
"Robert Manlowe", "Seth Patton", "Mike Wiesmann", "Chris Steffanci", "Kevin Hughes",
"Marin Guillet", "Tim Rattigan", "David Paisley", "John Thayer", "Rick Applegate",
"Darren Alger", "Jim Freer", "Rob Schwartz", "Matt Nickerson", "Greg Komen",
"Andy Tonkin", "Connor McClain", "Michael Raine", "Tony Bacon", "Nicholas Malshuk",
"Roger Kuula", "Mark Tsang", "Franklin Fite", "Jim Neal", "Brian Biege",
"Damon Huard", "Henry Albrecht", "Bob Nieman", "Alex Smith", "Lew Thayer",
"Kaine Kornegay", "Todd Marker", "Stan Freimuth", "Satoshi Nakajima", "Fred Devereux",
"Steven Goldfarb", "Robby Tonkin", "Colin Dresser", "Dean Poplawski", "Jay Timpani",
"James Wantuck", "Benjamin Kuula", "Stein Kruse", "Duke Moscrip", "Jacob Bond",
"Chris Petersen","Mark Hagerty", "Kevin Cipoletti", "Shane Kim", "Michael Fuller",
"Taylor Saari", "Joe Deasy", "Andy Sather", "Perry Satterlee", "Larry Steele",
"Ron Bayley", "Mike McKay", "Mark Freeborn", "Michael de Broglio"
]


# 2. Per-player net-score averages over completed rounds with an int/float net,
#    split on the truthiness of tournament_flag. Requires `players` to be loaded.
min_rounds = 3
net_stats = {}

for player in players.values():
    # Individual (non-tournament) net scores
    indiv_nets = [
        rnd.net for rnd in player.rounds
        if (not rnd.tournament_flag) and rnd.completed and isinstance(rnd.net, (int, float))
    ]
    # Tournament net scores
    tour_nets = [
        rnd.net for rnd in player.rounds
        if rnd.tournament_flag and rnd.completed and isinstance(rnd.net, (int, float))
    ]

    # Averages are reported only with enough rounds of both kinds; otherwise
    # NaN is stored and only the round counts are meaningful
    have_enough = len(indiv_nets) >= min_rounds and len(tour_nets) >= min_rounds
    avg_indiv = np.mean(indiv_nets) if have_enough else np.nan
    avg_tour = np.mean(tour_nets) if have_enough else np.nan

    net_stats[player.name] = {
        "avg_tour": avg_tour,
        "avg_indiv": avg_indiv,
        "diff": (avg_indiv - avg_tour) if have_enough else np.nan,
        "tour_rounds": len(tour_nets),
        "indiv_rounds": len(indiv_nets),
    }

# 3. Names from the list that are exact keys of `players`
matched_names = [n for n in names if n in players]

# 4. The corresponding Player objects, in list order
matched_players = [players[n] for n in matched_names]

# 5. One record per matched player: name, sand_bag_factor (0 when None) and the
#    net_stats fields (NaN / 0 when the player has no net_stats entry)
records = [
    {
        "Name": p.name,
        "sand_bag_factor": 0 if p.sand_bag_factor is None else p.sand_bag_factor,
        "avg_tour": net_stats.get(p.name, {}).get("avg_tour", np.nan),
        "avg_indiv": net_stats.get(p.name, {}).get("avg_indiv", np.nan),
        "diff": net_stats.get(p.name, {}).get("diff", np.nan),
        "tour_rounds": net_stats.get(p.name, {}).get("tour_rounds", 0),
        "indiv_rounds": net_stats.get(p.name, {}).get("indiv_rounds", 0),
    }
    for p in matched_players
]

df_matches = pd.DataFrame(records)

# 6. Sort by sand_bag_factor ascending, so the lowest values appear first
df_sorted = df_matches.sort_values("sand_bag_factor").reset_index(drop=True)

# 7. Display the top results (adjust `head(20)` as needed to see more rows):

#tools.display_dataframe_to_user(
 #   "Matched Players with sand_bag_factor and Net Stats", 
 #   df_sorted.head(20)
#)
# Then display with scrolling:
from IPython.display import display, HTML

def show_scrollable_df(df, max_height=400):
    """Display ``df`` as an HTML table (index hidden) inside a scrollable box.

    Args:
        df: DataFrame to render via ``to_html``.
        max_height: Height of the box in pixels; taller content scrolls.
    """
    container_style = (
        f"height:{max_height}px; overflow:auto; "
        "border:1px solid #ccc; padding:5px; margin-bottom:10px;"
    )
    table_markup = df.to_html(index=False)
    display(HTML(f'<div style="{container_style}">{table_markup}</div>'))

# 7. Show up to the first 100 sorted rows in a 500px-high scrollable box
show_scrollable_df(df_sorted.head(100), max_height=500)


In [None]:
# Jupyter Notebook Cell: Filter by Minimum Rounds & Export for Text‐Editor

import pandas as pd
import numpy as np
from IPython.display import display, HTML

# --- Keep only players with at least min_rounds rounds of each kind ---
df_filtered = df_sorted[
    (min_rounds <= df_sorted["tour_rounds"]) & (min_rounds <= df_sorted["indiv_rounds"])
]
df_filtered = df_filtered.reset_index(drop=True)

# --- Scrollable display ---
def show_scrollable_df(df, max_height=400):
    """Display ``df`` as an HTML table (index hidden) inside a scrollable box.

    Args:
        df: DataFrame to render via ``to_html``.
        max_height: Height of the box in pixels; taller content scrolls.
    """
    box_style = (
        f"height:{max_height}px; overflow:auto; "
        "border:1px solid #ccc; padding:5px; margin-bottom:10px;"
    )
    rendered_table = df.to_html(index=False)
    display(HTML(f'<div style="{box_style}">{rendered_table}</div>'))

# Show up to the first 100 filtered rows in a 500px-high scrollable box
show_scrollable_df(df_filtered.head(100), max_height=500)

# --- Export to CSV (or TSV) for loading in a text editor ---
# NOTE(review): absolute, machine-specific path; the rest of the notebook
# derives locations from the working directory — consider doing the same here.
output_csv = "/home/justin/JustInternetAI/Calcutta/matched_players_filtered.csv"
df_filtered.to_csv(output_csv, index=False)
print(f"✅ Filtered results ({len(df_filtered)} rows) saved to: {output_csv}")

In [None]:
# Jupyter Notebook Cell: Find how many names from your list are not in the `players` dictionary





# 1. `players` is the existing dictionary of Player objects, e.g.
#    players = { "Eric Browne": Player(...), ... }

# 2. Names from the list that have no exact key in `players`
missing_names = [n for n in names if n not in players]

# 3. Print the count, then each missing name with its closest candidates
print(f"Number of names from your list not in the dataset: {len(missing_names)}\n")
print("Missing names:")
for name in missing_names:
    print(f" • {name}")
    matches = get_close_player_matches(name, players)
    if matches:
        print("🔍 Close matches:")
        # Distinct loop names keep the outer `name` and the notebook-level
        # `player` from being overwritten by the candidates.
        for score, match_name, match_player in matches:
            print(f"  {match_name} (score={score:.2f}) — {match_player.name}")
    else:
        print("❌ No close matches found.")

In [None]:

# 1. Replacement list of names. Several entries are spelled differently from
#    the list used in the net-stats cell (e.g. "Dave McAughan" here vs.
#    "David McAughan" there), so this rebinding changes which names match.
names = [
    "Eric Browne", "Jeff Eulberg", "Jim Foody", "Nicholas Oki", "Jeff Johnson",
    "Dave McAughan", "Tony Johnson", "Jim Bennett", "Blake Knox", "Brad Thoreson",
    "Rick Barnett", "Greg Richards", "Jon Reingold", "Matt Tabler", "Patrick Angelet",
    "Mark Laura", "Kevin Oakes", "Joe Lynch", "Mark Bath", "Tom Peters",
    "Brandon Odom", "Johnathon Laura", "Lance Rounds", "Colin Mooney", "Damon Nasman",
    "Alexander Oki", "Augie Johnson", "Craig Bevan", "Mike Crosetto", "Steven McKean",
    "Kent Fisher", "Gary Furukawa", "Ike Lee", "James Ryder", "Bruce Burks",
    "Will Nelson", "Jim Watson", "John Andrews", "Jim Ridgeway", "Joe Razore",
    "Mike Angiulo", "Omar Mawjee", "John Harnish", "Brock Meitl", "John Culver",
    "Wyatt Hay", "Tim O'Keefe", "Alexander Kruse", "Tom Weeks", "Chris Taylor",
    "Ray Szylko", "Matt Yerbic", "Jack Bumgardner", "Dave West", "Michael Heijer",
    "Ben Johnson", "Mike Weiss", "Steven Petrie", "Brad Scott", "Yusuf Mehdi",
    "Robin Easton", "John Bentz", "Bob Whitsitt", "Jason Kono", "Peter Woodward",
    "Nick Rensch", "Jason Furr", "Cam Warren", "Al Clise", "Hal Wright",
    "Justin Madison", "Todd Parsons", "Bill Reller", "Bijal Shah", "William Richter",
    "Tim Parker", "Brad Pratt", "Jimmy Lake", "Kumar Mehta", "Scott Whittlesey",
    "Jeff Roberts", "Mitch Mounger", "Andrew Moore", "Blake Bentz", "Sean Whittlesitt",
    "Isaac Hammer", "Matt Nieman", "Christian Gerron", 
    "Ray Sato", "Larry Meitl", "Tracy Quickstad", "Craig Suhrbier", "Greg McNabb",
    "Steve Savard", "Rod Olson", "Karlen Rothenbueler", "Pat Chun", "Robbie Bach",
    "Peter Faricy", "JJ Sato", "Mark Pigott", "John Murray", "Tim Tasker",
    "Carl Lovsted", "Robin Sloane", "Jack Savard", "Steve Poore", "Ryan Evans",
    "Cole Thompson", "Jamie Nordstrom", "Paul Nicely", "Peter Bryant", "Michael Crowson",
    "Bob Manlowe", "Seth Patton", "Mike Wiesmann", "Chris Steffanci", "Kevin Hughes",
    "Marin Guillet", "Tim Rattigan", "Dave Paisley", "John Thayer", "Rick Applegate",
    "Darren Alger", "Jim Freer", "Rob Schwartz", "Matt Nickerson", "Greg Komen",
    "Andy Tonkin", "Connor McClain", "Michael Raine", "Tony Bacon", "Nick Malshuk",
    "Roger Kuula", "Mark Tsang", "Frank Fite", "Jim Neal", "Brian Biege",
    "Damon Huard", "Henry Albrecht", "Bob Nieman", "Alex Smith", "Lew Thayer",
    "Kaine Kornegay", "Todd Marker", "Stan Freimuth", "Satoshi Nakajima", "Fred Devereux",
    "Steven Goldfarb", "Robby Tonkin", "Colin Dresser", "Dean Poplawski", "Jay Timpani",
    "James Wantuck", "BJ Kuula", "Stein Kruse", "Duke Moscrip", "Jake Bond",
    "Chris Petersen","Mark Hagerty", "Kevin Cipoletti", "Shane Kim", "Michael Fuller",
    "Taylor Saari", "Joe Deasy", "Andy Sather", "Perry Satterlee", "Larry Steele",
    "Ron Bayley", "Mike McKay", "Mark Freeborn", "Michael de Broglio"
]

In [None]:

from PIL import Image
import pytesseract
import re

# 2) Starting list of names that OCR results are appended to further down.
#    This rebinds `names` to a shorter list than the previous cell's
#    (it ends at "Christian Gerron") with a few different spellings
#    (e.g. "Jimbo Foody", "Jim Ryder", "Bill Richter").
names = [
    "Eric Browne", "Jeff Eulberg", "Jimbo Foody", "Nicholas Oki", "Jeff Johnson",
    "Dave McAughan", "Tony Johnson", "Jim Bennett", "Blake Knox", "Brad Thoreson",
    "Rick Barnett", "Greg Richards", "Jon Reingold", "Matt Tabler", "Patrick Angelet",
    "Mark Laura", "Kevin Oakes", "Joe Lynch", "Mark Bath", "Tom Peters",
    "Brandon Odom", "Johnathon Laura", "Lance Rounds", "Colin Mooney", "Damon Nasman",
    "Alexander Oki", "Augie Johnson", "Craig Bevan", "Mike Crosetto", "Steven McKean",
    "Kent Fisher", "Gary Furukawa", "Ike Lee", "Jim Ryder", "Bruce Burks",
    "Will Nelson", "Jim Watson", "John Andrews", "Jim Ridgeway", "Joe Razore",
    "Mike Angiulo", "Omar Mawjee", "John Harnish", "Brock Meitl", "John Culver",
    "Wyatt Hay", "Tim O'Keefe", "Alexander Kruse", "Tom Weeks", "Chris Taylor",
    "Ray Szylko", "Matt Yerbic", "Jack Bumgardner", "Dave West", "Michael Heijer",
    "Ben Johnson", "Mike Weiss", "Steven Petrie", "Brad Scott", "Yusuf Mehdi",
    "Robin Easton", "John Bentz", "Bob Whitsitt", "Jason Kono", "Peter Woodward",
    "Nick Rensch", "Jason Furr", "Cam Warren", "Al Clise", "Hal Wright",
    "Justin Madison", "Todd Parsons", "Bill Reller", "Bijal Shah", "Bill Richter",
    "Tim Parker", "Brad Pratt", "Jimmy Lake", "Kumar Mehta", "Scott Whittlesey",
    "Jeff Roberts", "Mitch Mounger", "Andrew Moore", "Blake Bentz", "Sean Whittlesitt",
    "Isaac Hammer", "Matt Nieman", "Christian Gerron"
]

# 3) OCR the tee-sheet image (adjust the path if needed)
img = Image.open("/home/justin/JustInternetAI/Calcutta/Data/DirtyData/MM2ndtee.png")
raw = pytesseract.image_to_string(img)

# 4) Non-empty OCR lines with surrounding whitespace removed
lines = [stripped for stripped in map(str.strip, raw.splitlines()) if stripped]

# 5) Keep only lines that look like capitalised words separated by single
#    spaces (lowercase letters and apostrophes after each initial capital)
name_pattern = re.compile(r"^[A-Z][a-z']+(?: [A-Z][a-z']+)+$")
new_names = list(filter(name_pattern.match, lines))

# 6) Names not already present in `names`
to_add = [candidate for candidate in new_names if candidate not in names]

# 7) Append them to the existing list in place
names += to_add

# 8) Report results
print(f"Found {len(new_names)} names in image, {len(to_add)} new.")
print("Newly added:")
for n in to_add:
    print(" •", n)

In [None]:
# Jupyter Notebook Cell: OCR + Debug + Append New Names

from PIL import Image
import pytesseract
import re

# 1) Load the image (adjust path if needed)
img = Image.open("/home/justin/JustInternetAI/Calcutta/Data/DirtyData/MM2ndtee.png")

# 2) Run OCR
raw_text = pytesseract.image_to_string(img)

# 3) Split into non-empty lines
lines = [line.strip() for line in raw_text.splitlines() if line.strip()]

# 4) Show all OCR lines, each wrapped in double quotes with a trailing comma
print("🔍 All OCR lines:")
for line in lines:
    print(f'"{line}",')

# 5) Regex for "First Last" style names (allows apostrophes, hyphens)
pattern = re.compile(r"^[A-Z][a-zA-Z'’\-]+(?: [A-Z][a-zA-Z'’\-]+)+$")

# 6) Split into matched vs dropped
matched = [line for line in lines if pattern.match(line)]
dropped = [line for line in lines if not pattern.match(line)]

# 7) Print matched and dropped
print(f"\n✅ Matched names ({len(matched)}):")
for name in matched:
    print(" ,", name)

print(f"\n❌ Dropped lines ({len(dropped)}):")
for item in dropped:
    print(" ", item)

# 8) Identify which matched names are new to the existing `names` list
to_add = [name for name in matched if name not in names]

# 9) Append in place and report
names.extend(to_add)
print(f"\n➕ Added {len(to_add)} new names:")
for name in to_add:
    print(" ,", name)

In [None]:
# Jupyter Notebook Cell: OCR + Print Matched & Dropped Names in Quotes (5 per Line)

from PIL import Image
import pytesseract
import re

# 1) Load and OCR the image
img = Image.open("/home/justin/JustInternetAI/Calcutta/Data/DirtyData/MM2ndtee.png")
raw_text = pytesseract.image_to_string(img)

# 2) Split into non-empty lines
lines = [line.strip() for line in raw_text.splitlines() if line.strip()]

# 3) Regex for "First Last" style names (letters, apostrophes and hyphens).
#    The hyphen is escaped with a single backslash: in a raw string a doubled
#    backslash would also admit a literal backslash into the character class.
pattern = re.compile(r"^[A-Z][a-zA-Z'’\-]+(?: [A-Z][a-zA-Z'’\-]+)+$")

# 4) Separate matched vs dropped
matched = [line for line in lines if pattern.match(line)]
dropped = [line for line in lines if not pattern.match(line)]

# 5) Print matched names in quotes, 5 per line
print("✅ Matched names:")
for i in range(0, len(matched), 5):
    chunk = matched[i:i+5]
    print(" ".join(f'"{name}",' for name in chunk))

# 6) Print dropped lines in quotes, 5 per line
print("\n❌ Dropped lines:")
for i in range(0, len(dropped), 5):
    chunk = dropped[i:i+5]
    print(" ".join(f'"{item}",' for item in chunk))