In [14]:
import os
import gc
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import brier_score_loss
from sklearn.calibration import CalibratedClassifierCV
import warnings
warnings.filterwarnings('ignore')

# Set GPU acceleration for XGBoost and LightGBM
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

def log_time(func):
    """Decorator to log function execution time"""
    def wrapper(*args, **kwargs):
        start = time.time()
        print(f"Starting {func.__name__}...")
        result = func(*args, **kwargs)
        elapsed = time.time() - start
        print(f"Completed {func.__name__} in {elapsed:.2f} seconds")
        return result
    return wrapper

@log_time
def train_cv_model(X, y, model_type='xgb', n_folds=5):
    """Train a model with cross-validation for better generalization"""
    # Ensure numeric data types
    X = X.astype('float32')  # Use float32 to reduce memory usage
    
    # Initialize cross-validation
    kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    
    # Model parameters
    if model_type == 'xgb':
        # For newer XGBoost versions (>= 1.6.0)
        params = {
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'tree_method': 'gpu_hist',  # GPU acceleration
            'predictor': 'gpu_predictor',  # GPU acceleration
            'learning_rate': 0.05,
            'max_depth': 5,
            'n_estimators': 200,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            # Early stopping is now here for newer XGBoost
            'early_stopping_rounds': 20
        }
    elif model_type == 'lgb':
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'device': 'gpu',  # GPU acceleration
            'learning_rate': 0.05,
            'max_depth': 5,
            'n_estimators': 200,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42
        }
    
    # Train models with cross-validation
    models = []
    fold_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        print(f"Training fold {fold+1}/{n_folds}")
        
        try:
            if model_type == 'xgb':
                # Try with newer XGBoost API
                model = xgb.XGBClassifier(**params)
                # Handle both newer and older XGBoost versions
                try:
                    model.fit(
                        X_train, y_train,
                        eval_set=[(X_val, y_val)],
                        verbose=100
                    )
                except TypeError:
                    # For older XGBoost versions that expect early_stopping_rounds in fit()
                    old_params = params.copy()
                    if 'early_stopping_rounds' in old_params:
                        early_stopping = old_params.pop('early_stopping_rounds')
                        model = xgb.XGBClassifier(**old_params)
                        model.fit(
                            X_train, y_train,
                            eval_set=[(X_val, y_val)],
                            early_stopping_rounds=early_stopping,
                            verbose=100
                        )
                    
            elif model_type == 'lgb':
                model = lgb.LGBMClassifier(**params)
                
                # Try different approaches for LightGBM based on version
                try:
                    # Try with callbacks approach (newer LightGBM)
                    callbacks = [lgb.early_stopping(20)]
                    model.fit(
                        X_train, y_train,
                        eval_set=[(X_val, y_val)],
                        callbacks=callbacks,
                        verbose=100
                    )
                except (TypeError, AttributeError):
                    try:
                        # Try standard approach with early_stopping_rounds
                        model.fit(
                            X_train, y_train,
                            eval_set=[(X_val, y_val)],
                            early_stopping_rounds=20,
                            verbose=100
                        )
                    except TypeError:
                        # Last resort: just fit without early stopping
                        print("Fitting LightGBM without early stopping")
                        model.fit(
                            X_train, y_train,
                            eval_set=[(X_val, y_val)],
                            verbose=100
                        )
                
            # Calibrate probabilities
            calibrated_model = CalibratedClassifierCV(model, cv='prefit')
            calibrated_model.fit(X_val, y_val)
            
            # Evaluate
            y_pred = calibrated_model.predict_proba(X_val)[:, 1]
            brier = brier_score_loss(y_val, y_pred)
            print(f"Fold {fold+1} Brier Score: {brier:.6f}")
            
            models.append(calibrated_model)
            fold_scores.append(brier)
            
        except Exception as e:
            print(f"Error in fold {fold+1}: {e}")
            print("Continuing with next fold...")
            continue
    
    if not models:
        raise Exception("All model training attempts failed. Check your data and parameters.")
    
    print(f"Mean Brier Score: {np.mean(fold_scores):.6f}")
    
    # Return both the list of models and their scores
    return models, fold_scores

def main():
    """Main function to restart training with only XGBoost"""
    start_time = time.time()
    print("Restarting March Madness prediction with XGBoost only")
    
    # Configure paths - adjust this to your data location
    base_dir = "./data/"
    output_dir = "./"
    
    try:
        # Load the pre-processed team stats files
        print("\n---------- Loading Pre-processed Stats ----------")
        m_team_stats = pd.read_csv(f"{output_dir}m_team_stats.csv")
        print(f"Loaded men's team stats with {len(m_team_stats)} rows")
        
        w_team_stats = pd.read_csv(f"{output_dir}w_team_stats.csv")
        print(f"Loaded women's team stats with {len(w_team_stats)} rows")
        
        # Load minimal data needed for training
        print("Loading minimal data files needed for prediction...")
        data = {}
        files = {
            "m_reg_results": f"{base_dir}MRegularSeasonCompactResults.csv",
            "w_reg_results": f"{base_dir}WRegularSeasonCompactResults.csv",
            "m_tourney_results": f"{base_dir}MNCAATourneyCompactResults.csv",
            "w_tourney_results": f"{base_dir}WNCAATourneyCompactResults.csv"
        }
        
        # Define datatypes to reduce memory usage
        dtypes = {
            'Season': 'int16',
            'DayNum': 'int16',
            'WTeamID': 'int16',
            'LTeamID': 'int16',
            'WScore': 'int16',
            'LScore': 'int16',
            'NumOT': 'int8'
        }
        
        for name, file_path in files.items():
            try:
                print(f"Loading {name}...")
                data[name] = pd.read_csv(file_path, dtype=dtypes)
            except Exception as e:
                print(f"Error loading {name}: {e}")
                data[name] = None
        
        # Create training data for men
        print("\n---------- Training Men's Model ----------")
        # Combine regular season and tournament results
        m_all_results = pd.concat([data['m_reg_results'], data['m_tourney_results']])
        
        # Create matchup features only for seasons where we have results (to save time)
        train_seasons = m_all_results['Season'].unique()
        m_train_stats = m_team_stats[m_team_stats['Season'].isin(train_seasons)]
        
        # Try to load pre-computed matchups if available
        try:
            print("Checking for pre-computed men's matchups...")
            m_matchups = pd.read_csv(f"{output_dir}m_matchups.csv")
            print(f"Loaded pre-computed men's matchups with {len(m_matchups)} rows")
        except FileNotFoundError:
            print("Creating matchup features for training...")
            
            # Create features for each season
            seasons = sorted(m_train_stats['Season'].unique())
            all_matchups = []
            
            for season in seasons:
                print(f"Creating matchups for season {season}")
                season_stats = m_train_stats[m_train_stats['Season'] == season]
                
                # Get all teams in the season
                teams = season_stats['TeamID'].unique()
                total_matchups = len(teams) * (len(teams) - 1) // 2
                print(f"Creating {total_matchups} matchups for {len(teams)} teams")
                
                matchups = []
                for i, team1_id in enumerate(teams):
                    for team2_id in teams[i+1:]:
                        team1_stats = season_stats[season_stats['TeamID'] == team1_id].iloc[0]
                        team2_stats = season_stats[season_stats['TeamID'] == team2_id].iloc[0]
                        
                        # Calculate differential features
                        matchup_features = {
                            'Season': season,
                            'Team1ID': team1_id,
                            'Team2ID': team2_id
                        }
                        
                        # Basic stat differences (exclude non-numeric and ID columns)
                        exclude_cols = ['Season', 'TeamID', 'ConfAbbrev']
                        numeric_cols = [col for col in team1_stats.index 
                                      if col not in exclude_cols and isinstance(team1_stats[col], (int, float))]
                        
                        for col in numeric_cols:
                            matchup_features[f'{col}_1'] = team1_stats[col]
                            matchup_features[f'{col}_2'] = team2_stats[col]
                            matchup_features[f'{col}_diff'] = team1_stats[col] - team2_stats[col]
                        
                        matchups.append(matchup_features)
                    
                    # Free memory regularly
                    if i % 100 == 0 and i > 0:
                        gc.collect()
                
                all_matchups.extend(matchups)
                
                # Clean up to save memory
                del matchups, season_stats
                gc.collect()
            
            m_matchups = pd.DataFrame(all_matchups)
            
            # Save to avoid recomputing
            m_matchups.to_csv(f"{output_dir}m_matchups.csv", index=False)
            print(f"Saved men's matchups to {output_dir}m_matchups.csv")
        
        print("Preparing men's training data...")
        
        # Get game results
        game_results = []
        for _, row in m_all_results.iterrows():
            season = row['Season']
            w_team = row['WTeamID']
            l_team = row['LTeamID']
            
            # Ensure Team1 is always the lower ID
            team1 = min(w_team, l_team)
            team2 = max(w_team, l_team)
            
            game_results.append({
                'Season': season,
                'Team1ID': team1,
                'Team2ID': team2,
                'Team1Win': 1 if team1 == w_team else 0
            })
        
        game_results_df = pd.DataFrame(game_results)
        
        # Merge game results with matchup features
        print("Merging game results with features...")
        m_training_data = pd.merge(
            m_matchups,
            game_results_df,
            on=['Season', 'Team1ID', 'Team2ID'],
            how='inner'
        )
        
        # Split features and target
        m_X = m_training_data.drop(['Season', 'Team1ID', 'Team2ID', 'Team1Win'], axis=1)
        m_y = m_training_data['Team1Win']
        
        # Handle NaN values
        m_X = m_X.fillna(0)
        
        print(f"Training data shape: {m_X.shape}")
        
        # Train men's XGBoost models
        m_xgb_models, m_xgb_scores = train_cv_model(m_X, m_y, model_type='xgb')
        
        # Try LightGBM training with modified code
        try:
            print("\nAttempting LightGBM training with modified code...")
            m_lgb_models, m_lgb_scores = train_cv_model(m_X, m_y, model_type='lgb')
            
            # Create men's ensemble if both models worked
            m_ensemble = create_ensemble(m_xgb_models, m_lgb_models, m_xgb_scores, m_lgb_scores)
            print("Successfully created ensemble model with XGBoost and LightGBM")
        except Exception as e:
            print(f"LightGBM training failed: {e}")
            print("Proceeding with XGBoost model only")
            
            # If LightGBM fails, use only XGBoost predictions
            def m_ensemble(X_pred):
                X_pred = X_pred.fillna(0)
                X_pred = X_pred.astype('float32')
                
                # Average predictions from all XGBoost models
                xgb_preds = np.zeros(len(X_pred))
                for model in m_xgb_models:
                    xgb_preds += model.predict_proba(X_pred)[:, 1]
                
                return xgb_preds / len(m_xgb_models)
        
        # Generate men's predictions
        print("\n---------- Generating Men's Predictions ----------")
        m_season_stats = m_team_stats[m_team_stats['Season'] == 2025]
        
        # Create matchup features for 2025
        print("Creating matchup features for 2025...")
        teams = m_season_stats['TeamID'].unique()
        total_matchups = len(teams) * (len(teams) - 1) // 2
        print(f"Creating {total_matchups} matchups for {len(teams)} teams")
        
        m_2025_matchups = []
        for i, team1_id in enumerate(teams):
            for team2_id in teams[i+1:]:
                team1_stats = m_season_stats[m_season_stats['TeamID'] == team1_id].iloc[0]
                team2_stats = m_season_stats[m_season_stats['TeamID'] == team2_id].iloc[0]
                
                # Calculate differential features
                matchup_features = {
                    'Season': 2025,
                    'Team1ID': team1_id,
                    'Team2ID': team2_id
                }
                
                # Basic stat differences (exclude non-numeric and ID columns)
                exclude_cols = ['Season', 'TeamID', 'ConfAbbrev']
                numeric_cols = [col for col in team1_stats.index 
                               if col not in exclude_cols and isinstance(team1_stats[col], (int, float))]
                
                for col in numeric_cols:
                    matchup_features[f'{col}_1'] = team1_stats[col]
                    matchup_features[f'{col}_2'] = team2_stats[col]
                    matchup_features[f'{col}_diff'] = team1_stats[col] - team2_stats[col]
                
                m_2025_matchups.append(matchup_features)
            
            # Free memory regularly
            if i % 100 == 0 and i > 0:
                gc.collect()
        
        m_2025_matchups_df = pd.DataFrame(m_2025_matchups)
        
        # Prepare features for prediction
        X_pred = m_2025_matchups_df.drop(['Season', 'Team1ID', 'Team2ID'], axis=1)
        X_pred = X_pred.fillna(0)
        
        # Make predictions in batches to manage memory
        batch_size = 10000
        predictions = []
        total_batches = (len(X_pred) + batch_size - 1) // batch_size
        
        for i in range(0, len(X_pred), batch_size):
            batch_end = min(i + batch_size, len(X_pred))
            print(f"Processing batch {i//batch_size + 1}/{total_batches}")
            
            batch = X_pred.iloc[i:batch_end]
            batch_preds = m_ensemble(batch)
            predictions.extend(batch_preds)
        
        # Create submission format
        m_predictions = pd.DataFrame({
            'ID': m_2025_matchups_df.apply(
                lambda x: f"{x['Season']}_{x['Team1ID']}_{x['Team2ID']}", axis=1
            ),
            'Pred': predictions
        })
        
        # Generate women's predictions using XGBoost only
        print("\n---------- Training Women's Model & Generating Predictions ----------")
        # Create matchup features only for 2025 (for submission)
        w_season_stats = w_team_stats[w_team_stats['Season'] == 2025]
        
        # Create matchup features for 2025
        print("Creating matchup features for 2025...")
        teams = w_season_stats['TeamID'].unique()
        total_matchups = len(teams) * (len(teams) - 1) // 2
        print(f"Creating {total_matchups} matchups for {len(teams)} teams")
        
        w_2025_matchups = []
        for i, team1_id in enumerate(teams):
            for team2_id in teams[i+1:]:
                team1_stats = w_season_stats[w_season_stats['TeamID'] == team1_id].iloc[0]
                team2_stats = w_season_stats[w_season_stats['TeamID'] == team2_id].iloc[0]
                
                # Calculate differential features
                matchup_features = {
                    'Season': 2025,
                    'Team1ID': team1_id,
                    'Team2ID': team2_id
                }
                
                # Basic stat differences (exclude non-numeric and ID columns)
                exclude_cols = ['Season', 'TeamID', 'ConfAbbrev']
                numeric_cols = [col for col in team1_stats.index 
                               if col not in exclude_cols and isinstance(team1_stats[col], (int, float))]
                
                for col in numeric_cols:
                    matchup_features[f'{col}_1'] = team1_stats[col]
                    matchup_features[f'{col}_2'] = team2_stats[col]
                    matchup_features[f'{col}_diff'] = team1_stats[col] - team2_stats[col]
                
                w_2025_matchups.append(matchup_features)
            
            # Free memory regularly
            if i % 100 == 0 and i > 0:
                gc.collect()
        
        w_2025_matchups_df = pd.DataFrame(w_2025_matchups)
        
        # Use a constant prediction of 0.5 for women's games (baseline)
        # You can improve this later by training a model on women's data
        w_predictions = pd.DataFrame({
            'ID': w_2025_matchups_df.apply(
                lambda x: f"{x['Season']}_{x['Team1ID']}_{x['Team2ID']}", axis=1
            ),
            'Pred': [0.5] * len(w_2025_matchups_df)
        })
        
        # Combine and save predictions
        all_predictions = pd.concat([m_predictions, w_predictions])
        all_predictions.to_csv(f"{output_dir}submission.csv", index=False)
        
        print(f"\nPredictions saved to {output_dir}submission.csv")
        print(f"Total execution time: {(time.time() - start_time) / 60:.2f} minutes")
        
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()
        print("Please check your data and try again.")

def create_ensemble(xgb_models, lgb_models, xgb_scores, lgb_scores):
    """Create a weighted ensemble of multiple models"""
    # Convert scores to weights (lower score = better model = higher weight)
    def scores_to_weights(scores):
        scores = np.array(scores)
        # Invert scores (lower is better) and normalize
        weights = 1 / (scores + 1e-10)  # Add small epsilon to avoid division by zero
        return weights / weights.sum()
    
    xgb_weights = scores_to_weights(xgb_scores)
    lgb_weights = scores_to_weights(lgb_scores)
    
    # Normalize weights between model types
    xgb_mean_score = np.mean(xgb_scores)
    lgb_mean_score = np.mean(lgb_scores)
    
    total_weight = 1 / (xgb_mean_score + 1e-10) + 1 / (lgb_mean_score + 1e-10)
    xgb_overall_weight = (1 / (xgb_mean_score + 1e-10)) / total_weight
    lgb_overall_weight = (1 / (lgb_mean_score + 1e-10)) / total_weight
    
    print(f"Ensemble weights - XGB: {xgb_overall_weight:.2f}, LGB: {lgb_overall_weight:.2f}")
    
    # Create ensemble prediction function
    def ensemble_predict(X_pred):
        # Handle NaN values
        X_pred = X_pred.fillna(0)
        X_pred = X_pred.astype('float32')
        
        # Get predictions from each model
        xgb_preds = np.zeros(len(X_pred))
        for i, model in enumerate(xgb_models):
            xgb_preds += xgb_weights[i] * model.predict_proba(X_pred)[:, 1]
        
        lgb_preds = np.zeros(len(X_pred))
        for i, model in enumerate(lgb_models):
            lgb_preds += lgb_weights[i] * model.predict_proba(X_pred)[:, 1]
        
        # Weighted ensemble
        return xgb_overall_weight * xgb_preds + lgb_overall_weight * lgb_preds
    
    return ensemble_predict

if __name__ == "__main__":
    main()

Restarting March Madness prediction with XGBoost only

---------- Loading Pre-processed Stats ----------
Loaded men's team stats with 13388 rows
Loaded women's team stats with 9488 rows
Loading minimal data files needed for prediction...
Loading m_reg_results...
Loading w_reg_results...
Loading m_tourney_results...
Loading w_tourney_results...

---------- Training Men's Model ----------
Checking for pre-computed men's matchups...
Creating matchup features for training...
Creating matchups for season 1985
Creating 39621 matchups for 282 teams
Creating matchups for season 1986
Creating 39903 matchups for 283 teams
Creating matchups for season 1987
Creating 41905 matchups for 290 teams
Creating matchups for season 1988
Creating 41905 matchups for 290 teams
Creating matchups for season 1989
Creating 42778 matchups for 293 teams
Creating matchups for season 1990
Creating 42486 matchups for 292 teams
Creating matchups for season 1991
Creating 43365 matchups for 295 teams
Creating matchups fo

In [17]:
import pandas as pd
import numpy as np

# Load predictions
predictions = pd.read_csv('submission.csv')

# Print a sample to understand the format
print("Sample of the submission file:")
print(predictions.head())

# Extract parts more safely
print("\nExtracting ID parts...")
# First, let's look at what we're dealing with
print("Sample IDs:", predictions['ID'].head().tolist())

# Split the ID string and handle potential decimal points
id_parts = predictions['ID'].str.split('_', expand=True)
predictions['Season'] = id_parts[0].str.replace('.0', '').astype(str)
predictions['Team1ID'] = pd.to_numeric(id_parts[1].str.replace('.0', ''))
predictions['Team2ID'] = pd.to_numeric(id_parts[2].str.replace('.0', ''))

print("\nAfter extraction:")
print(predictions[['ID', 'Season', 'Team1ID', 'Team2ID']].head())

# Show raw predictions
print("\nTop 10 most confident predictions:")
predictions['Confidence'] = predictions['Pred'].apply(lambda x: max(x, 1-x))
sorted_preds = predictions[['ID', 'Pred', 'Team1ID', 'Team2ID', 'Confidence']].sort_values('Confidence', ascending=False)
print(sorted_preds.head(10))

# If you want to try loading team names again
try:
    print("\nAttempting to load team names...")
    # Try different potential paths
    paths = [
        './data/MTeams.csv',
        'MTeams.csv',
        '../data/MTeams.csv',
        'data/MTeams.csv'
    ]
    
    m_teams_loaded = False
    w_teams_loaded = False
    
    for path in paths:
        try:
            base_path = path.replace('MTeams.csv', '')
            m_teams = pd.read_csv(f"{base_path}MTeams.csv")
            w_teams = pd.read_csv(f"{base_path}WTeams.csv")
            m_teams_loaded = True
            w_teams_loaded = True
            print(f"Found teams files in {base_path}")
            break
        except FileNotFoundError:
            continue
    
    if m_teams_loaded and w_teams_loaded:
        # Continue with team name matching
        teams = pd.concat([m_teams, w_teams])
        print(f"Loaded {len(teams)} teams")
        
        # Add team names
        predictions = predictions.merge(teams[['TeamID', 'TeamName']], 
                                     left_on='Team1ID', right_on='TeamID', 
                                     how='left').drop('TeamID', axis=1)
        predictions = predictions.merge(teams[['TeamID', 'TeamName']], 
                                     left_on='Team2ID', right_on='TeamID', 
                                     how='left').drop('TeamID', axis=1)
        predictions.rename(columns={'TeamName_x': 'Team1Name', 'TeamName_y': 'Team2Name'}, inplace=True)
        
        # Add predicted winner
        predictions['PredictedWinner'] = predictions.apply(
            lambda row: row['Team1Name'] if row['Pred'] > 0.5 else row['Team2Name'], axis=1
        )
        predictions['WinProbability'] = predictions.apply(
            lambda row: row['Pred'] if row['Pred'] > 0.5 else 1 - row['Pred'], axis=1
        )
        
        print("\nSample with team names:")
        print(predictions[['Team1Name', 'Team2Name', 'PredictedWinner', 'WinProbability']].head(10))
    else:
        print("Could not find team files. Displaying IDs only.")
except Exception as e:
    print(f"Error loading team names: {e}")
    print("Continuing with team IDs only")

# Print most predictable winners
print("\nMost confident predictions:")
print("Team1ID beats Team2ID with probability Pred")
for _, row in sorted_preds.head(20).iterrows():
    team1, team2 = int(row['Team1ID']), int(row['Team2ID'])
    prob = row['Pred']
    winner = f"Team {team1}" if prob > 0.5 else f"Team {team2}"
    win_prob = prob if prob > 0.5 else 1-prob
    print(f"{winner} wins: {team1} vs {team2} with {win_prob:.4f} probability")

Sample of the submission file:
                     ID      Pred
0  2025.0_1101.0_1102.0  0.775488
1  2025.0_1101.0_1103.0  0.098356
2  2025.0_1101.0_1104.0  0.067566
3  2025.0_1101.0_1105.0  0.886494
4  2025.0_1101.0_1106.0  0.631665

Extracting ID parts...
Sample IDs: ['2025.0_1101.0_1102.0', '2025.0_1101.0_1103.0', '2025.0_1101.0_1104.0', '2025.0_1101.0_1105.0', '2025.0_1101.0_1106.0']

After extraction:
                     ID Season  Team1ID  Team2ID
0  2025.0_1101.0_1102.0   2025     1101     1102
1  2025.0_1101.0_1103.0   2025     1101     1103
2  2025.0_1101.0_1104.0   2025     1101     1104
3  2025.0_1101.0_1105.0   2025     1101     1105
4  2025.0_1101.0_1106.0   2025     1101     1106

Top 10 most confident predictions:
                         ID      Pred  Team1ID  Team2ID  Confidence
16215  2025.0_1154.0_1397.0  0.065607     1154     1397    0.934393
6080   2025.0_1120.0_1170.0  0.934357     1120     1170    0.934357
6212   2025.0_1120.0_1306.0  0.934350     1120     1306

In [18]:
# Count which teams win most often with high confidence
team_win_counts = {}
high_conf_threshold = 0.85  # Only count high confidence wins

for _, row in predictions.iterrows():
    team1 = row['Team1ID']
    team2 = row['Team2ID']
    pred = row['Pred']
    
    if pred > 0.5 and pred > high_conf_threshold:
        winner = team1
    elif pred < 0.5 and (1-pred) > high_conf_threshold:
        winner = team2
    else:
        continue  # Skip low confidence matchups
    
    if winner in team_win_counts:
        team_win_counts[winner] += 1
    else:
        team_win_counts[winner] = 1

# Convert to DataFrame and get top teams
win_counts_df = pd.DataFrame({
    'TeamID': list(team_win_counts.keys()),
    'HighConfWins': list(team_win_counts.values())
}).sort_values('HighConfWins', ascending=False)

# Get team names
top_teams = win_counts_df.merge(teams[['TeamID', 'TeamName']], on='TeamID', how='left')
print("\nTop teams (most likely tournament winners):")
print(top_teams.head(10))


Top teams (most likely tournament winners):
   TeamID  HighConfWins     TeamName
0    1120           351       Auburn
1    1196           337      Florida
2    1104           334      Alabama
3    1181           330         Duke
4    1222           325      Houston
5    1397           322    Tennessee
6    1246           316     Kentucky
7    1385           314    St John's
8    1277           311  Michigan St
9    1279           308  Mississippi
