<a href="https://colab.research.google.com/github/KyPython/March-Machine-Learning-Mania/blob/main/March_Machine_Learning_Mania.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Model training

Logistic Regression

In [8]:
from google.colab import drive
drive.mount('/content/drive') # No need to remount

import pandas as pd
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split  # Import for data splitting


def predict_tournament_results(X_train, y_train, X_test, team_ids):
    """
    Predicts the results of the tournament based on the provided data and model.

    Args:
        X_train (pd.DataFrame): Training data features.
        y_train (pd.Series): Training data target.
        X_test (pd.DataFrame): Test data features.
        team_ids (list): List of team IDs.

    Returns:
        dict: A dictionary where keys are team IDs (int) and values are
              predicted probabilities (float) of winning against any other team.
    """

    # Create a pipeline with scaling and logistic regression
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('logistic', LogisticRegression(solver='liblinear', penalty='l2'))
    ])

    # Calibrate the model
    # Inside the predict_tournament_results function:
calibrated_model = CalibratedClassifierCV(pipeline, method='isotonic', cv=3) # Reduced cv to 3

    # Train the calibrated model
    calibrated_model.fit(X_train, y_train)

    # Predict probabilities for test data
    probabilities = calibrated_model.predict_proba(X_test)[:, 1]

    # Create a dictionary of team win probabilities
    team_win_probs = dict(zip(team_ids, probabilities))

    return team_win_probs


# Load data for both Men's and Women's tournaments
mens_regular_season_path = '/content/drive/MyDrive/March_Madness/MRegularSeasonDetailedResults.csv'
mens_data = pd.read_csv(mens_regular_season_path)

womens_regular_season_path = '/content/drive/MyDrive/March_Madness/WRegularSeasonDetailedResults.csv'
womens_data = pd.read_csv(womens_regular_season_path)

# --- Perform feature engineering and create X, y for Men's ---
# Feature Engineering for Men's
mens_data['ScoreDiff'] = mens_data['WScore'] - mens_data['LScore']  # Create a feature: score difference
X_men = mens_data[['ScoreDiff']]  # Use score difference as a feature
y_men = mens_data['WTeamID']  # Target variable: winning team ID
X_train_men, X_test_men, y_train_men, y_test_men = train_test_split(X_men, y_men, test_size=0.2, random_state=42)  # Split data
team_ids_men = mens_data['WTeamID'].unique()  # Get unique team IDs

# --- Perform feature engineering and create X, y for Women's ---
# Feature Engineering for Women's
womens_data['ScoreDiff'] = womens_data['WScore'] - womens_data['LScore']  # Create a feature: score difference
X_women = womens_data[['ScoreDiff']]  # Use score difference as a feature
y_women = womens_data['WTeamID']  # Target variable: winning team ID
X_train_women, X_test_women, y_train_women, y_test_women = train_test_split(X_women, y_women, test_size=0.2, random_state=42)  # Split data
team_ids_women = womens_data['WTeamID'].unique()  # Get unique team IDs

# Train models and get predictions
team_win_probs_men = predict_tournament_results(X_train_men, y_train_men, X_test_men, team_ids_men)
team_win_probs_women = predict_tournament_results(X_train_women, y_train_women, X_test_women, team_ids_women)


def create_submission_file_2025(predictions_dict_men, predictions_dict_women, output_file='submission.csv', season=2025):
    """
    Creates a submission file for the 2025 NCAA tournament prediction competition
    for both Men's and Women's tournaments.

    Args:
        predictions_dict_men (dict): Predictions for Men's games.
        predictions_dict_women (dict): Predictions for Women's games.
        output_file (str): Name of the output CSV file.
        season (int): The season year.
    """

    # Load MTeams.csv and WTeams.csv to get all team IDs
    mteams_path = '/content/drive/MyDrive/March_Madness/MTeams.csv'
    mteams_df = pd.read_csv(mteams_path)
    men_team_ids = mteams_df['TeamID'].unique()

    wteams_path = '/content/drive/MyDrive/March_Madness/WTeams.csv'
    wteams_df = pd.read_csv(wteams_path)
    women_team_ids = wteams_df['TeamID'].unique()

    # Generate all possible matchups for Men's and Women's
    men_matchups = list(itertools.combinations(men_team_ids, 2))
    women_matchups = list(itertools.combinations(women_team_ids, 2))

    # Create submission DataFrame
    submission_data = []

    # Add Men's predictions
    for team1, team2 in men_matchups:
        id_str = f"{season}_{min(team1, team2)}_{max(team1, team2)}"
        pred = predictions_dict_men.get(min(team1, team2), 0.5)
        submission_data.append([id_str, pred])

    # Add Women's predictions
    for team1, team2 in women_matchups:
        id_str = f"{season}_{min(team1, team2)}_{max(team1, team2)}"
        pred = predictions_dict_women.get(min(team1, team2), 0.5)
        submission_data.append([id_str, pred])

    submission_df = pd.DataFrame(submission_data, columns=['ID', 'Pred'])

    # Save to CSV
    submission_df.to_csv(output_file, index=False)

    print(f"Submission file saved to: {output_file}")

# Create submission file
create_submission_file_2025(team_win_probs_men, team_win_probs_women, output_file='submission_2025.csv')

IndentationError: unexpected indent (<ipython-input-8-e847bafdb6ba>, line 39)

Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split

# Assuming X and y are defined from your previous data loading and feature engineering
# For example, if using the Men's data:
# X = X_men
# y = y_men

# OR if using the Women's data:
# X = X_women
# y = y_women

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Calibrate the model
calibrated_rf = CalibratedClassifierCV(rf_model, method='isotonic', cv=5)

# Train the calibrated model
calibrated_rf.fit(X_train, y_train)  # Now X_train and y_train are defined

# Predict probabilities
probabilities = calibrated_rf.predict_proba(X_test)[:, 1]

NameError: name 'X' is not defined

Gradient Boosting Machines

In [6]:
import lightgbm as lgb
from sklearn.calibration import CalibratedClassifierCV

# Assuming X_train, y_train, X_test are defined

# Create a LightGBM model
lgbm_model = lgb.LGBMClassifier(objective='binary', random_state=42)

# Calibrate the model
calibrated_lgbm = CalibratedClassifierCV(lgbm_model, method='isotonic', cv=5)

# Train the calibrated model
calibrated_lgbm.fit(X_train, y_train)

# Predict probabilities
probabilities = calibrated_lgbm.predict_proba(X_test)[:, 1]

NameError: name 'X_train' is not defined

Grid Search CV (LightGBM)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import brier_score_loss

brier_scorer = make_scorer(brier_score_loss, greater_is_better=False) #Brier score loss must be minimized.

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, scoring=brier_scorer, cv=5)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

NameError: name 'X_train' is not defined

K-Fold Time Series Split

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import brier_score_loss
from sklearn.metrics import make_scorer
import numpy as np

# Assuming X, y are your features and target
tscv = TimeSeriesSplit(n_splits=5) # Adjust n_splits as needed
model = RandomForestClassifier(random_state=42)
brier_scorer = make_scorer(brier_score_loss, greater_is_better=False)
brier_scores = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)[:,1]
    brier = brier_score_loss(y_test, y_pred)
    brier_scores.append(brier)

print(f"Brier Scores: {brier_scores}")
print(f"Mean Brier Score: {np.mean(brier_scores)}")

NameError: name 'X' is not defined

Calibrated Classifier

In [7]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Assuming X, y are defined
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a base model (e.g., Random Forest)
base_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Calibrate the model using CalibratedClassifierCV
calibrated_model = CalibratedClassifierCV(base_model, method='isotonic', cv=5) #Or method='platt'

# Train the calibrated model
calibrated_model.fit(X_train, y_train)

# Predict calibrated probabilities
calibrated_probabilities = calibrated_model.predict_proba(X_test)[:, 1]

NameError: name 'X' is not defined

Submission

In [None]:
import pandas as pd
import itertools

def create_submission_file_2025(predictions_dict, output_file='submission.csv', season=2025):
    """
    Creates a submission file for the 2025 NCAA tournament prediction competition.

    Args:
        predictions_dict (dict): A dictionary where keys are team IDs (int) and
                                  values are predicted probabilities (float)
                                  of winning against any other team.
        output_file (str): Name of the output CSV file. Defaults to 'submission.csv'.
        season (int): The season year (defaults to 2025).
    """

    # Get all unique team IDs
    team_ids = list(predictions_dict.keys())

    # Generate all possible matchups
    matchups = list(itertools.combinations(team_ids, 2))

    # Create submission DataFrame
    submission_data = []
    for team1, team2 in matchups:
        # Create ID
        id_str = f"{season}_{min(team1, team2)}_{max(team1, team2)}"

        # Get prediction (probability of lower-ID team winning)
        pred = predictions_dict[min(team1, team2)]  # Replace with your actual prediction logic

        submission_data.append([id_str, pred])

    submission_df = pd.DataFrame(submission_data, columns=['ID', 'Pred'])

    # Save to CSV
    submission_df.to_csv(output_file, index=False)

    print(f"Submission file saved to: {output_file}")

# Example usage (replace with your actual prediction logic)
# Assume you have a dictionary 'team_win_probs' with predicted win probabilities for each team
team_win_probs = {
    1101: 0.6,
    1102: 0.55,
    1103: 0.48,
    # ... (add more teams and their win probabilities) ...
}

# Create the submission file
create_submission_file_2025(team_win_probs, output_file='submission_2025.csv')