In [1]:
#Import necessary libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import numpy as np

import os
# Collect the full path of every file under the Kaggle input directory,
# then print them one per line in the order os.walk yields them.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mens-machine-learning-competition-2018/Conferences.csv
/kaggle/input/mens-machine-learning-competition-2018/GameCities.csv
/kaggle/input/mens-machine-learning-competition-2018/MasseyOrdinals.csv
/kaggle/input/mens-machine-learning-competition-2018/Events_2010.csv
/kaggle/input/mens-machine-learning-competition-2018/Events_2011.csv
/kaggle/input/mens-machine-learning-competition-2018/TeamCoaches_Prelim2018.csv
/kaggle/input/mens-machine-learning-competition-2018/NCAATourneySlots_SampleTourney2018.csv
/kaggle/input/mens-machine-learning-competition-2018/Players_2012.csv
/kaggle/input/mens-machine-learning-competition-2018/Players_2017.csv
/kaggle/input/mens-machine-learning-competition-2018/RegularSeasonCompactResults.csv
/kaggle/input/mens-machine-learning-competition-2018/Events_2017.csv
/kaggle/input/mens-machine-learning-competition-2018/Events_2012.csv
/kaggle/input/mens-machine-learning-competition-2018/NCAATourneyDetailedResults.csv
/kaggle/input/mens-machine-learnin

In [2]:
# Preparing Data
# Small hand-written sample of 16 games, one row per game, including a
# 'home_court_advantage' flag alongside each team's rating.
NUM_GAMES = 16

data = {
    # Team ids are sequential: team A is 1..16, team B is 17..32.
    'team_a_id': list(range(1, NUM_GAMES + 1)),
    'team_b_id': list(range(NUM_GAMES + 1, 2 * NUM_GAMES + 1)),
    'team_a_rating': [
        85, 92, 78, 88, 95, 80, 89, 75,
        91, 84, 93, 79, 87, 90, 83, 96,
    ],
    'team_b_rating': [
        70, 85, 80, 90, 82, 75, 92, 70,
        88, 86, 77, 81, 84, 89, 76, 74,
    ],
    # 1 if team A is home, 0 otherwise
    'home_court_advantage': [
        1, 0, 0, 1, 1, 0, 1, 0,
        1, 0, 1, 0, 0, 1, 1, 0,
    ],
    # 1 if team A won, 0 otherwise
    'team_a_won': [
        1, 1, 0, 0, 1, 1, 0, 1,
        1, 0, 1, 0, 0, 1, 1, 1,
    ],
}
df = pd.DataFrame(data)
df

Unnamed: 0,team_a_id,team_b_id,team_a_rating,team_b_rating,home_court_advantage,team_a_won
0,1,17,85,70,1,1
1,2,18,92,85,0,1
2,3,19,78,80,0,0
3,4,20,88,90,1,0
4,5,21,95,82,1,1
5,6,22,80,75,0,1
6,7,23,89,92,1,0
7,8,24,75,70,0,1
8,9,25,91,88,1,1
9,10,26,84,86,0,0


In [3]:
# Showing Data
# Emit a heading, the plain-text rendering of the games table, and a divider,
# each on its own line.
print(
    "Original Data with Home Court Advantage:",
    df,
    "\n" + "="*50 + "\n",
    sep="\n",
)

Original Data with Home Court Advantage:
    team_a_id  team_b_id  team_a_rating  team_b_rating  home_court_advantage  \
0           1         17             85             70                     1   
1           2         18             92             85                     0   
2           3         19             78             80                     0   
3           4         20             88             90                     1   
4           5         21             95             82                     1   
5           6         22             80             75                     0   
6           7         23             89             92                     1   
7           8         24             75             70                     0   
8           9         25             91             88                     1   
9          10         26             84             86                     0   
10         11         27             93             77                     1   

In [4]:
# Feature Engineering 

# We now use both rating difference and home court advantage as features. 


# Collapse the two ratings into one signed gap (team A minus team B). Instead of
# giving the model two separate ratings and forcing it to learn how to compare
# them, it gets a single feature that directly represents the difference in
# strength.
df['rating_diff'] = df['team_a_rating'].sub(df['team_b_rating'])

# Model inputs are the rating gap and the home-court flag; the target is
# whether team A won.
feature_columns = ['rating_diff', 'home_court_advantage']
X = df[feature_columns]
y = df['team_a_won']



Using team_a_id and team_b_id directly as features is not good practice for this type of model. These IDs are arbitrary numbers used only to identify the teams; they have no inherent value or meaning that the model can learn from.
Instead, the model needs features that actually represent the teams' strengths and the context of the matchup. This is why the code uses features like rating_diff and home_court_advantage. These features give the model a quantitative measure of how the teams compare, allowing it to learn the patterns that lead to a win or a loss.

In [5]:
# Final modelling inputs: X holds the feature columns, y holds the outcome.
# Display both together as a tuple.

(X, y)

(    rating_diff  home_court_advantage
 0            15                     1
 1             7                     0
 2            -2                     0
 3            -2                     1
 4            13                     1
 5             5                     0
 6            -3                     1
 7             5                     0
 8             3                     1
 9            -2                     0
 10           16                     1
 11           -2                     0
 12            3                     0
 13            1                     1
 14            7                     1
 15           22                     0,
 0     1
 1     1
 2     0
 3     0
 4     1
 5     1
 6     0
 7     1
 8     1
 9     0
 10    1
 11    0
 12    0
 13    1
 14    1
 15    1
 Name: team_a_won, dtype: int64)

In [6]:
# Hold out 20% of the games for testing. The split is seeded so it is
# reproducible, and stratified on the outcome column.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Show all four pieces of the split.
(X_train, X_test, y_train, y_test)

(    rating_diff  home_court_advantage
 4            13                     1
 14            7                     1
 2            -2                     0
 8             3                     1
 0            15                     1
 6            -3                     1
 3            -2                     1
 15           22                     0
 13            1                     1
 12            3                     0
 1             7                     0
 5             5                     0,
     rating_diff  home_court_advantage
 11           -2                     0
 10           16                     1
 9            -2                     0
 7             5                     0,
 4     1
 14    1
 2     0
 8     1
 0     1
 6     0
 3     0
 15    1
 13    1
 12    0
 1     1
 5     1
 Name: team_a_won, dtype: int64,
 11    0
 10    1
 9     0
 7     1
 Name: team_a_won, dtype: int64)

In [7]:
# Train a Machine Learning Model ---
# Logistic Regression with default settings, fitted on the training split
# (fit() returns the fitted estimator, so construction and fitting are chained).
model = LogisticRegression().fit(X_train, y_train)

print("Model training complete.")
# One coefficient per feature column, in the column order of X_train.
print("Model coefficients:", model.coef_)
print("\n" + "="*50 + "\n")


Model training complete.
Model coefficients: [[0.66841342 0.58189486]]




> The purpose of these coefficients is to show the relationship between each feature and the predicted outcome. In this model, you would have two coefficients: one for rating_diff and one for home_court_advantage.

>A positive coefficient for rating_diff means that a larger rating difference increases the probability of Team A winning.

>A positive coefficient for home_court_advantage means that playing at home increases the probability of Team A winning.

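> The model fits these coefficients to make its predicted probabilities match the training outcomes as closely as possible. Each coefficient is the change in the log-odds of Team A winning for a one-unit increase in that feature. The magnitude of a coefficient tells you how much the feature influences the outcome, with larger values indicating a stronger influence, but magnitudes are only directly comparable between features on similar scales. Here rating_diff spans tens of points while home_court_advantage is only 0 or 1, so the two coefficients should not be compared at face value.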

In [8]:
# Make predictions and evaluate the model ---
# For competition-style scoring, log loss is often used.
test_probabilities = model.predict_proba(X_test)
# Column 1 holds the predicted probability of the positive class (Team A winning).
y_pred_proba = test_probabilities[:, 1]
logloss_score = log_loss(y_test, y_pred_proba)

print(f"Model Log Loss on Test Data: {logloss_score:.4f}")
print("\n" + "="*50 + "\n")

Model Log Loss on Test Data: 0.0650




> Based on the logloss_score of 0.0650 reported above, here is an interpretation of what that result means:

> Log loss is a common metric in machine learning competitions like this, where the goal is to predict probabilities. The score ranges from 0 to infinity, and a perfect model would have a log loss of 0. A higher score indicates a worse-performing model.

> A log loss score of 0.0650 is extremely low. This indicates that your model is performing exceptionally well on the test data. It is not only making correct predictions but is also doing so with high confidence, which is exactly what a competition like the March Madness one would reward. This score suggests that the features and model you've chosen are highly effective for this dataset. Keep in mind, however, that the test set here contains only four games from a small hand-made sample, so the score is a very noisy estimate and should not be expected to carry over to real tournament data.

In [9]:
# Function to simulate a tournament ---
def simulate_tournament(teams, model):
    """
    Simulates a single-elimination tournament bracket using the trained model.

    Teams are paired in list order (0 vs 1, 2 vs 3, ...). Each round's winners
    are paired the same way in the next round until one team remains.

    Parameters
    ----------
    teams : list of dict
        Each dict has an 'id' and a 'rating' key. The number of teams must be
        a power of two and at least 2, so that every round pairs up evenly.
    model : fitted classifier
        Must expose predict_proba() and accept a DataFrame with the columns
        'rating_diff' and 'home_court_advantage'. Column 1 of its output is
        read as the probability that the first team of the pair wins.

    Returns
    -------
    dict or list
        The champion's team dict, or an empty list when the number of teams
        is invalid (a message is printed in that case).
    """
    num_teams = len(teams)
    # An even count is not enough: 6 teams leave 3 winners after round one,
    # which cannot be paired. n & (n - 1) is zero only for powers of two.
    if num_teams < 2 or (num_teams & (num_teams - 1)) != 0:
        print("Invalid number of teams. Must be a power of 2.")
        return []

    print(f"Simulating a tournament with {num_teams} teams...")

    round_teams = list(teams)
    while len(round_teams) > 1:
        winners = []
        # Pair up adjacent teams for the current round
        for team_a, team_b in zip(round_teams[::2], round_teams[1::2]):
            # Predict the winner using our model (we assume no home-court advantage in a neutral tournament)
            rating_diff = team_a['rating'] - team_b['rating']
            matchup_features = pd.DataFrame({'rating_diff': [rating_diff], 'home_court_advantage': [0]})

            prob_a_wins = model.predict_proba(matchup_features)[0][1]

            # Team A advances only on a strict majority; an exact 0.5 goes to team B
            winner = team_a if prob_a_wins > 0.5 else team_b

            print(f"Matchup: Team {team_a['id']} (rating: {team_a['rating']}) vs Team {team_b['id']} (rating: {team_b['rating']})")
            print(f"  -> Predicted probability of Team {team_a['id']} winning: {prob_a_wins:.2f}")
            print(f"  -> Winner: Team {winner['id']}")

            winners.append(winner)

        round_teams = winners
        print("-" * 20)

    return round_teams[0]


In [10]:
# Example Tournament Simulation ---
# A small, 8-team tournament. Ratings are listed in bracket order and team ids
# are assigned 1..8 in that same order.
bracket_ratings = [95, 80, 90, 85, 75, 88, 92, 70]
tournament_teams = [
    {'id': team_id, 'rating': rating}
    for team_id, rating in enumerate(bracket_ratings, start=1)
]

champion = simulate_tournament(tournament_teams, model)
print(f"\nTournament Champion: Team {champion['id']} with a rating of {champion['rating']}")


Simulating a tournament with 8 teams...
Matchup: Team 1 (rating: 95) vs Team 2 (rating: 80)
  -> Predicted probability of Team 1 winning: 1.00
  -> Winner: Team 1
Matchup: Team 3 (rating: 90) vs Team 4 (rating: 85)
  -> Predicted probability of Team 3 winning: 0.88
  -> Winner: Team 3
Matchup: Team 5 (rating: 75) vs Team 6 (rating: 88)
  -> Predicted probability of Team 5 winning: 0.00
  -> Winner: Team 6
Matchup: Team 7 (rating: 92) vs Team 8 (rating: 70)
  -> Predicted probability of Team 7 winning: 1.00
  -> Winner: Team 7
--------------------
Matchup: Team 1 (rating: 95) vs Team 3 (rating: 90)
  -> Predicted probability of Team 1 winning: 0.88
  -> Winner: Team 1
Matchup: Team 6 (rating: 88) vs Team 7 (rating: 92)
  -> Predicted probability of Team 6 winning: 0.02
  -> Winner: Team 7
--------------------
Matchup: Team 1 (rating: 95) vs Team 7 (rating: 92)
  -> Predicted probability of Team 1 winning: 0.67
  -> Winner: Team 1
--------------------

Tournament Champion: Team 1 with a 