In [200]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import collections

In [153]:
# read in the cleaned data
data = pd.read_csv("./data/match_data_cleaned.csv")
data.head()

Unnamed: 0,Match ID,Assists,Assists Turnover Ratio,Bench Points,Biggest Lead,Biggest Scoring Run,Blocks,Blocks Received,Defensive Points Per Possession,Defensive Rating,...,Transition Defence,Transition Offence,Turnovers,Turnovers Team,Two Pointers Attempted,Two Pointers Made,Two Pointers Percentage,Team ID,Team Name,Win
0,1007557,20,1.538462,8,23,0,3,0,0.0,0.0,...,0.0,2.333333,13,1,26,13,0.5,103674,Hampton,1
1,1007557,15,2.5,27,1,0,0,3,0.0,0.0,...,0.0,0.0,6,0,25,14,0.56,103826,Longwood,0
2,1007560,29,2.64,34,84,22,8,1,0.55,54.74,...,0.86,2.87,11,0,52,33,0.63,103549,Duke,1
3,1007560,10,0.42,26,0,4,1,8,1.51,150.55,...,2.87,0.86,24,1,45,13,0.29,103602,Ferris St.,0
4,1007570,10,0.91,36,5,0,1,1,0.0,0.0,...,0.0,0.0,11,1,43,23,0.53,103364,Boston U.,1


In [181]:
# Build the model
features = [
    "Assists",
    "Assists Turnover Ratio",
    "Bench Points",
    "Biggest Lead",
    "Blocks",
    "Blocks Received",
    "Defensive Points Per Possession",
    "Fast Break Points Percentage",
    "Field Goals Percentage",
    "Fouls Total",
    "Free Throws Percentage",
    "Lead Changes",
    "Offensive Points Per Possession",
    "Points",
    "Points Against",
    "Rebounds Defensive",
    "Rebounds Offensive",
    "Rebounds Total",
    "Second Chance Points Percentage",
    "Steals",
    "Three Pointers Percentage",
    "Turnovers",
    "Two Pointers Percentage",
]
target = "Win"
data[features] = data[features].fillna(data[features].mean())
X = data[features]
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
)

# Initialize the RandomForest Classifier
model = LinearRegression()
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Since we're using Linear Regression inappropriately for classification, we can round predictions
# to retrieve binary outcomes: 0 for lose, 1 for win
y_pred_binary = np.round(y_pred)

# Calculate the accuracy manually (since we're using regression for classification)
accuracy = np.mean(y_pred_binary == y_test)
print(f"Accuracy: {accuracy:.2f}")

# Mean Squared Error
mse = mean_squared_error(y_test, y_pred_binary)
print(f"Mean Squared Error: {mse:.2f}")

Accuracy: 0.94
Mean Squared Error: 718546015483623680.00


In [174]:
# map the team name to the team ID
team_mapping = dict(zip(data["Team ID"], data["Team Name"]))

team_avg = data.groupby("Team ID")[features].mean()

team_avg["Team Name"] = team_avg.index.map(team_mapping)

In [184]:
# Predicting the outcome of a match
try:
    team_name1 = input("Enter a team name: ")
    input_row1 = team_avg[team_avg["Team Name"] == team_name1]
    prediction1 = model.predict(input_row1[features])
    team_name2 = input("Enter another team name: ")
    input_row2 = team_avg[team_avg["Team Name"] == team_name2]
    prediction2 = model.predict(input_row2[features])
    print(
        f"The winrate of {team_name1} {prediction1[0]}. The win rate of {team_name2} {prediction2[0]}"
    )
    total = prediction1[0] + prediction2[0]
    if prediction1[0] > prediction2[0]:
        print(
            f"{team_name1} is more likely to win. With a win percentage of {prediction1[0]/total}"
        )
    else:
        print(
            f"{team_name2} is more likely to win. With a win percentage of {prediction2[0]/total}"
        )
except Exception as e:
    print("Team not found")
    print(e)

The winrate of UVA Wise 0.39021012129645893. The win rate of Duke 0.778072442517119
Duke is more likely to win. With a win percentage of [0.66599679]


In [193]:
# Calculate the probability of a team making it to the next round
try:
    team_name = input("Enter a team name: ")
    input_row = team_avg[team_avg["Team Name"] == team_name]
    prediction = model.predict(input_row[features])
    print(
        f"""Probability of {team_name} making it to Round of 64 {prediction[0]}
Probability of {team_name} making it to Round of 32 {pow(prediction[0], 2)}
Probability of {team_name} making it to Sweet 16 {pow(prediction[0], 3)}
Probability of {team_name} making it to Elite 8 {pow(prediction[0], 4)}
Probability of {team_name} making it to Final 4 {pow(prediction[0], 5)}
Probability of {team_name} making it to Championship {pow(prediction[0], 6)}
Probability of {team_name} winning the Championship {pow(prediction[0], 7)}"""
    )
except Exception as e:
    print("Team not found")
    print(e)

Probability of Virginia Tech making it to Round of 64 0.6286472994509066
Probability of Virginia Tech making it to Round of 32 0.3951974271069178
Probability of Virginia Tech making it to Sweet 16 0.2484397953007104
Probability of Virginia Tech making it to Elite 8 0.1561810063919276
Probability of Virginia Tech making it to Final 4 0.09818276789381007
Probability of Virginia Tech making it to Championship 0.06172233188905888
Probability of Virginia Tech winning the Championship 0.03880157725786944


In [203]:
# Ask the user for the seeding of the teams Final 4
try:
    team_matches = collections.defaultdict(str)
    team_predictions = collections.defaultdict(float)

    team_name1 = input("Enter a team name (seed 1): ")
    input_row1 = team_avg[team_avg["Team Name"] == team_name1]
    prediction1 = model.predict(input_row1[features])
    team_predictions[team_name1] = prediction1[0]

    team_name2 = input("Enter another team name (seed 2): ")
    input_row2 = team_avg[team_avg["Team Name"] == team_name2]
    prediction2 = model.predict(input_row2[features])
    team_predictions[team_name2] = prediction2[0]

    team_name3 = input("Enter another team name (seed 3): ")
    input_row3 = team_avg[team_avg["Team Name"] == team_name3]
    prediction3 = model.predict(input_row3[features])
    team_predictions[team_name3] = prediction3[0]

    team_name4 = input("Enter another team name (seed 4): ")
    input_row4 = team_avg[team_avg["Team Name"] == team_name4]
    prediction4 = model.predict(input_row4[features])
    team_predictions[team_name4] = prediction4[0]

    print(
        f"""Probability of {team_name1} (seed 1) winning against {team_name4} (seed 4) {prediction1[0]/(prediction1[0]+prediction4[0])}"""
    )
    team_name14 = input(
        f"Enter in the winner of the previous match: {team_name1} (seed 1) vs {team_name4} (seed 4)"
    )

    team_predictions[team_name14] = (
        prediction4 if team_name14 == team_name1 else prediction1
    )
    input_row14 = team_avg[team_avg["Team Name"] == team_name14]
    prediction14 = model.predict(input_row14[features])

    print(
        f"""Probability of {team_name2} (seed 2) winning against {team_name3} (seed 3) {prediction2[0]/(prediction2[0]+prediction3[0])}"""
    )
    team_name23 = input(
        f"Enter in the winner of the previous match: {team_name2} (seed 2) vs {team_name3} (seed 3)"
    )
    input_row23 = team_avg[team_avg["Team Name"] == team_name23]
    prediction23 = model.predict(input_row23[features])
    team_predictions[team_name23] = (
        prediction3 if team_name23 == team_name2 else prediction2
    )
    print(
        f"""Probability of {team_name14} winning against {team_name23} {prediction14[0]/(prediction14[0]+prediction23[0])}"""
    )
    team_name_final = input(
        f"Enter in the winner of the previous match: {team_name14} vs {team_name23}"
    )
    input_row_final = team_avg[team_avg["Team Name"] == team_name_final]
    prediction_final = model.predict(input_row_final[features])

    print(
        f"""The probability of {team_name_final} winning the championship is {(prediction_final[0]/(prediction14[0]+prediction23[0]) * prediction_final[0]/(prediction_final[0] + team_predictions[team_name_final]))[0]}"""
    )


except Exception as e:
    print("Team not found")
    print(e)

Probability of Virginia Tech (seed 1) winning against Purdue (seed 4) 0.4742625171101881
Probability of Duke (seed 2) winning against UConn (seed 3) 0.5294230319121791
Probability of Virginia Tech winning against Duke 0.44688880144058973
The probability of Virginia Tech winning the championship is 0.21194260783956914
