## Creation of a Basketball Prediction Model

In [74]:
#Laoding packeges into python
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the data from the uploaded file
data = pd.read_excel('FullData.xlsx')

# Select data from 2016
data['Date'] = pd.to_datetime(data['Date'])
data_2016 = data[data['Date'].dt.year >= 2016]

# 2. Remove rows with more than 8 missing values
data_2016 = data_2016.dropna(thresh=data_2016.shape[1] - 8)
data_2016 = data_2016.dropna(subset=['Home score', 'Away score'])
data_20161 = data_2016

# 3. Fill the missing values in the 'Timeout' columns with the mean timeout
data_2016['Home Timeouts'].fillna(data_2016['Home Timeouts'].mean(), inplace=True)
data_2016['Away Timeouts'].fillna(data_2016['Away Timeouts'].mean(), inplace=True)

# Sorting data by date for correct calculation
data_2016 = data_2016.sort_values(by='Date')

def extract_values_from_string(s):
    '''Extracts score, attempts, and proportion from a string of the format '19/29 (66%)'.'''
    score, attempts = map(int, s.split(' ')[0].split('/'))
    proportion = int(s.split('(')[1].split('%')[0]) / 100
    return score, attempts, proportion

# List of columns to extract values from
columns_to_extract = ['Home Free throws', 'Away Free throws', 'Home 2 pointers', 'Away 2 pointers',
                      'Home 3 pointers', 'Away 3 pointers', 'Home Field goals', 'Away Field goals']

for col in columns_to_extract:
    data_2016[col + ' Score'], data_2016[col + ' Attempts'], data_2016[col + ' Proportion'] = zip(*data_2016[col].map(extract_values_from_string))
    data_2016.drop(col, axis=1, inplace=True)

# Compute the pace for each game
data_2016['Pace'] = (data_2016['Home Field goals Attempts'] + data_2016['Away Field goals Attempts'] -
                    (data_2016['Home Offensive rebounds'] + data_2016['Away Offensive rebounds']) +
                    (data_2016['Home Turnovers'] + data_2016['Away Turnovers']) +
                    0.4 * (data_2016['Home Free throws Attempts'] + data_2016['Away Free throws Attempts'])) / 2



# Initialize ELO ratings
initial_elo = 1550
# Define the league average time
league_averag = 60

teams = set(data_2016['Home team']).union(set(data_2016['Away team']))

# Function to calculate the expected outcome of a match
def expected_outcome(rating1, rating2):
    return 1 / (1 + 10**((rating2 - rating1) / 400))

def mov_multiplier(margin, R_winning_team, R_losing_team):
    """Calculate Margin of Victory Multiplier."""
    return np.log(abs(margin) + 1) * (2.2 / (2.2 + (R_winning_team - R_losing_team) / 400))

import math

# Initialize Elo ratings
initial_elo = 1550
teams = data_2016['Home team'].unique()
elo_ratings = {team: initial_elo for team in teams}

# Update the ELO update function to include MOV, home ground advantage, game pace, and seasonality
def update_elo_with_all_adjustments(home_team, away_team, home_score, away_score, pace, k=10, home_ground_advantage=50):
    # Calculate expected outcomes
    expected_home = expected_outcome(elo_ratings[home_team] , elo_ratings[away_team] + home_ground_advantage)
    expected_away = 1 - expected_home

    # Calculate actual outcomes and margin of victory
    margin = (abs(home_score - away_score) * league_averag) / pace
    if home_score > away_score:
        actual_home = 1
        actual_away = 0
        mov = mov_multiplier(margin, elo_ratings[home_team], elo_ratings[away_team])
    elif home_score < away_score:
        actual_home = 0
        actual_away = 1
        mov = mov_multiplier(margin, elo_ratings[away_team], elo_ratings[home_team])
    else:
        actual_home = 0.5
        actual_away = 0.5
        mov = 1

    # Update ratings and return them
    new_home_elo = elo_ratings[home_team] + k * mov * (actual_home - expected_home)
    new_away_elo = elo_ratings[away_team] + k * mov * (actual_away - expected_away)
    elo_ratings[home_team] = new_home_elo
    elo_ratings[away_team] = new_away_elo
    return new_home_elo, new_away_elo

# Create columns to store Current ELO ratings for home and away teams after the game
data_2016['Home ELO'] = 0
data_2016['Away ELO'] = 0
# Create columns to store Previous ELO ratings for home and away teams before the game

# Update ELO ratings for each game in the dataset and record previous ELO
for index, row in data_2016.iterrows():
    # Store previous ELO ratings
    data_2016.at[index, 'Previous Home ELO'] = elo_ratings[row['Home team']]
    data_2016.at[index, 'Previous Away ELO'] = elo_ratings[row['Away team']]
    
    # Update ELO ratings
    home_elo, away_elo = update_elo_with_all_adjustments(row['Home team'], row['Away team'], row['Home score'], 
                                                         row['Away score'], row['Pace'])
    data_2016.at[index, 'Home ELO'] = home_elo
    data_2016.at[index, 'Away ELO'] = away_elo

# Apply seasonality adjustment at the end of the season
regression_factor = 1/3
mean_elo = np.mean(list(elo_ratings.values()))
for team in elo_ratings:
    elo_ratings[team] = elo_ratings[team] * (1 - regression_factor) + mean_elo * regression_factor
    
    
# Adding a rolling average for 'Biggest lead'

# Calculating a rolling average for biggset lead
window_size = 3  # Setting a window size of 3 games as an example
    
# Calculating a rolling sum for the 'Home Biggest lead' grouped by 'Home team'
rolling_sum_home = data_2016.groupby('Home team')['Home Biggest lead'].rolling(window=window_size).sum().reset_index(level=0, drop=True)
data_2016['Home Previous Biggest lead Rolling'] = (rolling_sum_home - data_2016['Home Biggest lead'])

# Calculating a rolling sum for the 'Away Biggest lead' grouped by 'Away team'
rolling_sum_away = data_2016.groupby('Away team')['Away Biggest lead'].rolling(window=window_size).sum().reset_index(level=0, drop=True)
data_2016['Away Previous Biggest lead Rolling'] = (rolling_sum_away - data_2016['Away Biggest lead'])

#data_2016['Previous Home ELO'] = data_2016.groupby('Home team')['Previous Home ELO'].rolling(window=window_size).sum().reset_index(level=0, drop=True)
#data_2016['Previous Away ELO'] = data_2016.groupby('Away team')['Previous Away ELO'].rolling(window=window_size).sum().reset_index(level=0, drop=True)
    
data_2016 = data_2016.fillna(0,inplace=False)
#data_2016 = data_2016.dropna()


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
# Features for the model
features = ['Previous Home ELO', 'Previous Away ELO','Home Previous Biggest lead Rolling',
            'Away Previous Biggest lead Rolling']

# Extracting features and target variable
X = data_2016[features]
y = data_2016['Home score'] > data_2016['Away score']

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.269, random_state=32)

# Hyperparameters for grid search
param_grid = {
    'C': np.logspace(-4, 4, 20),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# Grid search with 5-fold cross validation
grid_search = GridSearchCV(LogisticRegression(), param_grid, scoring='neg_log_loss', cv=5)
grid_search.fit(X_train, y_train)

# Using best estimator from grid search for predictions
best_model = grid_search.best_estimator_
y_pred_probs_best = best_model.predict_proba(X_test)

# Calculating log loss with best model
#best_loss = log_loss(y_test, y_pred_probs_best)

# Predicting win outcomes on entire dataset using best model
data_2016['Home Win'] = (data_2016['Home score'] > data_2016['Away score']).astype(int)
data_2016['Away Win'] = (data_2016['Home score'] < data_2016['Away score']).astype(int)

# Storing win probabilities for each team
data_2016['Home Win Probability'] = best_model.predict_proba(X)[:, 1]
data_2016['Away Win Probability'] = best_model.predict_proba(X)[:, 0]

# Calculating log loss for each game and storing in a column
#data_2016['Log Loss'] = - (y * np.log(data_2016['Home Win Probability']) + (1 - y) * np.log(1 - data_2016['Home Win Probability']))
data_2016["Log Loss"] = data_2016.apply(
    lambda row: log_loss([row["Home Win"]], [[row["Away Win Probability"], row["Home Win Probability"]]], labels=[0, 1]), 
    axis=1
)

# Average log loss
average_log_loss_optimized = data_2016['Log Loss'].mean()

print('Log Loss :',average_log_loss_optimized.round(4))
#data_2016.to_csv('Data with prediction.csv', index=False)

Log Loss : 0.635


In [52]:
data_2016.tail().iloc[:,0:60]

Unnamed: 0.1,Unnamed: 0,Date,Home team,Away team,Home score,Away score,Home Rebounds,Away Rebounds,Home Defensive rebounds,Away Defensive rebounds,...,Home Field goals Score,Home Field goals Attempts,Home Field goals Proportion,Away Field goals Score,Away Field goals Attempts,Away Field goals Proportion,Pace,Home ELO,Away ELO,Previous Home ELO
10051,5,2023-05-10 01:30:00,Boston Celtics,Philadelphia 76ers,103.0,115.0,36.0,49.0,27.0,40.0,...,33,83,0.39,40,79,0.5,92.7,1652.635823,1619.77918,1667.311654
10052,6,2023-05-10 04:00:00,Denver Nuggets,Phoenix Suns,118.0,102.0,50.0,42.0,40.0,34.0,...,42,85,0.49,38,88,0.43,97.9,1597.553877,1573.541013,1580.671188
10053,7,2023-05-11 01:30:00,New York Knicks,Miami Heat,112.0,103.0,50.0,34.0,36.0,23.0,...,35,71,0.49,37,88,0.42,93.8,1579.37679,1582.916298,1564.193979
10054,8,2023-05-11 04:00:00,Golden State Warriors,Los Angeles Lakers,121.0,106.0,48.0,38.0,39.0,31.0,...,47,92,0.51,42,87,0.48,101.5,1576.050938,1582.276459,1557.770461
10055,9,2023-05-12 01:30:00,Philadelphia 76ers,Boston Celtics,86.0,95.0,38.0,50.0,32.0,44.0,...,30,83,0.36,33,78,0.42,95.5,1611.12503,1661.289974,1619.77918


## Predicting future games

In [80]:
# Loading the game match list from the provided Excel sheet
game_matches = pd.read_excel('Games.xlsx')

# Compute the unified ELO rating for each team based on their last game
latest_elo_unified = data_2016.groupby(['Home team', 'Date']).last()['Home ELO'].reset_index()
latest_elo_unified = latest_elo_unified.append(data_2016.groupby(['Away team', 'Date']).last()['Away ELO'].reset_index().rename(columns={'Away team': 'Home team', 'Away ELO': 'Home ELO'}))
latest_elo_unified = latest_elo_unified.sort_values(by=['Home team', 'Date']).groupby('Home team').last()['Home ELO']

# Compute the sum of biggest lead from last 2 games for each team
latest_rolling_lead_unified_home = data_2016.groupby('Home team')['Home Biggest lead'].rolling(window=2).sum().reset_index().set_index('Home team')['Home Biggest lead']
latest_rolling_lead_unified_away = data_2016.groupby('Away team')['Away Biggest lead'].rolling(window=2).sum().reset_index().set_index('Away team')['Away Biggest lead']

# Combine to get the latest rolling sum of biggest lead for each team
latest_rolling_lead_unified = latest_rolling_lead_unified_home.append(latest_rolling_lead_unified_away).groupby(level=0).last()

# Assigning ELO and biggest lead to the game match list
game_matches['Previous Home ELO'] = game_matches['Home team'].map(latest_elo_unified)
game_matches['Previous Away ELO'] = game_matches['Away team'].map(latest_elo_unified)
game_matches['Home Previous Biggest lead Rolling'] = game_matches['Home team'].map(latest_rolling_lead_unified)
game_matches['Away Previous Biggest lead Rolling'] = game_matches['Away team'].map(latest_rolling_lead_unified)

# Using the model to predict the outcome
X_matches = game_matches[['Previous Home ELO', 'Previous Away ELO','Home Previous Biggest lead Rolling',
            'Away Previous Biggest lead Rolling']]
game_matches['Home Win Probability'] = best_model.predict_proba(X_matches)[:, 1]
game_matches['Away Win Probability'] = 1 - game_matches['Home Win Probability']

game_matches.head()


game_matches.to_csv('Game prediction.csv')

In [24]:
data_2016.to_csv('Data with prediction.csv', index=False)