In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Importing the ability to One Hot Encode our data
from sklearn.preprocessing import OneHotEncoder
import warnings
# Suppress every warning for the rest of the session (library warnings included)
warnings.filterwarnings('ignore')
# Lift the pandas display limits so frames are rendered with all rows and columns
pd.set_option('display.max_rows', None, 'display.max_columns', None)

# Preparing the Training Data

In [8]:
# Pulling in our Training Data
# Reads the Excel workbook (path is relative to the notebook's working directory) into a DataFrame
training_data = pd.read_excel('NBA_Stats_2000.xlsx')

In [9]:
# Fixing the "Net Rating", "Pace", and "Attendance" variables
# Vectorized casts replace the element-by-element loops; the resulting columns are the same.

# "Pace" and "Net Rating" only need a float conversion
for float_column in ['Pace', 'Net Rating']:
    training_data[float_column] = training_data[float_column].astype(float)

# "Attendance" is turned into text, stripped of ',' and '.' characters, then parsed as an integer.
# regex = False keeps '.' a literal dot rather than a match-anything pattern.
# NOTE(review): removing '.' assumes it is a thousands separator in text cells; a float cell
# such as 18234.0 would become 182340 - confirm against the spreadsheet.
training_data['Attendance'] = (
    training_data['Attendance']
    .astype(str)
    .str.replace(',', '', regex = False)
    .str.replace('.', '', regex = False)
    .astype(int)
)

In [11]:
# Deriving two extra columns from existing ones
# 'Off Assists' divided by 'Off Turnovers'
training_data['Assist_to_Turnover_Ratio'] = training_data['Off Assists'].div(training_data['Off Turnovers'])
# 'Off Points' minus 'Opp Points'
training_data['Margin_of_Victory'] = training_data['Off Points'].sub(training_data['Opp Points'])

In [10]:
# Print a summary of the training frame: column names, non-null counts and dtypes
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 715 entries, 0 to 714
Data columns (total 66 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Rk                         715 non-null    int64  
 1   Team                       715 non-null    object 
 2   Games Played               715 non-null    int64  
 3   Minutes Played             715 non-null    float64
 4   Opp Field Goals            715 non-null    float64
 5   Opp Field Goal Attempts    715 non-null    float64
 6   Opp Field Goal Percentage  715 non-null    float64
 7   Opp 3-Pointers             715 non-null    float64
 8   Opp 3-Pointer Attempts     715 non-null    float64
 9   Opp 3-Pointer Percentage   715 non-null    float64
 10  Opp 2-Pointers             715 non-null    float64
 11  Opp 2-Pointer Attempts     715 non-null    float64
 12  Opp 2-Pointer Percentage   715 non-null    float64
 13  Opp Free-Throws            715 non-null    float64

In [36]:
# One-hot encoding the categorical columns of the training data
# Columns that go through the encoder
train_categorical_columns = ['Champion', 'Top 3 Conference', 'Division', 'Conference', 'MVP']
categorical_train = training_data[train_categorical_columns]
# Everything else, minus the 'Team' label, is kept as-is on the numeric side
numerical_train = training_data.drop(columns = train_categorical_columns + ['Team'])

# Encoder configured to return a dense pandas frame; two-category columns keep a single dummy
encoder = OneHotEncoder(drop = 'if_binary', sparse_output = False)
encoder.set_output(transform = 'pandas')
# Fit on the training categories and encode them in one step
encoded_data = encoder.fit_transform(categorical_train)
# Final training frame: numeric columns followed by the dummy columns
train_hot_encoded = numerical_train.join(encoded_data)

In [37]:
# The 12 largest correlations with 'Champion_Yes' (the column itself comes first)
champion_correlations = train_hot_encoded.corr()['Champion_Yes']
champion_correlations.sort_values(ascending = False).head(12)

Champion_Yes                 1.000000
Top 3 Conference_Yes         0.371111
MVP_Yes                      0.326444
Longest Win Streak           0.299523
W/L%                         0.277664
Accolades                    0.276180
Margin_of_Victory            0.273716
Net Rating                   0.273677
SRS                          0.273275
W                            0.273071
Off Field Goal Percentage    0.217269
Mean Exp                     0.209728
Name: Champion_Yes, dtype: float64

In [38]:
# Print a summary of the encoded training frame: column names, non-null counts and dtypes
train_hot_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 715 entries, 0 to 714
Data columns (total 72 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Rk                         715 non-null    int64  
 1   Games Played               715 non-null    int64  
 2   Minutes Played             715 non-null    float64
 3   Opp Field Goals            715 non-null    float64
 4   Opp Field Goal Attempts    715 non-null    float64
 5   Opp Field Goal Percentage  715 non-null    float64
 6   Opp 3-Pointers             715 non-null    float64
 7   Opp 3-Pointer Attempts     715 non-null    float64
 8   Opp 3-Pointer Percentage   715 non-null    float64
 9   Opp 2-Pointers             715 non-null    float64
 10  Opp 2-Pointer Attempts     715 non-null    float64
 11  Opp 2-Pointer Percentage   715 non-null    float64
 12  Opp Free-Throws            715 non-null    float64
 13  Opp Free-Throw Attempts    715 non-null    float64

In [39]:
# Splitting the encoded training frame into the target vector and the feature matrix
target_column = 'Champion_Yes'
y_train = train_hot_encoded[target_column]
x_train = train_hot_encoded.drop(columns = target_column)

# Preparing the Testing Data

In [50]:
# Loading the testing workbook
testing_data = pd.read_excel('Testing Data.xlsx')
# Deriving the same two extra columns that the training frame has
# NOTE(review): the training frame also had "Pace", "Net Rating" and "Attendance" re-typed;
# no such step runs here - confirm the test workbook already stores them as numbers.
testing_data['Assist_to_Turnover_Ratio'] = testing_data['Off Assists'].div(testing_data['Off Turnovers'])
testing_data['Margin_of_Victory'] = testing_data['Off Points'].sub(testing_data['Opp Points'])

In [51]:
# One-hot encoding the categorical columns of the testing data
# NOTE(review): a brand-new encoder is fitted on the test rows here and `encoder` is rebound,
# so the dummy columns follow the categories present in the test file rather than the ones
# learned from the training data - confirm this is intended.
test_categorical_columns = ['Champion', 'Top 3 Conference', 'Division', 'Conference', 'MVP']
categorical_test = testing_data[test_categorical_columns]
# Everything else, minus the 'Team' label, is kept as-is on the numeric side
numerical_test = testing_data.drop(columns = test_categorical_columns + ['Team'])

# Encoder configured to return a dense pandas frame; two-category columns keep a single dummy
encoder = OneHotEncoder(drop = 'if_binary', sparse_output = False)
encoder.set_output(transform = 'pandas')
# Fit on the test categories and encode them in one step
encoded_data = encoder.fit_transform(categorical_test)
# Final testing frame: numeric columns followed by the dummy columns
test_hot_encoded = numerical_test.join(encoded_data)

In [52]:
# Building the test feature matrix with exactly the columns, in the same order, as x_train.
# The test categories are encoded separately from the training ones, so the dummy columns can
# differ (the test frame carries 'Champion_No' while training uses 'Champion_Yes'). Reindexing
# drops test-only columns and adds any training-only dummy column as all zeros.
missing_numeric = numerical_train.columns.difference(test_hot_encoded.columns)
if len(missing_numeric) > 0:
    # A missing numeric statistic must not be silently zero-filled
    raise ValueError(f'Testing data is missing numeric columns: {list(missing_numeric)}')
x_test = test_hot_encoded.reindex(columns = x_train.columns, fill_value = 0)

# Creating our Model

In [53]:
# Fitting a logistic regression with default settings on the training features
# NOTE(review): the features are passed unscaled (StandardScaler is imported but not used) and
# warnings are globally suppressed, so a convergence warning would not be shown - confirm the
# solver converged, e.g. by inspecting log_model.n_iter_.
log_model = LogisticRegression().fit(x_train, y_train)
# fit() returns the estimator itself, so this displays the fitted model
log_model

In [54]:
# Show the names of the feature columns the model was fitted on
x_train.columns

Index(['Rk', 'Games Played', 'Minutes Played', 'Opp Field Goals',
       'Opp Field Goal Attempts', 'Opp Field Goal Percentage',
       'Opp 3-Pointers', 'Opp 3-Pointer Attempts', 'Opp 3-Pointer Percentage',
       'Opp 2-Pointers', 'Opp 2-Pointer Attempts', 'Opp 2-Pointer Percentage',
       'Opp Free-Throws', 'Opp Free-Throw Attempts',
       'Opp Free-Throw Percentage', 'Opp Offensive Rebounds',
       'Opp Defensive Rebounds', 'Opp Total Rebounds', 'Opp Assists',
       'Opp Steals', 'Opp Blocks', 'Opp Turnovers', 'Opp Personal Fouls',
       'Opp Points', 'Year', 'Longest Win Streak', 'Accolades',
       'Off Field Goals', 'Off Field Goal Attempts',
       'Off Field Goal Percentage', 'Off 3-Pointers', 'Off 3-Pointer Attempts',
       'Off 3-Pointer Percentage', 'Off 2-Pointers', 'Off 2-Pointer Attempts',
       'Off 2-Pointer Percentage', 'Off Free-Throws',
       'Off Free-Throw Attempts', 'Off Free-Throw Percentage',
       'Off Offensive Rebounds', 'Off Defensive Rebounds',
  

In [55]:
# Show the names of the test feature columns passed to the model for prediction
x_test.columns

Index(['Rk', 'Games Played', 'Minutes Played', 'Opp Field Goals',
       'Opp Field Goal Attempts', 'Opp Field Goal Percentage',
       'Opp 3-Pointers', 'Opp 3-Pointer Attempts', 'Opp 3-Pointer Percentage',
       'Opp 2-Pointers', 'Opp 2-Pointer Attempts', 'Opp 2-Pointer Percentage',
       'Opp Free-Throws', 'Opp Free-Throw Attempts',
       'Opp Free-Throw Percentage', 'Opp Offensive Rebounds',
       'Opp Defensive Rebounds', 'Opp Total Rebounds', 'Opp Assists',
       'Opp Steals', 'Opp Blocks', 'Opp Turnovers', 'Opp Personal Fouls',
       'Opp Points', 'Year', 'Longest Win Streak', 'Accolades',
       'Off Field Goals', 'Off Field Goal Attempts',
       'Off Field Goal Percentage', 'Off 3-Pointers', 'Off 3-Pointer Attempts',
       'Off 3-Pointer Percentage', 'Off 2-Pointers', 'Off 2-Pointer Attempts',
       'Off 2-Pointer Percentage', 'Off Free-Throws',
       'Off Free-Throw Attempts', 'Off Free-Throw Percentage',
       'Off Offensive Rebounds', 'Off Defensive Rebounds',
  

In [57]:
# Predicted class probabilities for every test row (one column per model class)
probabilities = log_model.predict_proba(x_test)
probabilities_df = pd.DataFrame(probabilities)
# Rescale each class column so that it sums to 1 across all test rows
column_totals = probabilities_df.sum(axis = 0)
normalized_probabilities_df = probabilities_df / column_totals
normalized_probabilities_df

Unnamed: 0,0,1
0,0.030428,0.111018
1,0.033198,0.036944
2,0.034012,0.015188
3,0.034163,0.011157
4,0.017488,0.456986
5,0.030642,0.105294
6,0.033926,0.017497
7,0.033778,0.021453
8,0.034107,0.012649
9,0.033473,0.029595


# Testing a Random Forest