In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
file_path = Path('../Toronto Raptors Analysis')

# Load the regular data
reg_df = pd.read_csv(f"{file_path}/raptors_regulars.csv")

# Load the playoff data
playoff_df = pd.read_csv(f"{file_path}/raptorsplayoffs.csv")

In [4]:
# Here we Inspect columns/data types

print(reg_df.dtypes)

print(playoff_df.dtypes)

TEAM        object
DATE        object
MATCHUP     object
W/L         object
MIN          int64
PTS          int64
FGM          int64
FGA          int64
FG%        float64
3PM          int64
3PA          int64
3P%        float64
FTM          int64
FTA          int64
FT%        float64
OREB         int64
DREB         int64
REB          int64
AST          int64
STL          int64
BLK          int64
TOV          int64
PF           int64
+/-          int64
dtype: object
TEAM        object
DATE        object
MATCHUP     object
W/L         object
MIN          int64
PTS          int64
FGM          int64
FGA          int64
FG%        float64
3PM          int64
3PA          int64
3P%        float64
FTM          int64
FTA          int64
FT%        float64
OREB         int64
DREB         int64
REB          int64
AST          int64
STL          int64
BLK          int64
TOV          int64
PF           int64
+/-          int64
dtype: object


In [5]:
# Here I join the 2 datasets using concat()

joined_df = pd.concat([reg_df,playoff_df], ignore_index=True)

joined_df.head(3)

Unnamed: 0,TEAM,DATE,MATCHUP,W/L,MIN,PTS,FGM,FGA,FG%,3PM,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-
0,TOR,01/13/2019,TOR @ WAS,W,290,140,49,104,47.1,13,...,80.6,17,42,59,24,16,8,21,27,2
1,TOR,11/29/2018,TOR vs. GSW,W,265,131,47,90,52.2,15,...,91.7,7,30,37,25,5,6,14,23,3
2,TOR,02/13/2019,TOR vs. WAS,W,241,129,44,92,47.8,16,...,83.3,9,43,52,32,8,8,14,27,9


In [None]:
# Clean our Data

In [6]:
# Drop columns we are not using 
dropped_df = joined_df.drop(["DATE", "TEAM","MATCHUP","+/-"], axis=1)

dropped_df.head(3)

Unnamed: 0,W/L,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF
0,W,290,140,49,104,47.1,13,36,36.1,29,36,80.6,17,42,59,24,16,8,21,27
1,W,265,131,47,90,52.2,15,38,39.5,22,24,91.7,7,30,37,25,5,6,14,23
2,W,241,129,44,92,47.8,16,38,42.1,25,30,83.3,9,43,52,32,8,8,14,27


In [7]:
dropped_df.dtypes

W/L      object
MIN       int64
PTS       int64
FGM       int64
FGA       int64
FG%     float64
3PM       int64
3PA       int64
3P%     float64
FTM       int64
FTA       int64
FT%     float64
OREB      int64
DREB      int64
REB       int64
AST       int64
STL       int64
BLK       int64
TOV       int64
PF        int64
dtype: object

In [8]:
# Count total wins and losses
dropped_df["W/L"].value_counts()

W    74
L    32
Name: W/L, dtype: int64

In [9]:
# Convert W/L column 
# Make W = 0 and L = 1

dropped_df.loc[dropped_df['W/L'] == "W", "W/L"] = 0
dropped_df.loc[dropped_df['W/L'] == "L", "W/L"] = 1

dropped_df["W/L"] = pd.to_numeric(dropped_df["W/L"])
    
dropped_df["W/L"].value_counts()

0    74
1    32
Name: W/L, dtype: int64

In [10]:
dropped_df.dtypes

W/L       int64
MIN       int64
PTS       int64
FGM       int64
FGA       int64
FG%     float64
3PM       int64
3PA       int64
3P%     float64
FTM       int64
FTA       int64
FT%     float64
OREB      int64
DREB      int64
REB       int64
AST       int64
STL       int64
BLK       int64
TOV       int64
PF        int64
dtype: object

In [11]:
# Drop the null columns and rows
cleaned_df = dropped_df.dropna(axis='columns', how='all')


cleaned_df = cleaned_df.dropna()

cleaned_df.head(3)

Unnamed: 0,W/L,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF
0,0,290,140,49,104,47.1,13,36,36.1,29,36,80.6,17,42,59,24,16,8,21,27
1,0,265,131,47,90,52.2,15,38,39.5,22,24,91.7,7,30,37,25,5,6,14,23
2,0,241,129,44,92,47.8,16,38,42.1,25,30,83.3,9,43,52,32,8,8,14,27


In [None]:
# Data Seperate into Testing and Training

In [12]:
# Create the features

X = cleaned_df.copy()
X = X.drop("W/L", axis = 1)

X.head(3)


Unnamed: 0,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF
0,290,140,49,104,47.1,13,36,36.1,29,36,80.6,17,42,59,24,16,8,21,27
1,265,131,47,90,52.2,15,38,39.5,22,24,91.7,7,30,37,25,5,6,14,23
2,241,129,44,92,47.8,16,38,42.1,25,30,83.3,9,43,52,32,8,8,14,27


In [14]:
# Create our target

target = ["W/L"]

y = cleaned_df[target]
y[:5]

X.describe()

Unnamed: 0,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF
count,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0
mean,242.349057,112.660377,41.169811,88.103774,46.934906,12.273585,33.943396,36.039623,18.04717,22.283019,81.386792,9.292453,35.254717,44.54717,24.773585,8.235849,5.160377,12.915094,21.386792
std,8.866247,11.018501,4.705562,6.883976,5.795648,3.511974,5.366262,8.347383,5.677662,6.97445,9.092614,3.799667,4.612739,5.648653,5.36884,3.302386,2.398955,3.532483,3.95106
min,238.0,86.0,28.0,69.0,29.5,4.0,20.0,20.0,6.0,7.0,58.8,2.0,25.0,31.0,13.0,1.0,0.0,6.0,13.0
25%,239.0,105.0,38.0,84.0,42.7,10.0,30.0,29.0,13.25,17.0,76.375,6.0,32.25,41.0,21.0,6.0,4.0,10.0,19.0
50%,240.0,114.0,41.0,88.5,47.35,12.0,34.0,35.8,18.0,22.0,81.55,8.0,35.0,44.0,25.0,8.0,5.0,13.0,21.0
75%,241.0,121.75,44.0,92.0,51.575,15.0,38.0,41.45,22.0,27.0,87.3,12.0,38.0,49.0,29.0,10.0,6.0,15.0,24.0
max,290.0,140.0,53.0,105.0,60.9,21.0,47.0,58.1,32.0,40.0,100.0,18.0,47.0,59.0,36.0,19.0,12.0,23.0,30.0


In [15]:
y["W/L"].value_counts()

0    74
1    32
Name: W/L, dtype: int64

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# OVERSAMPLING

In [None]:
# NAIVE RANDOM OVERSAMPLING

In [18]:
# RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'W/L': 1})

In [19]:
# Resample
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [20]:
# Accuracy score
y_pred = model.predict(X_test)

from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.8583333333333334

In [22]:
# confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index = ["Actual Wins", "Actual Losses"], columns=["Predicted Wins", "Predicted Losses"])
cm_df

Unnamed: 0,Predicted Wins,Predicted Losses
Actual Wins,12,3
Actual Losses,1,11


In [23]:
# Report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.80      0.92      0.86      0.86      0.72        15
          1       0.79      0.92      0.80      0.85      0.86      0.74        12

avg / total       0.86      0.85      0.86      0.85      0.86      0.73        27



In [None]:
# SMOTE

In [24]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({'W/L': 1})

In [25]:
# Resample
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [26]:
# Accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8

In [27]:
# Confusion matrix

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index = ["Actual Wins", "Actual Losses"], columns=["Predicted Wins", "Predicted Losses"])
cm_df

Unnamed: 0,Predicted Wins,Predicted Losses
Actual Wins,14,1
Actual Losses,4,8


In [29]:
# Report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.78      0.93      0.67      0.85      0.79      0.64        15
          1       0.89      0.67      0.93      0.76      0.79      0.61        12

avg / total       0.83      0.81      0.79      0.81      0.79      0.62        27



In [30]:
# UNDERSAMPLING

In [31]:
# ClusterCentroids
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'W/L': 1})

In [32]:
# Resample
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [33]:
# Accuracy score
y_pred = model.predict(X_test)

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.8500000000000001

In [34]:
# Confusion matrix

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index = ["Actual Wins", "Actual Losses"], columns=["Predicted Wins", "Predicted Losses"])
cm_df

Unnamed: 0,Predicted Wins,Predicted Losses
Actual Wins,13,2
Actual Losses,2,10


In [35]:
# Report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.87      0.87      0.83      0.87      0.85      0.72        15
          1       0.83      0.83      0.87      0.83      0.85      0.72        12

avg / total       0.85      0.85      0.85      0.85      0.85      0.72        27



In [36]:
#SMOTTEN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({'W/L': 1})

In [37]:
# Resample

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [38]:
# Accuracy score
y_pred = model.predict(X_test)

from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.8

In [39]:
# Confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index = ["Actual Wins", "Actual Losses"], columns=["Predicted Wins", "Predicted Losses"])
cm_df

Unnamed: 0,Predicted Wins,Predicted Losses
Actual Wins,9,6
Actual Losses,0,12


In [40]:
# Report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.60      1.00      0.75      0.77      0.58        15
          1       0.67      1.00      0.60      0.80      0.77      0.62        12

avg / total       0.85      0.78      0.82      0.77      0.77      0.60        27

