In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
from sqlalchemy import create_engine
import psycopg2
from config import db_password
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Establishing connection to the database

In [2]:
# Establishing connection to the database (test)

db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/nba_data"
    
conn = create_engine(db_string)

In [3]:
# Read in dataframe
df = pd.read_csv("basketball_data.csv")
df.head()

#ETL -- extract transform load
df.columns = [i.replace("%", "") for i in df.columns]
df.drop("Unnamed: 0", inplace =True, axis =1)


### Exporting data to database

In [4]:
# Exporting dataframe to SQL (test)

#df = pd.read_csv("basketball_data.csv")

df.to_sql("basketball_data", con = conn, index=False, if_exists="replace")

### Importing data from database

In [5]:
# Importing dataframe from SQL (test)

from_sql_df = pd.read_sql("basketball_data", con = conn)

In [6]:
# Reading the dataframe
from_sql_df.head(10)

Unnamed: 0,RK,Name,POS,GP,MIN,PTS,FGM,FGA,FG,3PM,...,FTA,FT,REB,AST,STL,BLK,TO,DD2,TD3,PER
0,1,Jrue HolidayMIL,PG,23,32.5,16.4,6.5,13.0,50.2,1.9,...,1.9,79.1,4.8,5.4,1.9,0.6,1.6,0,0,19.99
1,2,Kawhi LeonardLAC,SF,23,34.4,26.7,9.7,19.0,51.3,1.9,...,6.1,87.9,5.9,5.0,1.8,0.6,1.8,4,0,27.63
2,3,T.J. McConnellIND,PG,26,24.0,5.8,2.7,5.3,50.4,0.2,...,0.5,33.3,3.3,6.6,1.7,0.4,1.7,0,0,15.22
3,4,Fred VanVleetTOR,SG,29,36.6,20.3,6.9,16.6,41.5,3.3,...,3.7,89.6,4.2,6.7,1.7,0.7,2.1,3,0,18.16
4,5,Ben SimmonsPHI,PG,25,33.4,15.2,5.8,10.4,56.2,0.0,...,5.3,66.7,8.3,8.0,1.7,0.8,3.4,11,3,19.92
5,6,Robert CovingtonPOR,PF,26,31.4,7.6,2.7,7.2,37.1,1.7,...,0.7,88.9,6.2,1.8,1.6,0.9,1.0,1,0,9.97
6,7,Andre DrummondCLE,C,25,28.9,17.5,7.2,15.2,47.4,0.0,...,5.2,59.7,13.5,2.6,1.6,1.2,3.2,18,0,20.94
7,8,Matisse ThybullePHI,SG,26,17.7,3.3,1.2,3.2,38.6,0.6,...,0.5,50.0,1.3,0.8,1.6,0.8,0.3,0,0,9.13
8,9,Nikola JokicDEN,C,28,35.9,27.4,10.5,18.4,57.0,1.5,...,5.5,87.1,11.1,8.6,1.6,0.6,3.4,25,6,31.48
9,10,Dejounte MurraySA,PG,27,30.4,15.1,6.2,13.9,44.7,1.0,...,2.1,81.0,7.2,5.0,1.6,0.1,1.7,8,2,16.45


In [7]:
df.dtypes

RK        int64
Name     object
POS      object
GP        int64
MIN     float64
PTS     float64
FGM     float64
FGA     float64
FG      float64
3PM     float64
3PA     float64
3P      float64
FTM     float64
FTA     float64
FT      float64
REB     float64
AST     float64
STL     float64
BLK     float64
TO      float64
DD2       int64
TD3       int64
PER     float64
dtype: object

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df2 = df.copy()
df2['POS'] = le.fit_transform(df2['POS'])
df2.head(10)

Unnamed: 0,RK,Name,POS,GP,MIN,PTS,FGM,FGA,FG,3PM,...,FTA,FT,REB,AST,STL,BLK,TO,DD2,TD3,PER
0,1,Jrue HolidayMIL,2,23,32.5,16.4,6.5,13.0,50.2,1.9,...,1.9,79.1,4.8,5.4,1.9,0.6,1.6,0,0,19.99
1,2,Kawhi LeonardLAC,3,23,34.4,26.7,9.7,19.0,51.3,1.9,...,6.1,87.9,5.9,5.0,1.8,0.6,1.8,4,0,27.63
2,3,T.J. McConnellIND,2,26,24.0,5.8,2.7,5.3,50.4,0.2,...,0.5,33.3,3.3,6.6,1.7,0.4,1.7,0,0,15.22
3,4,Fred VanVleetTOR,4,29,36.6,20.3,6.9,16.6,41.5,3.3,...,3.7,89.6,4.2,6.7,1.7,0.7,2.1,3,0,18.16
4,5,Ben SimmonsPHI,2,25,33.4,15.2,5.8,10.4,56.2,0.0,...,5.3,66.7,8.3,8.0,1.7,0.8,3.4,11,3,19.92
5,6,Robert CovingtonPOR,1,26,31.4,7.6,2.7,7.2,37.1,1.7,...,0.7,88.9,6.2,1.8,1.6,0.9,1.0,1,0,9.97
6,7,Andre DrummondCLE,0,25,28.9,17.5,7.2,15.2,47.4,0.0,...,5.2,59.7,13.5,2.6,1.6,1.2,3.2,18,0,20.94
7,8,Matisse ThybullePHI,4,26,17.7,3.3,1.2,3.2,38.6,0.6,...,0.5,50.0,1.3,0.8,1.6,0.8,0.3,0,0,9.13
8,9,Nikola JokicDEN,0,28,35.9,27.4,10.5,18.4,57.0,1.5,...,5.5,87.1,11.1,8.6,1.6,0.6,3.4,25,6,31.48
9,10,Dejounte MurraySA,2,27,30.4,15.1,6.2,13.9,44.7,1.0,...,2.1,81.0,7.2,5.0,1.6,0.1,1.7,8,2,16.45


In [9]:
df2 = pd.get_dummies(df2, columns=["Name"])
df2.head()

Unnamed: 0,RK,POS,GP,MIN,PTS,FGM,FGA,FG,3PM,3PA,...,Name_Pascal SiakamTOR,Name_Ricky RubioMIN,Name_Robert CovingtonPOR,Name_Stephen CurryGS,Name_T.J. McConnellIND,Name_Terry RozierCHA,Name_Thaddeus YoungCHI,Name_Tyrese HaliburtonSAC,Name_Tyus JonesMEM,Name_Zach LaVineCHI
0,1,2,23,32.5,16.4,6.5,13.0,50.2,1.9,4.8,...,0,0,0,0,0,0,0,0,0,0
1,2,3,23,34.4,26.7,9.7,19.0,51.3,1.9,4.9,...,0,0,0,0,0,0,0,0,0,0
2,3,2,26,24.0,5.8,2.7,5.3,50.4,0.2,0.6,...,0,0,0,0,1,0,0,0,0,0
3,4,4,29,36.6,20.3,6.9,16.6,41.5,3.3,8.7,...,0,0,0,0,0,0,0,0,0,0
4,5,2,25,33.4,15.2,5.8,10.4,56.2,0.0,0.2,...,0,0,0,0,0,0,0,0,0,0


### Creating a Support-vector machine

In [None]:
y = df2["RK"]
X = df2.drop(columns="RK")

### Split the dataset into Training and Testing sets:

In [None]:
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X,
#    y, random_state=1, stratify=y)

X_train, X_test, y_train, y_test = train_test_split(X,
    y, test_size=0.33, random_state=42)

In [None]:
from sklearn.svm import SVC
model = SVC(kernel='linear')

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
results = pd.DataFrame({
   "Prediction": y_pred,
   "Actual": y_test
}).reset_index(drop=True)
results.head()

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
# Define the features set.
X = df2.copy()
X = X.drop("RK", axis=1)
X.head()

In [None]:
# Define the target set.
y = df2["RK"].values
y[:5]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)

In [None]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Splitting into Train and Test sets into an 80/20 split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [None]:
# Determine the shape of our training and testing sets.
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

In [None]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

In [None]:
predictions

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

In [10]:
# Random Forest
# Define the features set.
X = df2.copy()
X = X.drop("RK", axis=1)
X.head()

Unnamed: 0,POS,GP,MIN,PTS,FGM,FGA,FG,3PM,3PA,3P,...,Name_Pascal SiakamTOR,Name_Ricky RubioMIN,Name_Robert CovingtonPOR,Name_Stephen CurryGS,Name_T.J. McConnellIND,Name_Terry RozierCHA,Name_Thaddeus YoungCHI,Name_Tyrese HaliburtonSAC,Name_Tyus JonesMEM,Name_Zach LaVineCHI
0,2,23,32.5,16.4,6.5,13.0,50.2,1.9,4.8,38.7,...,0,0,0,0,0,0,0,0,0,0
1,3,23,34.4,26.7,9.7,19.0,51.3,1.9,4.9,38.9,...,0,0,0,0,0,0,0,0,0,0
2,2,26,24.0,5.8,2.7,5.3,50.4,0.2,0.6,37.5,...,0,0,0,0,1,0,0,0,0,0
3,4,29,36.6,20.3,6.9,16.6,41.5,3.3,8.7,37.7,...,0,0,0,0,0,0,0,0,0,0
4,2,25,33.4,15.2,5.8,10.4,56.2,0.0,0.2,16.7,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Define the target set.
y = df2["RK"].ravel()
y[:5]

array([1, 2, 3, 4, 5])

In [12]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

In [13]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [15]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [16]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [17]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

ValueError: Shape of passed values is (17, 17), indices imply (2, 2)