### Importing dependencies

In [223]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
from sqlalchemy import create_engine
import psycopg2
from config import db_password
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Establishing connection to the database

In [232]:
# Establishing connection to the database (test)

db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/nba_data"
    
conn = create_engine(db_string)

### Reading individual player stats dataframe

In [233]:
# Read in dataframe
df = pd.read_csv("individual_player_stats.csv", encoding='ISO-8859-1')
# df.to_csv("individual_player_stats_2.csv", index=False)
# df2 = pd.read_csv("individual_player_stats_2.csv")

# df.dropna(inplace=True)
# df.isna().sum()

#ETL -- extract transform load
#df.columns = [i.replace("%", "") for i in df.columns]
# df.drop("Unnamed: 0", inplace =True, axis =1)


UnicodeDecodeError: 'charmap' codec can't decode byte 0xc3 in position 21566: character maps to <undefined>

In [227]:
df.columns

Index(['player', 'pos', 'age', 'team', 'game', 'gamestarted', 'minutesplayed',
       'fieldgoalmade', 'fieldgoalattempt', 'fieldgoalpercentage',
       'threepointmade', 'threepa', 'threepointattempt', 'twopointmade',
       'twopointattempt', 'twopointpercentage', 'efficencyfgpercentage',
       'freethrowsmade', 'freethrowattempt', 'freethrowpercentage',
       'offensiverebound', 'deffensiverebound', 'totalrebound', 'assist',
       'steal', 'block', 'turnover', 'personalfoul', 'points'],
      dtype='object')

In [228]:
# df.columns = [i.lower() for i in df.columns]
# df.head(10)

### Exporting data to database

In [229]:
len(set(df.team.tolist()))

31

In [230]:
df.to_csv("individual_player_stats3.csv", index=False)

In [216]:
# Exporting dataframe to SQL (test)

#df = pd.read_csv("basketball_data.csv")

df.to_sql("individual_player_stats", con = conn, index=False, if_exists="append")

### Importing data from database

In [101]:
# Importing dataframe from SQL (test)

from_sql_df = pd.read_sql("individual_player_stats", con = conn)

In [102]:
# Reading the dataframe
from_sql_df.head(10)

Unnamed: 0,player,pos,age,team,game,gamestarted,minutesplayed,fieldgoalmade,fieldgoalattempt,fieldgoalpercentage,...,freethrowpercentage,offensiverebound,deffensiverebound,totalrebound,assist,steal,block,turnover,personalfoul,points
0,Steven Adams,C,27,NOP,27,27,760,94,156,1,...,0,116,123,239,58,26,15,46,51,217
1,Bam Adebayo,C,23,MIA,27,27,908,198,347,1,...,1,53,199,252,149,25,27,82,69,534
2,LaMarcus Aldridge,C,35,SAS,18,18,480,107,225,0,...,1,15,63,78,35,7,16,16,27,254
3,Nickeil Alexander-Walker,SG,22,NOP,23,3,441,77,188,0,...,1,5,56,61,46,25,8,30,40,203
4,Grayson Allen,SG,25,MEM,19,8,454,60,140,0,...,1,7,48,55,39,19,3,20,24,197
5,Jarrett Allen,C,22,TOT,28,10,734,122,190,1,...,1,82,170,252,45,13,46,43,44,345
6,Jarrett Allen,C,22,CLE,16,5,414,78,125,1,...,1,44,83,127,25,6,27,21,23,211
7,Kyle Anderson,PF,27,MEM,24,24,675,120,257,0,...,1,22,128,150,92,27,18,34,43,330
8,Giannis Antetokounmpo,PF,26,MIL,28,28,944,287,516,1,...,1,48,272,320,165,36,36,104,89,784
9,Thanasis Antetokounmpo,SF,28,MIL,19,0,157,19,32,1,...,1,16,18,34,15,5,3,15,27,45


In [103]:
df.dtypes

player                    object
pos                       object
age                        int64
team                      object
game                       int64
gamestarted                int64
minutesplayed              int64
fieldgoalmade              int64
fieldgoalattempt           int64
fieldgoalpercentage      float64
threepointmade             int64
threepa                    int64
threepointattempt        float64
twopointmade               int64
twopointattempt            int64
twopointpercentage       float64
efficencyfgpercentage    float64
freethrowsmade             int64
freethrowattempt           int64
freethrowpercentage      float64
offensiverebound           int64
deffensiverebound          int64
totalrebound               int64
assist                     int64
steal                      int64
block                      int64
turnover                   int64
personalfoul               int64
points                     int64
dtype: object

In [104]:
# Dropping noisy data
df = df.drop(["player", "pos", "age", "gamestarted", "minutesplayed", "game"], axis=1)

df.head(10)

Unnamed: 0,team,fieldgoalmade,fieldgoalattempt,fieldgoalpercentage,threepointmade,threepa,threepointattempt,twopointmade,twopointattempt,twopointpercentage,...,freethrowpercentage,offensiverebound,deffensiverebound,totalrebound,assist,steal,block,turnover,personalfoul,points
2,NOP,94,156,0.603,0,1,0.0,94,155,0.606,...,0.468,116,123,239,58,26,15,46,51,217
3,MIA,198,347,0.571,2,5,0.4,196,342,0.573,...,0.845,53,199,252,149,25,27,82,69,534
4,SAS,107,225,0.476,24,67,0.358,83,158,0.525,...,0.762,15,63,78,35,7,16,16,27,254
6,NOP,77,188,0.41,24,87,0.276,53,101,0.525,...,0.781,5,56,61,46,25,8,30,40,203
7,MEM,60,140,0.429,44,101,0.436,16,39,0.41,...,0.892,7,48,55,39,19,3,20,24,197
8,TOT,122,190,0.642,1,4,0.25,121,186,0.651,...,0.758,82,170,252,45,13,46,43,44,345
10,CLE,78,125,0.624,1,4,0.25,77,121,0.636,...,0.761,44,83,127,25,6,27,21,23,211
12,MEM,120,257,0.467,40,103,0.388,80,154,0.519,...,0.781,22,128,150,92,27,18,34,43,330
13,MIL,287,516,0.556,31,111,0.279,256,405,0.632,...,0.637,48,272,320,165,36,36,104,89,784
15,MIL,19,32,0.594,2,7,0.286,17,25,0.68,...,0.556,16,18,34,15,5,3,15,27,45


In [106]:
# New df 
new_df = df[["team", "points", "totalrebound", "assist", "block", "steal", "turnover", "offensiverebound", "deffensiverebound"]]

In [107]:
new_df.head(10)

Unnamed: 0,team,points,totalrebound,assist,block,steal,turnover,offensiverebound,deffensiverebound
2,NOP,217,239,58,15,26,46,116,123
3,MIA,534,252,149,27,25,82,53,199
4,SAS,254,78,35,16,7,16,15,63
6,NOP,203,61,46,8,25,30,5,56
7,MEM,197,55,39,3,19,20,7,48
8,TOT,345,252,45,46,13,43,82,170
10,CLE,211,127,25,27,6,21,44,83
12,MEM,330,150,92,18,27,34,22,128
13,MIL,784,320,165,36,36,104,48,272
15,MIL,45,34,15,3,5,15,16,18


In [109]:
df2 = new_df.groupby("team")
df2.head(10)

Unnamed: 0,team,points,totalrebound,assist,block,steal,turnover,offensiverebound,deffensiverebound
2,NOP,217,239,58,15,26,46,116,123
3,MIA,534,252,149,27,25,82,53,199
4,SAS,254,78,35,16,7,16,15,63
6,NOP,203,61,46,8,25,30,5,56
7,MEM,197,55,39,3,19,20,7,48
...,...,...,...,...,...,...,...,...,...
439,BOS,594,161,108,9,29,56,13,148
440,BOS,127,37,46,3,21,25,7,30
441,CHI,230,80,51,13,28,27,13,67
443,BOS,243,133,36,26,15,27,34,99


### Reading iteam abbreviations dataframe

In [134]:
# Read in dataframe
df3 = pd.read_csv("team_abbreviations.csv", encoding='ISO-8859-1')
df3

Unnamed: 0,Team,Franchise
0,ATL,Atlanta Hawks
1,BKN,Brooklyn Nets
2,BOS,Boston Celtics
3,CHA,Charlotte Hornets
4,CHI,Chicago Bulls
5,CLE,Cleveland Cavaliers
6,DAL,Dallas Mavericks
7,DEN,Denver Nuggets
8,DET,Detroit Pistons
9,GSW,Golden State Warriors


In [135]:
df3.columns

Index(['Team', 'Franchise'], dtype='object')

In [136]:
df3.columns = [i.lower() for i in df3.columns]

In [137]:
df3

Unnamed: 0,team,franchise
0,ATL,Atlanta Hawks
1,BKN,Brooklyn Nets
2,BOS,Boston Celtics
3,CHA,Charlotte Hornets
4,CHI,Chicago Bulls
5,CLE,Cleveland Cavaliers
6,DAL,Dallas Mavericks
7,DEN,Denver Nuggets
8,DET,Detroit Pistons
9,GSW,Golden State Warriors


In [138]:
# Exporting dataframe to SQL (test)

#df = pd.read_csv("basketball_data.csv")

df3.to_sql("team_abbreviations", con = conn, index=False, if_exists="append")

In [139]:
# Importing dataframe from SQL (test)

from_sql_df3 = pd.read_sql("team_abbreviations", con = conn)

In [140]:
# Reading the dataframe
from_sql_df3.head(10)

Unnamed: 0,team,franchise
0,ATL,Atlanta Hawks
1,BKN,Brooklyn Nets
2,BOS,Boston Celtics
3,CHA,Charlotte Hornets
4,CHI,Chicago Bulls
5,CLE,Cleveland Cavaliers
6,DAL,Dallas Mavericks
7,DEN,Denver Nuggets
8,DET,Detroit Pistons
9,GSW,Golden State Warriors


In [152]:
# Read in dataframe
df2 = pd.read_csv("nba_season_stats.csv", encoding='ISO-8859-1')
df2.head()

Unnamed: 0,ï»¿Franchise,Conference,Win,Lost,WinPercentage,GamesBehind,ConferenceRecord,DivisionRecord,HomeRecord,RoadRecord,OverTimeRecord,Last10Record,Streak
0,Philadelphia 76ers,Eastern,19,10,0.655,0.0,15-4,5-1,12-2,7-8,1-1,6-4,W 1
1,Brooklyn Nets,Eastern,19,12,0.613,1.0,10-9,3-2,11-5,8-7,1-2,6-4,W 5
2,Milwaukee Bucks,Eastern,16,13,0.552,3.0,12-7,8-1,9-4,7-9,0-0,5-5,L 5
3,Indiana Pacers,Eastern,15,14,0.517,4.0,9-8,3-2,7-9,8-5,3-1,4-6,W 1
4,Boston Celtics,Eastern,14,14,0.5,4.5,10-9,2-4,7-5,7-9,0-0,4-6,L 1


In [153]:
df2.columns

Index(['ï»¿Franchise', 'Conference', 'Win', 'Lost', 'WinPercentage',
       'GamesBehind', 'ConferenceRecord', 'DivisionRecord', 'HomeRecord',
       'RoadRecord', 'OverTimeRecord', 'Last10Record', 'Streak'],
      dtype='object')

In [154]:
df2.columns = [i.lower() for i in df.columns]

ValueError: Length mismatch: Expected axis has 13 elements, new values have 29 elements

In [130]:
# Exporting dataframe to SQL (test)

#df = pd.read_csv("basketball_data.csv")

df2.to_sql("nba_season_stats", con = conn, index=False, if_exists="append")

ProgrammingError: (psycopg2.errors.UndefinedColumn) column "ï»¿Franchise" of relation "nba_season_stats" does not exist
LINE 1: INSERT INTO nba_season_stats ("ï»¿Franchise", "Conference", ...
                                      ^

[SQL: INSERT INTO nba_season_stats ("ï»¿Franchise", "Conference", "Win", "Lost", "WinPercentage", "GamesBehind", "ConferenceRecord", "DivisionRecord", "HomeRecord", "RoadRecord", "OverTimeRecord", "Last10Record", "Streak") VALUES (%(ï»¿Franchise)s, %(Conference)s, %(Win)s, %(Lost)s, %(WinPercentage)s, %(GamesBehind)s, %(ConferenceRecord)s, %(DivisionRecord)s, %(HomeRecord)s, %(RoadRecord)s, %(OverTimeRecord)s, %(Last10Record)s, %(Streak)s)]
[parameters: ({'ï»¿Franchise': 'Philadelphia 76ers', 'Conference': 'Eastern', 'Win': 19, 'Lost': 10, 'WinPercentage': 0.655, 'GamesBehind': 0.0, 'ConferenceRecord': '15-4', 'DivisionRecord': '5-1', 'HomeRecord': '12-2', 'RoadRecord': '7-8', 'OverTimeRecord': '1-1', 'Last10Record': '6-4', 'Streak': 'W 1'}, {'ï»¿Franchise': 'Brooklyn Nets', 'Conference': 'Eastern', 'Win': 19, 'Lost': 12, 'WinPercentage': 0.613, 'GamesBehind': 1.0, 'ConferenceRecord': '10-9', 'DivisionRecord': '3-2', 'HomeRecord': '11-5', 'RoadRecord': '8-7', 'OverTimeRecord': '1-2', 'Last10Record': '6-4', 'Streak': 'W 5'}, {'ï»¿Franchise': 'Milwaukee Bucks', 'Conference': 'Eastern', 'Win': 16, 'Lost': 13, 'WinPercentage': 0.552, 'GamesBehind': 3.0, 'ConferenceRecord': '12-7', 'DivisionRecord': '8-1', 'HomeRecord': '9-4', 'RoadRecord': '7-9', 'OverTimeRecord': '0-0', 'Last10Record': '5-5', 'Streak': 'L 5'}, {'ï»¿Franchise': 'Indiana Pacers', 'Conference': 'Eastern', 'Win': 15, 'Lost': 14, 'WinPercentage': 0.517, 'GamesBehind': 4.0, 'ConferenceRecord': '9-8', 'DivisionRecord': '3-2', 'HomeRecord': '7-9', 'RoadRecord': '8-5', 'OverTimeRecord': '3-1', 'Last10Record': '4-6', 'Streak': 'W 1'}, {'ï»¿Franchise': 'Boston Celtics', 'Conference': 'Eastern', 'Win': 14, 'Lost': 14, 'WinPercentage': 0.5, 'GamesBehind': 4.5, 'ConferenceRecord': '10-9', 'DivisionRecord': '2-4', 'HomeRecord': '7-5', 'RoadRecord': '7-9', 'OverTimeRecord': '0-0', 'Last10Record': '4-6', 'Streak': 'L 1'}, {'ï»¿Franchise': 'Toronto Raptors', 'Conference': 'Eastern', 'Win': 14, 'Lost': 15, 'WinPercentage': 0.483, 'GamesBehind': 5.0, 'ConferenceRecord': '11-7', 'DivisionRecord': '2-3', 'HomeRecord': '6-6', 'RoadRecord': '8-9', 'OverTimeRecord': '0-0', 'Last10Record': '7-3', 'Streak': 'W 2'}, {'ï»¿Franchise': 'New York Knicks', 'Conference': 'Eastern', 'Win': 14, 'Lost': 16, 'WinPercentage': 0.467, 'GamesBehind': 5.5, 'ConferenceRecord': '10-10', 'DivisionRecord': '1-3', 'HomeRecord': '7-6', 'RoadRecord': '7-10', 'OverTimeRecord': '0-0', 'Last10Record': '5-5', 'Streak': 'L 1'}, {'ï»¿Franchise': 'Charlotte Hornets', 'Conference': 'Eastern', 'Win': 13, 'Lost': 15, 'WinPercentage': 0.46399999999999997, 'GamesBehind': 5.5, 'ConferenceRecord': '9-9', 'DivisionRecord': '5-1', 'HomeRecord': '8-8', 'RoadRecord': '5-7', 'OverTimeRecord': '1-1', 'Last10Record': '6-4', 'Streak': 'L 1'}  ... displaying 10 of 30 total bound parameter sets ...  {'ï»¿Franchise': 'Oklahoma City Thunder', 'Conference': 'Western', 'Win': 11, 'Lost': 17, 'WinPercentage': 0.39299999999999996, 'GamesBehind': 12.5, 'ConferenceRecord': '5-14', 'DivisionRecord': '2-5', 'HomeRecord': '4-9', 'RoadRecord': '7-8', 'OverTimeRecord': '1-2', 'Last10Record': '3-7', 'Streak': 'L 2'}, {'ï»¿Franchise': 'Minnesota Timberwolves', 'Conference': 'Western', 'Win': 7, 'Lost': 22, 'WinPercentage': 0.24100000000000002, 'GamesBehind': 17.0, 'ConferenceRecord': '4-14', 'DivisionRecord': '2-4', 'HomeRecord': '4-10', 'RoadRecord': '3-12', 'OverTimeRecord': '0-2', 'Last10Record': '2-8', 'Streak': 'L 2'})]
(Background on this error at: http://sqlalche.me/e/13/f405)

In [61]:
# Importing dataframe from SQL (test)

from_sql_df2 = pd.read_sql("nba_season_stats", con = conn)

In [62]:
# Reading the dataframe
from_sql_df2.head(10)

Unnamed: 0,ï»¿franchise,conference,win,lost,winpercentage,gamesbehind,conferencerecord,divisionrecord,homerecord,roadrecord,overtimerecord,last10record,streak
0,Philadelphia 76ers,Eastern,19,10,0.655,0.0,15-4,5-1,12-2,7-8,1-1,6-4,W 1
1,Brooklyn Nets,Eastern,19,12,0.613,1.0,10-9,3-2,11-5,8-7,1-2,6-4,W 5
2,Milwaukee Bucks,Eastern,16,13,0.552,3.0,12-7,8-1,9-4,7-9,0-0,5-5,L 5
3,Indiana Pacers,Eastern,15,14,0.517,4.0,9-8,3-2,7-9,8-5,3-1,4-6,W 1
4,Boston Celtics,Eastern,14,14,0.5,4.5,10-9,2-4,7-5,7-9,0-0,4-6,L 1
5,Toronto Raptors,Eastern,14,15,0.483,5.0,11-7,2-3,6-6,8-9,0-0,7-3,W 2
6,New York Knicks,Eastern,14,16,0.467,5.5,10-10,1-3,7-6,7-10,0-0,5-5,L 1
7,Charlotte Hornets,Eastern,13,15,0.464,5.5,9-9,5-1,8-8,5-7,1-1,6-4,L 1
8,Chicago Bulls,Eastern,12,15,0.444,6.0,7-7,2-2,5-9,7-6,1-1,5-5,W 2
9,Atlanta Hawks,Eastern,12,16,0.429,6.5,8-9,1-2,6-9,6-7,1-1,3-7,W 1


In [21]:
# Importing dataframe from SQL (test)

from_sql_df = pd.read_sql("Team_abbreviations", con = conn)

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df2 = df.copy()
df2['Team'] = le.fit_transform(df2['Team'])
df2.head(10)

In [None]:
df2 = pd.get_dummies(df2, columns=["player"])
df2.head()

### Creating a Support-vector machine

In [None]:
y = df2["RK"]
X = df2.drop(columns="RK")

### Split the dataset into Training and Testing sets:

In [1]:
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X,
#    y, random_state=1, stratify=y)

X_train, X_test, y_train, y_test = train_test_split(X,
    y, test_size=0.33, random_state=42)

NameError: name 'X' is not defined

In [None]:
from sklearn.svm import SVC
model = SVC(kernel='linear')

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
results = pd.DataFrame({
   "Prediction": y_pred,
   "Actual": y_test
}).reset_index(drop=True)
results.head()

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
# Define the features set.
X = df2.copy()
X = X.drop("RK", axis=1)
X.head()

In [None]:
# Define the target set.
y = df2["RK"].values
y[:5]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)

In [None]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Splitting into Train and Test sets into an 80/20 split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [None]:
# Determine the shape of our training and testing sets.
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

In [None]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

In [None]:
predictions

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

In [None]:
# Random Forest
# Define the features set.
X = df2.copy()
X = X.drop("RK", axis=1)
X.head()

In [None]:
# Define the target set.
y = df2["RK"].ravel()
y[:5]

In [None]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

In [None]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [None]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df