In [3]:
# Initial imports
import pandas as pd
import numpy as np

In [4]:
# Load the nba_hof_rookies dataset.
file_path = "../SourceData/nba_hof_rookies.csv"
rookies_df = pd.read_csv(file_path)
rookies_df.head()

Unnamed: 0,Name,Hall of Fame Class,Year Drafted,GP,MIN,PTS,FGM,FGA,FG%,3P Made,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF
0,A.C. Green,,1985,82,18.8,6.4,2.5,4.7,53.9,0.0,...,2.0,61.1,2.0,2.7,4.6,0.7,0.6,0.6,1.2,8.7
1,A.J. English,,1990,70,20.6,8.8,3.6,8.2,43.9,0.0,...,2.2,70.7,0.9,1.2,2.1,2.5,0.4,0.2,1.6,7.1
2,A.J. Price,,2009,56,15.4,7.3,2.6,6.3,41.0,1.1,...,1.3,80.0,0.2,1.4,1.6,1.9,0.6,0.1,1.1,6.4
3,Aaron Brooks,,2007,51,11.9,5.2,1.8,4.4,41.3,0.7,...,1.0,85.7,0.3,0.8,1.1,1.7,0.3,0.1,0.9,4.7
4,Aaron Gordon,,2014,47,17.0,5.2,2.0,4.4,44.7,0.3,...,1.3,72.1,1.0,2.6,3.6,0.7,0.5,0.5,0.8,6.8


In [5]:
# Check datatypes
rookies_df.dtypes

Name                   object
Hall of Fame Class    float64
Year Drafted            int64
GP                      int64
MIN                   float64
PTS                   float64
FGM                   float64
FGA                   float64
FG%                   float64
3P Made               float64
3PA                   float64
3P%                    object
FTM                   float64
FTA                   float64
FT%                   float64
OREB                  float64
DREB                  float64
REB                   float64
AST                   float64
STL                   float64
BLK                   float64
TOV                   float64
EFF                   float64
dtype: object

In [6]:
# Remove Spaces from Column Names
rookies_df.rename(
    columns={"Hall of Fame Class":"HallofFameClass","Year Drafted":"YearDrafted","3P Made":"3P_Made"}
          ,inplace=True)

In [7]:
# Clean up 3P% Column; Replace '-' with '0'
rookies_df['3P%'] = rookies_df['3P%'].replace({'-':'0'})
rookies_df.dtypes

Name                object
HallofFameClass    float64
YearDrafted          int64
GP                   int64
MIN                float64
PTS                float64
FGM                float64
FGA                float64
FG%                float64
3P_Made            float64
3PA                float64
3P%                 object
FTM                float64
FTA                float64
FT%                float64
OREB               float64
DREB               float64
REB                float64
AST                float64
STL                float64
BLK                float64
TOV                float64
EFF                float64
dtype: object

In [8]:
# Convert 3P% column to Numeric
#df["2nd"] = pd.to_numeric(df["2nd"])
rookies_df['3P%'] = pd.to_numeric(rookies_df['3P%'])
rookies_df.dtypes

Name                object
HallofFameClass    float64
YearDrafted          int64
GP                   int64
MIN                float64
PTS                float64
FGM                float64
FGA                float64
FG%                float64
3P_Made            float64
3PA                float64
3P%                float64
FTM                float64
FTA                float64
FT%                float64
OREB               float64
DREB               float64
REB                float64
AST                float64
STL                float64
BLK                float64
TOV                float64
EFF                float64
dtype: object

In [9]:
# Copy Hall of Fame Class column to preserve induction year and allow for normalization
rookies_df['HallOfFameStatus'] = rookies_df ['HallofFameClass']
rookies_df.head()

Unnamed: 0,Name,HallofFameClass,YearDrafted,GP,MIN,PTS,FGM,FGA,FG%,3P_Made,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,HallOfFameStatus
0,A.C. Green,,1985,82,18.8,6.4,2.5,4.7,53.9,0.0,...,61.1,2.0,2.7,4.6,0.7,0.6,0.6,1.2,8.7,
1,A.J. English,,1990,70,20.6,8.8,3.6,8.2,43.9,0.0,...,70.7,0.9,1.2,2.1,2.5,0.4,0.2,1.6,7.1,
2,A.J. Price,,2009,56,15.4,7.3,2.6,6.3,41.0,1.1,...,80.0,0.2,1.4,1.6,1.9,0.6,0.1,1.1,6.4,
3,Aaron Brooks,,2007,51,11.9,5.2,1.8,4.4,41.3,0.7,...,85.7,0.3,0.8,1.1,1.7,0.3,0.1,0.9,4.7,
4,Aaron Gordon,,2014,47,17.0,5.2,2.0,4.4,44.7,0.3,...,72.1,1.0,2.6,3.6,0.7,0.5,0.5,0.8,6.8,


In [10]:
# Normalize Hall of Fame Class Column; Replace 'NaN' with 'Not Inducted'

#replace NaN values in 'points' and 'rebounds' columns with 'none'
#df[['points', 'rebounds']] = df[['points', 'rebounds']].fillna('none')

rookies_df['HallOfFameStatus'] = rookies_df['HallOfFameStatus'].fillna('Not Inducted')

rookies_df['HallOfFameStatus'] = np.where(rookies_df['HallOfFameStatus'] == "Not Inducted", "Not Inducted", "Hall of Fame Member")
rookies_df.head(50)

Unnamed: 0,Name,HallofFameClass,YearDrafted,GP,MIN,PTS,FGM,FGA,FG%,3P_Made,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,HallOfFameStatus
0,A.C. Green,,1985,82,18.8,6.4,2.5,4.7,53.9,0.0,...,61.1,2.0,2.7,4.6,0.7,0.6,0.6,1.2,8.7,Not Inducted
1,A.J. English,,1990,70,20.6,8.8,3.6,8.2,43.9,0.0,...,70.7,0.9,1.2,2.1,2.5,0.4,0.2,1.6,7.1,Not Inducted
2,A.J. Price,,2009,56,15.4,7.3,2.6,6.3,41.0,1.1,...,80.0,0.2,1.4,1.6,1.9,0.6,0.1,1.1,6.4,Not Inducted
3,Aaron Brooks,,2007,51,11.9,5.2,1.8,4.4,41.3,0.7,...,85.7,0.3,0.8,1.1,1.7,0.3,0.1,0.9,4.7,Not Inducted
4,Aaron Gordon,,2014,47,17.0,5.2,2.0,4.4,44.7,0.3,...,72.1,1.0,2.6,3.6,0.7,0.5,0.5,0.8,6.8,Not Inducted
5,Aaron Gray,,2007,61,10.0,4.3,1.7,3.3,50.5,0.0,...,56.6,1.1,1.6,2.8,0.7,0.3,0.3,0.9,5.0,Not Inducted
6,Aaron McKie,,1994,45,18.4,6.5,2.6,5.8,44.4,0.2,...,68.5,0.8,2.1,2.9,2.0,0.8,0.4,0.9,7.9,Not Inducted
7,Acie Earl,,1993,74,15.5,5.5,2.0,5.0,40.6,0.0,...,67.5,1.1,2.2,3.3,0.2,0.3,0.7,1.0,5.4,Not Inducted
8,Acie Law,,2007,56,15.4,4.2,1.7,4.2,40.1,0.1,...,79.2,0.2,0.8,1.0,2.0,0.5,0.0,1.0,4.0,Not Inducted
9,Adam Harrington,,2002,19,5.8,1.6,0.6,1.9,29.7,0.3,...,75.0,0.1,0.4,0.4,0.6,0.1,0.1,0.1,1.3,Not Inducted


In [11]:
# Create Unique Identifier Field - Concatination of Name and Draft Year
# Converted to All Upper Case, All Punctuation and Spaces Stripped
rookies_df['GUID'] = rookies_df['Name'] + rookies_df['YearDrafted'].astype(str)

# Strip out Single Quote/Apostrophe Characters, Commas, and Periods
rookies_df['GUID'] = rookies_df['GUID'].str.replace(r'[^\w\s]+', '')

# Strip out spaces
rookies_df['GUID'].replace('\s+', '', regex=True,inplace=True)


#rookies_df['GUID'] = rookies_df['GUID'].astype(str)
#rookies_df['GUID'].to_string()

#Convert to Upper Case
rookies_df['GUID'] = rookies_df['GUID'].str.upper()

rookies_df.head(100)

  


Unnamed: 0,Name,HallofFameClass,YearDrafted,GP,MIN,PTS,FGM,FGA,FG%,3P_Made,...,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,HallOfFameStatus,GUID
0,A.C. Green,,1985,82,18.8,6.4,2.5,4.7,53.9,0.0,...,2.0,2.7,4.6,0.7,0.6,0.6,1.2,8.7,Not Inducted,ACGREEN1985
1,A.J. English,,1990,70,20.6,8.8,3.6,8.2,43.9,0.0,...,0.9,1.2,2.1,2.5,0.4,0.2,1.6,7.1,Not Inducted,AJENGLISH1990
2,A.J. Price,,2009,56,15.4,7.3,2.6,6.3,41.0,1.1,...,0.2,1.4,1.6,1.9,0.6,0.1,1.1,6.4,Not Inducted,AJPRICE2009
3,Aaron Brooks,,2007,51,11.9,5.2,1.8,4.4,41.3,0.7,...,0.3,0.8,1.1,1.7,0.3,0.1,0.9,4.7,Not Inducted,AARONBROOKS2007
4,Aaron Gordon,,2014,47,17.0,5.2,2.0,4.4,44.7,0.3,...,1.0,2.6,3.6,0.7,0.5,0.5,0.8,6.8,Not Inducted,AARONGORDON2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Austin Daye,,2009,69,13.3,5.1,2.0,4.3,46.4,0.4,...,0.5,2.0,2.5,0.5,0.4,0.4,0.7,5.7,Not Inducted,AUSTINDAYE2009
96,Austin Rivers,,2012,61,23.2,6.2,2.4,6.4,37.2,0.5,...,0.3,1.5,1.8,2.1,0.4,0.1,1.2,4.6,Not Inducted,AUSTINRIVERS2012
97,Avery Johnson,,1988,43,6.8,1.6,0.7,1.9,34.9,0.0,...,0.3,0.3,0.6,1.7,0.5,0.1,0.4,2.6,Not Inducted,AVERYJOHNSON1988
98,B.J. Armstrong,,1989,81,15.9,5.6,2.3,4.8,48.5,0.0,...,0.2,1.0,1.3,2.5,0.6,0.1,1.0,6.3,Not Inducted,BJARMSTRONG1989


In [12]:
# Set new index and drop existing index
rookies_df.set_index('GUID')

Unnamed: 0_level_0,Name,HallofFameClass,YearDrafted,GP,MIN,PTS,FGM,FGA,FG%,3P_Made,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,HallOfFameStatus
GUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACGREEN1985,A.C. Green,,1985,82,18.8,6.4,2.5,4.7,53.9,0.0,...,61.1,2.0,2.7,4.6,0.7,0.6,0.6,1.2,8.7,Not Inducted
AJENGLISH1990,A.J. English,,1990,70,20.6,8.8,3.6,8.2,43.9,0.0,...,70.7,0.9,1.2,2.1,2.5,0.4,0.2,1.6,7.1,Not Inducted
AJPRICE2009,A.J. Price,,2009,56,15.4,7.3,2.6,6.3,41.0,1.1,...,80.0,0.2,1.4,1.6,1.9,0.6,0.1,1.1,6.4,Not Inducted
AARONBROOKS2007,Aaron Brooks,,2007,51,11.9,5.2,1.8,4.4,41.3,0.7,...,85.7,0.3,0.8,1.1,1.7,0.3,0.1,0.9,4.7,Not Inducted
AARONGORDON2014,Aaron Gordon,,2014,47,17.0,5.2,2.0,4.4,44.7,0.3,...,72.1,1.0,2.6,3.6,0.7,0.5,0.5,0.8,6.8,Not Inducted
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZARKOCABARKAPA2003,Zarko Cabarkapa,,2003,49,11.6,4.1,1.7,4.0,41.1,0.1,...,66.0,0.5,1.5,2.0,0.8,0.2,0.3,1.1,3.6,Not Inducted
ZAZAPACHULIA2003,Zaza Pachulia,,2003,59,11.3,3.3,1.2,3.0,38.9,0.0,...,64.4,1.2,1.8,2.9,0.2,0.4,0.2,0.6,4.1,Not Inducted
ZELJKOREBRACA2001,Zeljko Rebraca,,2001,74,15.9,6.9,2.6,5.1,50.5,0.0,...,77.1,1.1,2.8,3.9,0.5,0.4,1.0,1.1,8.6,Not Inducted
ZORANPLANINIC2003,Zoran Planinic,,2003,49,9.7,3.1,1.1,2.6,41.1,0.2,...,63.3,0.3,0.8,1.1,1.4,0.3,0.1,0.7,3.2,Not Inducted


In [13]:
# Identify incomplete rows
rookies_df.count()

Name                1537
HallofFameClass       42
YearDrafted         1537
GP                  1537
MIN                 1537
PTS                 1537
FGM                 1537
FGA                 1537
FG%                 1537
3P_Made             1537
3PA                 1537
3P%                 1537
FTM                 1537
FTA                 1537
FT%                 1537
OREB                1537
DREB                1537
REB                 1537
AST                 1537
STL                 1537
BLK                 1537
TOV                 1537
EFF                 1537
HallOfFameStatus    1537
GUID                1537
dtype: int64

In [14]:
# Check datatypes
rookies_df.dtypes

Name                 object
HallofFameClass     float64
YearDrafted           int64
GP                    int64
MIN                 float64
PTS                 float64
FGM                 float64
FGA                 float64
FG%                 float64
3P_Made             float64
3PA                 float64
3P%                 float64
FTM                 float64
FTA                 float64
FT%                 float64
OREB                float64
DREB                float64
REB                 float64
AST                 float64
STL                 float64
BLK                 float64
TOV                 float64
EFF                 float64
HallOfFameStatus     object
GUID                 object
dtype: object

In [15]:
# Export to CSV
#export_path = "Resources/rookies_df.csv"
#rookies_df.to_csv(export_path)

In [16]:
# import the necessary packages
import psycopg2
from sqlalchemy import create_engine
  
# Create the engine to connect to the database 
engine = create_engine(
    '')

# Write data into the table in database
rookies_df.to_sql('new_rookies', engine)