In [32]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statistics 
import sklearn.metrics as metrics
import glob

from functools   import reduce
from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2

from sklearn.linear_model          import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier
from sklearn.neighbors             import KNeighborsClassifier
from sklearn.svm                   import SVC
from sklearn.tree                  import DecisionTreeClassifier
from sklearn.ensemble              import RandomForestClassifier,VotingClassifier,GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from imblearn.over_sampling import SMOTE

import seaborn as sns
%matplotlib inline



In [24]:
season_stats = pd.read_csv('nba-players-stats/Seasons_Stats.csv', index_col=0)

In [33]:
def hof_column_transformer(df):
    hof_container = df.copy()
    hof_container['Player'] = hof_container['Player'].str.replace("*", "_hof")
    hof_container = hof_container[hof_container['Player'].str.contains("_hof")]['Player'].str.replace("_hof", "*").unique()
    df['isHoF'] = ''
    df['isHoF'] = np.where(df['Player'].isin(hof_container), 1,0)
    df['Player'] = df['Player'].str.replace("*", "")
    return df

def div_by_game_and_rename_column(df):
    df[countable_columns]  = df[countable_columns] .div(df['G'].values,axis=0)
    df.rename(columns={'MP':'MP/G','2P':'2P/G','3P':'3P/G','FT':'FT/G'
                                         ,'ORB':'ORB/G','DRB':'DRB/G','AST':'AST/G','STL':'STL/G'
                                         ,'BLK':'BLK/G','TOV':'TOV/G','PTS':'PTS/G'}, inplace=True)
    df.drop(['G'], axis=1, inplace=True)
    return df

def group_by_career_average_and_normalize_percentiles(df):
    df = df.groupby('Player').mean()
    df = df.round(2).reset_index()
    df[['2P%', '3P%', 'FT%']] = df[['2P%', '3P%', 'FT%']] * 100
    return df

def filter_by_30_games(df):
    df.query('G >= 30')
    return df

In [34]:
countable_columns = ['MP', '2P','3P', 'FT','ORB','DRB','AST','STL','BLK','TOV','PTS']
independent_features = ['Player', 'Pos', 'G','MP', '2P', '2P%', '3P', '3P%', 'FT', 'FT%', 'USG%', 'ORB', 'DRB', 'AST', 'AST%', 'STL', 'STL%', 'BLK', 'BLK%', 'TOV', 'PTS', 'isHoF']

In [35]:
## Data Filter / Data Clean / Hall of Fame column

## Players past 1980 and minimum number of games > 30 are used for evaluation
season_stats_filtered = season_stats.copy()
season_stats_filtered = season_stats.query('Year >= 1980')
season_stats_filtered = filter_by_30_games(season_stats_filtered)

## Removed columns with no predictive power
season_stats_filtered.drop(['GS','blanl','blank2','blank2','Age', 'Tm'], axis=1, inplace=True)

## All the null values are handled - these values are null because there is 0 sample for the percentiles - thus 0.
season_stats_filtered.fillna(0, inplace=True)
season_stats_filtered = hof_column_transformer(season_stats_filtered)

season_stats_filtered = season_stats_filtered[independent_features]
season_stats_filtered = div_by_game_and_rename_column(season_stats_filtered)

season_stats_filtered = group_by_career_average_and_normalize_percentiles(season_stats_filtered)

In [41]:
norm_tables = []
adv_tables = []
filtered_tables = []

for filepath in glob.iglob('recent-nba-players-stats/*.csv'):
    df = pd.read_csv(filepath, index_col=0)  
    df = filter_by_30_games(df)
    if '_adv' in filepath:
        df = df[['Player', 'Pos', 'USG%', 'AST%', 'STL%', 'BLK%']]
        df.fillna(0, inplace=True)
        df = df.groupby('Player').mean()
        df = df.round(2).reset_index()
        adv_tables.append(df)       
    else:
        df = df[['Player', 'Pos', 'G', 'MP', '3P', '3P%', '2P','2P%','FT', 'FT%', 
                 'ORB', 'DRB',  'AST', 'STL', 'BLK', 'TOV', 'PTS']]
        df.fillna(0, inplace=True)
        df = group_by_career_average_and_normalize_percentiles(df)
        norm_tables.append(df)      

for n_df, adv_df in zip(norm_tables, adv_tables):
    merged_df = pd.merge(n_df, adv_df, on="Player")  
    merged_df = div_by_game_and_rename_column(merged_df)
    merged_df = merged_df.round(2)
    merged_df['Player'] = merged_df['Player'].str.split(r"\\|=", expand=True)
    merged_df = merged_df.reindex(columns=season_stats_filtered.columns)
    filtered_tables.append(merged_df)

joined_df = pd.concat([filtered_tables[0],filtered_tables[1],filtered_tables[2],
                       filtered_tables[3],filtered_tables[4]], axis=0)
joined_df = joined_df.groupby('Player').mean().round(2).reset_index()

Unnamed: 0,Player,MP/G,2P/G,2P%,3P/G,3P%,FT/G,FT%,USG%,ORB/G,DRB/G,AST/G,AST%,STL/G,STL%,BLK/G,BLK%,TOV/G,PTS/G,isHoF
0,A.C. Green,28.26,3.34,50.0,0.08,15.0,2.29,71.0,14.41,2.57,4.76,1.05,5.22,0.80,1.44,0.39,0.85,1.08,9.23,0.0
1,A.J. Bramlett,7.62,0.50,19.0,0.00,0.0,0.00,0.0,17.10,1.50,1.25,0.00,0.00,0.12,0.80,0.00,0.00,0.38,1.00,0.0
2,A.J. English,20.58,3.99,45.0,0.06,14.0,1.71,77.0,23.70,0.93,1.16,2.15,15.85,0.38,0.90,0.16,0.45,1.36,9.87,0.0
3,A.J. Guyton,12.36,0.83,25.0,0.61,26.0,0.31,55.0,20.77,0.19,0.50,1.58,23.47,0.33,2.47,0.10,0.50,0.68,3.81,0.0
4,A.J. Hammons,7.41,0.55,38.0,0.23,50.0,0.41,45.0,17.60,0.36,1.27,0.18,3.80,0.05,0.30,0.59,7.20,0.45,2.18,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2828,Zeljko Rebraca,14.48,2.25,51.0,0.00,0.0,1.11,75.0,18.84,0.93,1.94,0.38,4.70,0.20,0.80,0.64,3.26,0.92,5.61,0.0
2829,Zendon Hamilton,7.19,0.78,37.0,0.00,0.0,1.35,68.0,24.72,0.76,1.29,0.11,1.62,0.35,4.19,0.08,0.48,0.56,2.91,0.0
2830,Zoran Dragic,4.35,0.48,54.0,0.16,18.0,0.21,59.0,25.17,0.32,0.18,0.29,11.37,0.11,1.03,0.00,0.00,0.27,1.65,0.0
2831,Zoran Planinic,10.76,1.10,45.0,0.25,30.0,0.89,68.0,20.47,0.33,1.02,1.12,18.23,0.40,1.97,0.04,0.33,0.82,3.86,0.0
