# Feature Engineering

## 01 Missing Values

Dataset source: https://www.kaggle.com/pablote/nba-enhanced-stats

In [None]:
import pandas as pd
import numpy as np
# Load the player box-score CSV into a DataFrame (presumably the file
# downloaded from the Kaggle dataset linked above; it must sit next to the notebook).
df = pd.read_csv('2012-18_playerBoxScore.csv', encoding='utf_8')

In [None]:
# Let pandas render up to 500 columns before truncating wide frames.
pd.set_option('display.max_columns', 500)

In [None]:
def nas_sorted(df):
    """Count missing values per column, ordered from most to fewest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is that column's null count,
        sorted in descending order.
    """
    null_counts = df.isnull().sum()
    return null_counts.sort_values(ascending=False)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Preview the first rows transposed so every column name is readable down the
# side; transposing the whole frame would turn every row into a column.
df.head().T

In [None]:
# Per-column missing-value counts, largest first.
sorted_nas = nas_sorted(df)

In [None]:
# The five columns with the most missing values.
sorted_nas[:5]

No missing values were found in this dataset.

## Find outliers

In [None]:
# Strip leading whitespace from the column names (trailing whitespace is left untouched).
df.columns = df.columns.str.lstrip()

In [None]:
# Integer edges -3, -2, ..., 3: the one-standard-deviation bins used for the z-score histogram.
np.arange(-3, 4, 1)

In [None]:
from scipy import stats

def percentiles(column):
    """Histogram a column's z-scores in one-standard-deviation-wide bins.

    The bins are [-3, -2), [-2, -1), ..., [1, 2), [2, 3]. Z-scores that fall
    outside [-3, 3] are not counted, and the densities are normalised over the
    values that do land in a bin.

    Parameters
    ----------
    column : array-like
        Numeric values to standardise with ``scipy.stats.zscore``.

    Returns
    -------
    numpy.ndarray
        Array of shape (2, 6): row 0 holds the density of each bin, row 1 the
        upper edge of each bin (-2, -1, 0, 1, 2, 3).
    """
    sigma_edges = np.arange(-3, 4, 1)
    standardized = stats.zscore(column)
    densities, edges = np.histogram(standardized, bins=sigma_edges, density=True)
    upper_edges = edges[1:]
    return np.stack((densities, upper_edges))

In [None]:
# Density of opptDayOff z-scores in each one-sigma bin (row 0) with the bins' upper edges (row 1).
percentiles(df['opptDayOff'])

In [None]:
def too_many_outliers(column, threshold = .05):
    """Report a column whose outermost z-score bins are heavier than ``threshold``.

    The two bins inspected are the first and last returned by ``percentiles``:
    z-scores in [-3, -2) and in [2, 3]. Under a normal distribution each of
    them is expected to hold roughly .021 of the density.

    Parameters
    ----------
    column : pandas.Series
        Numeric column; its ``name`` is included in the report.
    threshold : float, default .05
        Density above which a tail bin counts as having too many outliers.

    Returns
    -------
    numpy.ndarray or None
        ``np.hstack((column.name, lower_density, upper_density))`` when either
        tail exceeds ``threshold`` (hstack coerces the three values to one
        common dtype), otherwise ``None``.
    """
    # Compute the histogram once and read both tails from it.
    densities = percentiles(column)[0]
    z_less_neg_two, z_gt_two = densities[0], densities[-1]
    if z_less_neg_two > threshold or z_gt_two > threshold:
        return np.hstack((column.name, z_less_neg_two, z_gt_two))
    # Explicit None (callers filter on `is not None`); NaN densities also end
    # up here because comparisons with NaN are False.
    return None

In [None]:
def outlier_columns(df, threshold = .05):
    """Collect the outlier reports for every float64/int64 column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``float64`` and ``int64`` columns are checked.
    threshold : float, default .05
        Forwarded to ``too_many_outliers`` for every column.

    Returns
    -------
    numpy.ndarray
        One row per flagged column, as produced by ``too_many_outliers``;
        an empty array when no column is flagged.
    """
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    # Filter out the None results *before* building the array: an array built
    # from a mix of report arrays and None is ragged, which recent NumPy
    # versions reject with a ValueError.
    reports = (too_many_outliers(df[column], threshold) for column in numeric_columns)
    return np.array([report for report in reports if report is not None])

In [None]:
def select_outliers(column, upper_tail = True):
    """Return the entries of ``column`` lying more than two standard deviations from the mean.

    Parameters
    ----------
    column : pandas.Series or numpy.ndarray
        Numeric values; standardised with ``scipy.stats.zscore``.
    upper_tail : bool, default True
        When true keep values with z-score > 2, otherwise values with
        z-score < -2.

    Returns
    -------
    Same type as ``column``
        The subset selected by the boolean z-score mask.
    """
    z = stats.zscore(column)
    mask = z > 2 if upper_tail else z < -2
    return column[mask]

In [None]:
# Silence the "invalid value encountered in divide" RuntimeWarning for this
# call only, instead of changing NumPy's floating-point error handling for the
# rest of the session.
with np.errstate(divide='ignore', invalid='ignore'):
    flagged_columns = outlier_columns(df)
flagged_columns

In [None]:
# How often each playPF value with a z-score above 2 occurs.
select_outliers(df['playPF']).value_counts()

In [None]:
# Relative frequency of every playPF value.
df['playPF'].value_counts(normalize=True)

In [None]:
# Inspect the raw playPF column.
df['playPF']

In [None]:
# Label the histogram so the figure stands on its own; the trailing semicolon
# suppresses the return-value repr in the cell output.
ax = df['play3P%'].hist(bins=100)
ax.set(title='Distribution of play3P%', xlabel='play3P%', ylabel='Count');

'playPF' is the number of personal fouls committed by the player, and its distribution looks reasonable.

'play3P%' is the player's three-point percentage, and its distribution is also reasonable, since many players do not shoot three-pointers.