### Variable Aggregation and Feature Engineering 

In [None]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from datetime import datetime

In [None]:
# Raise the pandas display limits so wide and long frames are shown with
# up to 30 columns and 100 rows in notebook output.
pd.options.display.max_columns = 30
pd.options.display.max_rows = 100

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the Cython extension; the %%cython cell magic is used further down.
%load_ext Cython

In [None]:
# Project root. When the notebook is run from inside the `notebooks` folder,
# step up one level; only a trailing `notebooks` component is removed, so a
# path that merely contains that name elsewhere is left untouched.
cwd = os.getcwd()
if os.path.basename(cwd) == 'notebooks':
    cwd = os.path.dirname(cwd)

# Folder holding the input and output dataframes.
data_dir = os.path.join(cwd, 'data')

# Season label used in the input and output file names.
season = '2014-15'

# Recency weighting scheme for the feature window: 'linear', 'quad' or 'sqrt'.
weighting = 'quad'

In [None]:
# Load the cleaned dataframe of the configured season.
df = pd.read_csv(
    filepath_or_buffer=os.path.join(
        data_dir, 'Dataframes', 'clean', 'df_{}.csv'.format(season)
    )
)

### Dataframe for the Baseline Model
For each statistic, the mean over the player's past N-1 games is used to predict performance in the N-th game.

In [None]:
# Baseline table: one row per game for which the player has at least one
# earlier game. 'Date', 'Name' and 'FPTS' are copied from the game itself;
# every other column is the plain mean of that statistic over all of the
# player's earlier rows.
# NOTE(review): "earlier" means earlier in row order within `df`; this
# presumably matches chronological order -- confirm against the cleaning step.
baseline = {'Date':[], 'Name':[], 'FPTS':[], 'PTS':[], '3P':[], 'AST':[],
            'TRB':[], 'STL':[], 'BLK':[], 'TOV':[], 'DD':[], 'TD':[]}

# Split the frame by player once, instead of re-filtering the whole frame
# for every row. Row order inside each player's frame is preserved.
games_by_player = {player: games.reset_index(drop=True)
                   for player, games in df.groupby('Name', sort=False)}

for i in tqdm(range(df.shape[0])):

    date = df.loc[i, 'Date']
    name = df.loc[i, 'Name']

    # Position of the current game within the player's own game log
    df_name = games_by_player[name]
    index = df_name.loc[df_name['Date'] == date].index[0]

    # Skip the player's first game: there are no past statistics to average
    if index < 1:
        continue

    df_past = df_name[0:index]

    for key in baseline:
        if key in ['Date', 'Name', 'FPTS']:
            # Identifiers and the target come from the current game
            baseline[key].append(df_name.loc[index, key])
        else:
            # Predictors are the mean over all previous games
            baseline[key].append(df_past[key].mean())

In [None]:
# Assemble the baseline table with a fixed column order.
df_baseline = pd.DataFrame(baseline)
df_baseline = df_baseline.loc[:,['Date','Name','FPTS','PTS','3P','AST','TRB','STL','BLK','TOV','DD','TD']]

# Make sure the output folder exists; to_csv does not create missing folders.
baseline_dir = os.path.join(data_dir, 'Dataframes', 'modelling', 'baseline')
os.makedirs(baseline_dir, exist_ok=True)

# One file per season, written without the index column.
df_baseline.to_csv(os.path.join(baseline_dir, '{}.csv'.format(season)),
                   index=False)

### Additional Statistics and Recency Effect

In [None]:
def draw_weights():
    """Plot the normalised recency weights of the three weighting schemes.

    For each scheme ('sqrt', 'linear', 'quad') the weight of the n-th game in
    a 10-game window is n ** exponent (0.5, 1.0 and 2.0 respectively), divided
    by the sum over the window so that each curve sums to one. The curves are
    drawn on the current matplotlib figure and shown immediately.

    Returns:
        None
    """
    # Exponent applied to the game number for each scheme
    exponents = {'sqrt': 0.5, 'linear': 1.0, 'quad': 2.0}
    games = np.arange(1, 11)

    try:
        sns.set_style("darkgrid")
    except NameError:
        # seaborn (`sns`) is not imported by this notebook; fall back to a
        # plain matplotlib grid rather than failing before anything is drawn.
        plt.grid(True)

    for scheme, exponent in exponents.items():
        weights = np.power(games, exponent)
        # Label each curve so the schemes can be told apart in the legend
        plt.plot(weights / weights.sum(), label=scheme)

    plt.xlabel('n-th game', fontsize=12)
    plt.ylabel('Weight', fontsize=12)
    # Curves are plotted against positions 0-9; relabel them as games 1-10
    plt.xticks(list(range(0, 10)), list(range(1, 11)))
    plt.legend()
    plt.show()

In [None]:
# Show the weight curves of the sqrt, linear and quad weighting schemes.
draw_weights()

In [None]:
%%cython
def calculate_weighted_mean(list weights, values):
    """Return the weighted mean of `values` under `weights`.

    `values` is read by index at positions 0 .. len(weights) - 1, so it must
    support integer indexing over that range. The result is the sum of the
    element-wise products divided by the sum of the weights.
    """
    products = []
    for position in range(len(weights)):
        products.append(weights[position] * values[position])

    total_weight = sum(weights)

    return sum(products) / total_weight

In [None]:
# Feature table: one row per game for which the player has at least ten
# earlier games. 'Date', 'Name' and 'FPTS' are copied from the game itself,
# 'Rest' is the number of days since the player's previous game, 'FPTS_std'
# is the standard deviation of FPTS over the previous ten games, and every
# other column is a recency-weighted mean over those ten games.
data = {'Date':[], 'Name':[], 'FPTS':[], 
        #New feature
        'Rest':[], 'Value':[], 'FPTS_std':[],
        #Basic 9 Variables
        'PTS':[], '3P':[], 'AST':[], 'TRB':[], 'STL':[], 'BLK':[], 'TOV':[], 'DD':[], 'TD':[],
        #Additional Variables from Basketball-Reference.com with SD
        'MP':[], 'FT':[], 'FTA':[], 'FGA':[], '3PA':[], 'DRB':[], 'ORB':[],
        #Advanced Statistics from Basketball-Reference.com
        'USG_perc':[], 'DRtg':[], 'ORtg':[], 'AST_perc':[], 'DRB_perc':[], 'ORB_perc':[],
        'BLK_perc':[], 'TOV_perc':[], 'STL_perc':[], 'eFG_perc':[], 'FG_perc':[], '3P_perc':[], 'FT_perc':[]
       }

# Weights grow towards the most recent game. They do not depend on the row,
# so they are built once; an unknown scheme is rejected up front instead of
# leaving `weights` undefined inside the loop.
weight_schemes = {
    'linear': [i for i in range(1, 11)],
    'quad': [i**2 for i in range(1, 11)],
    'sqrt': [i**0.5 for i in range(1, 11)],
}
if weighting not in weight_schemes:
    raise ValueError("Unknown weighting {!r}; expected one of {}".format(
        weighting, sorted(weight_schemes)))
weights = weight_schemes[weighting]

# Split the frame by player once, instead of re-filtering the whole frame
# for every row. Row order inside each player's frame is preserved.
games_by_player = {player: games.reset_index(drop=True)
                   for player, games in df.groupby('Name', sort=False)}

for i in tqdm(range(df.shape[0])):
    date = df.loc[i, 'Date']
    name = df.loc[i, 'Name']

    # Position of the current game within the player's own game log
    df_name = games_by_player[name]
    index = df_name.loc[df_name['Date'] == date].index[0]

    # A full window of ten previous games is required
    if index < 10:
        continue

    # Re-index the window to 0..9 so it lines up with the ten weights
    df_past = df_name[index-10:index].reset_index(drop=True)

    # Number of days between the current game and the previous game;
    # dates are parsed from their YYYYMMDD representation
    current = datetime.strptime(str(df_name.loc[index, 'Date']), '%Y%m%d')
    previous = datetime.strptime(str(df_past.loc[df_past.shape[0]-1, 'Date']), '%Y%m%d')
    rest = current - previous

    data['Rest'].append(rest.days)

    for key in data:

        if key in ['Date', 'Name', 'FPTS']:
            # Identifiers and the target come from the current game
            data[key].append(df_name.loc[index, key])
        elif key == 'FPTS_std':
            data[key].append(df_past['FPTS'].std())
        elif key != 'Rest':
            # 'Rest' was appended above; everything else is a weighted mean
            weighted_mean = calculate_weighted_mean(weights, df_past[key])
            data[key].append(weighted_mean)
                

In [None]:
# Turn the collected per-column lists into the feature dataframe.
df_features = pd.DataFrame(data=data)

### Add Starter, Listed Position and Team

In [None]:
# Bring the Salary, Team, Pos, Starter and Home columns over from the clean
# dataframe; the inner join on (Date, Name) keeps only games that have features.
df_features = df[['Date', 'Name', 'Salary', 'Team', 'Pos', 'Starter', 'Home']].merge(
    df_features, on=['Date', 'Name'], how='inner'
)

### Feature Engineering
Add Broader Positions and Roster Availability 

In [None]:
def add_positions(df):
    """Add one-hot broader-position columns 'PG', 'SG', 'F' and 'C' in place.

    Each row's 'Pos' string is mapped to exactly one broader position by
    substring match, checked in the order 'PG', 'SG', 'C'; a row matching
    none of them is classed as 'F'. The matching column is set to 1 and the
    other three to 0.

    Rows are processed in positional order, so the frame may carry any index
    (it does not have to be a default 0..n-1 range).

    Args:
        df: DataFrame with a 'Pos' column of strings; modified in place.

    Returns:
        None
    """
    def _broad_position(pos):
        # First match wins, so e.g. 'PG/SG' is a PG and 'PF/C' is a C
        for label in ('PG', 'SG', 'C'):
            if label in pos:
                return label
        return 'F'

    broad = [_broad_position(pos) for pos in df['Pos']]

    # Columns are added in the order PG, SG, F, C
    for label in ('PG', 'SG', 'F', 'C'):
        df[label] = [1 if position == label else 0 for position in broad]

In [None]:
def add_roster_info(df):
    """Add roster-size columns 'Rota_All' and 'Rota_Pos' in place.

    For every row whose position flag ('PG', 'SG', 'F' or 'C') equals 1:
      * 'Rota_All' is the number of rows sharing the row's 'Date' and 'Team';
      * 'Rota_Pos' is the number of those rows that have the same position
        flag set.
    Rows with no position flag set, or with a missing 'Date' or 'Team', keep
    0 in both columns. If a row has several flags set, the last one in the
    order PG, SG, F, C determines 'Rota_Pos'.

    The counts are gathered in a single pass over the rows rather than by
    re-filtering the frame for every (date, team) combination.

    Args:
        df: DataFrame with 'Date', 'Team', 'PG', 'SG', 'F' and 'C' columns;
            modified in place.

    Returns:
        None
    """
    from collections import Counter

    n_rows = df.shape[0]

    # (date, team) key per row; None marks rows with a missing date or team,
    # which belong to no game and therefore keep the default of 0.
    key_known = (df['Date'].notna() & df['Team'].notna()).tolist()
    games = [game if known else None
             for game, known in zip(zip(df['Date'], df['Team']), key_known)]

    # Number of rows per (date, team), regardless of position
    roster_size = Counter(game for game in games if game is not None)

    rota_all = [0] * n_rows
    rota_pos = [0] * n_rows

    for pos in ['PG', 'SG', 'F', 'C']:
        at_pos = (df[pos] == 1).tolist()

        # Number of rows per (date, team) with this position flag set
        pos_size = Counter(game for game, flagged in zip(games, at_pos)
                           if flagged and game is not None)

        for row, (game, flagged) in enumerate(zip(games, at_pos)):
            if flagged and game is not None:
                rota_all[row] = roster_size[game]
                rota_pos[row] = pos_size[game]

    df['Rota_All'] = rota_all
    df['Rota_Pos'] = rota_pos

In [None]:
# Add the PG/SG/F/C indicator columns first: add_roster_info reads them to
# count players per position.
add_positions(df_features)
# Add the Rota_All / Rota_Pos roster-size columns.
add_roster_info(df_features)

In [None]:
#Plot position distribution

try:
    # Online plotting module; it only exists in plotly < 4 (it moved to the
    # separate chart_studio package) and is not used by this cell, so a
    # newer plotly must not stop the cell from running.
    import plotly.plotly as py
except ImportError:
    py = None
import plotly.graph_objs as go
import plotly
import plotly.offline
plotly.offline.init_notebook_mode(connected=False)

# One row per player (the first row found for each name), so every player
# is counted once in the distribution.
positions = df_features.drop_duplicates(subset=['Name'])

labels = ['PG', 'SG', 'F', 'C']
# Number of players flagged with each broader position
values = [positions[label].sum() for label in labels]

trace = go.Pie(labels=labels, values=values)

layout = go.Layout(
        title = 'Distribution of Broader Positions',
        legend = {"x":0.8, 'y':1, 'borderwidth': 1},
        hovermode = 'closest',
    )

fig = go.Figure(data=[trace], layout=layout)
plotly.offline.iplot(fig)

In [None]:
# Final column order of the exported feature table, assembled group by group.
columns = (
    # Identifiers, target and salary
    ['Date', 'Name', 'Team', 'Pos', 'FPTS', 'Salary']
    # Additional features
    + ['Starter', 'Rest', 'Rota_All', 'Rota_Pos', 'Home']
    + ['PG', 'SG', 'F', 'C', 'Value', 'FPTS_std']
    # Basic stats with weighted mean
    + ['PTS', '3P', 'AST', 'TRB']
    + ['STL', 'BLK', 'TOV', 'DD', 'TD']
    # Additional stats with weighted mean
    + ['MP', 'FT', 'FTA', 'FGA', '3PA', 'DRB', 'ORB']
    # Advanced stats with weighted mean
    + ['USG_perc', 'DRtg', 'ORtg', 'AST_perc', 'DRB_perc', 'ORB_perc']
    + ['BLK_perc', 'TOV_perc', 'STL_perc', 'eFG_perc', 'FG_perc', '3P_perc', 'FT_perc']
)

In [None]:
# Keep only the listed columns, in the listed order.
df_features = df_features[columns]

In [None]:
# The output folder depends on the weighting scheme and may not exist yet;
# to_csv does not create missing folders, so create it first.
features_dir = os.path.join(data_dir, 'Dataframes', 'modelling', 'features', weighting)
os.makedirs(features_dir, exist_ok=True)

# One file per season, written without the index column.
df_features.to_csv(os.path.join(features_dir, 'df_features_{}.csv'.format(season)),
                   index=False)