## Data Preprocessing

- Concatenate files into one dataframe for each
- Calculate FPTS
- Resolve name inconsistency
- Add injury information

In [29]:
import os
import glob
import time
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from datetime        import datetime
from urllib.request  import urlopen
from bs4             import BeautifulSoup

pd.set_option("display.max_columns",40)

In [51]:
cwd = os.getcwd()
data_dir = os.path.join(cwd, 'data')
season = '2015-16'

### Game Data from Basketball Reference

In [52]:
#Concatenate all csv files under a directory
def csv_concatenate(folder_path):
    files = glob.glob(folder_path + "/*.csv")
    df_list = []
    for file in tqdm(files):
        df_list.append(pd.read_csv(file, parse_dates=True, infer_datetime_format=True))
    #Fill nan with 0s as some values are empty for percentage points
    df = pd.concat(df_list).fillna(0).reset_index(drop=True)
    return df

In [53]:
def calculate_FPTS(df):
    #Scoring rules based on https://www.draftkings.co.uk/help/rules/4
    multipliers = {'PTS':1, '3P': 0.5, 'TRB':1.25, 'AST':1.5, 'STL':2, 'BLK':2, 'TOV':-0.5}

    fpts_list = []
    
    for i in tqdm(range(df.shape[0])):
        fpts = 0
        doubles_count = 0
        for stat, multiplier in multipliers.items():
            if stat in ['PTS', 'TRB', 'AST', 'STL', 'BLK']:
                if df.loc[i, stat] >= 10:
                    doubles_count += 1
            fpts += df.loc[i, stat]*multiplier
        
        if doubles_count >= 2:
            fpts += 1.5
            
        if doubles_count >= 3:
            fpts += 3
            
        fpts_list.append(fpts) 
        
    return fpts_list

In [54]:
def add_doubles(df):
    
    dd = [0 for i in range(df.shape[0])]
    td = [0 for i in range(df.shape[0])]
    
    for i in tqdm(range(df.shape[0])):
        doubles_count = 0
        check_doubles = ['PTS','TRB', 'AST', 'STL', 'BLK']
        
        for stat in check_doubles:
            if df.loc[i, stat] >= 10:
                doubles_count += 1
        
        if doubles_count >= 2:
            dd[i] = 1
        if doubles_count >= 3:
            td[i] = 1
   
    df['DD'] = dd
    df['TD'] = td

In [55]:
df_games = csv_concatenate(os.path.join(data_dir, 'Games', season))
df_games['FPTS'] = calculate_FPTS(df_games)
add_doubles(df_games)
columns = ['Name', 'Date', 'Team',  'FPTS', 'Home','W', 'W_PTS', 'L', 'L_PTS', 'MP',
           'FG', 'FGA', 'FG_perc', '3P', '3PA', '3P_perc', 'FT', 'FTA', 'FT_perc',
           'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'DD', 'TD', 
           'USG_perc','DRtg','ORtg','AST_perc','DRB_perc','ORB_perc','BLK_perc','TOV_perc','STL_perc','eFG_perc']
df_games = df_games.loc[:, columns]










### Name Standardization 

In [56]:
def parse_name(term):
    search_url = 'https://www.basketball-reference.com/search/search.fcgi?hint=&search={term}&pid=&idx='
    name_url = search_url.format(term=term.replace(' ','+'))
    soup = BeautifulSoup(urlopen(name_url),'html5lib')
    
    #Check if there is ambiguity in the name
    if soup.find('h1').get_text() != 'Search Results':
        return soup.find('h1').get_text()
    
    
    elif (soup.find('div', id='players', class_='current') == None):
        if (len(term.split(' ')) > 2) or ('.' in term):
            #Parse again without periods and with first two names
            new_term = ' '.join(term.replace('.','').split(' ')[:2])
            return parse_name(new_term)
        else:
            return np.nan
                    
    else:        
        items = soup.find('div', id='players', class_='current').find_all('div', class_='search-item-name')
        current_years = (int(season[:4]), int(season[:2]+season[-2:]))
        candidates = []
        
        for item in items:
            name = item.find('a').get_text()

            if '(' not in name:
                candidates.append(name)

            else:
                career = name[name.find('(')+1:name.find(')')].split('-')
                if len(career) == 1:
                    if int(career[0]) in current_years:
                        candidates.append(name[:name.find(' (')])
                else:
                    start = int(career[0])
                    end = int(career[1])

                    for year in current_years: 
                        if year in range(start, end+1):
                            candidates.append(name[:name.find(' (')])
                            break
                            
        if len(candidates) != 0:
            for candidate in candidates:
                if term in candidate:
                    return candidate
            return candidates[0]
        
        else:
            return np.nan
      

In [57]:
def generate_standard_names(df):
    names = list(set(df['Name']))
    standard_names = []
    
    for i, name in enumerate(names):
        standard_name = parse_name(name)
        print(i, standard_name)
        standard_names.append(standard_name)
        time.sleep(1)
    
    return standard_names

In [58]:
def standardize_names(df, standard_names):
    names = list(set(df['Name']))
    
    df = df.dropna().reset_index(drop=True)
    
    diff = [name for name in names if name not in standard_names]
    print('{} items are standardized ...'.format(len(diff)))
    
    names_conversion = {}
    
    for name in tqdm(names):
        if name in diff:
            names_conversion[name] = parse_name(name)
            time.sleep(1)
    
    for i in range(df.shape[0]):
        name = df.loc[i,'Name']
        if name in names_conversion.keys():
            df.loc[i,'Name'] = names_conversion[name]
            

In [59]:
def generate_name_pos(df):
    
    name_pos = {}
    
    for name in set(df['Name']):
        pos = df.loc[(df['Name']==name) & (df['Pos']!=0), 'Pos'].mode()
        if len(pos) != 0:
            name_pos[name] = pos[0]
    
    return name_pos

In [60]:
def fill_pos(df):
    
    name_pos = generate_name_pos(df)
    
    for i in tqdm(range(df.shape[0])):
        if df.loc[i, 'Pos'] == 0:
            name = df.loc[i, 'Name']
            if name in name_pos.keys():
                #print(df.loc[i,'Date'], name)
                df.loc[i, 'Pos'] = name_pos[name]

In [61]:
df_salary = csv_concatenate(os.path.join(data_dir, 'DKSalary', season))




In [62]:
#Takes about 30 mins

#standard_names = generate_standard_names(df_salary)
#with open(os.path.join(data_dir, 'standard_names','{}.npy'.format(season)), "wb") as fp:
#    pickle.dump(standard_names, fp) 

0 Ramon Sessions
1 Emmanuel Mudiay
2 Anthony Davis
3 Ricky Rubio
4 Dewayne Dedmon
5 Devin Booker
6 nan
7 Bryce Cotton
8 Nick Collison
9 Jonas Jerebko
10 Isaiah Canaan
11 Thaddeus Young
12 Dirk Nowitzki
13 Shabazz Muhammad
14 Raul Neto
15 Brandon Knight
16 Kendall Marshall
17 Kirk Hinrich
18 Gary Neal
19 Lavoy Allen
20 Kevin Martin
21 C.J. Watson
22 Willie Reed
23 Isaiah Thomas
24 Pablo Prigioni
25 J.J. Hickson
26 Mario Chalmers
27 Sergey Karasev
28 Shabazz Napier
29 Kyrie Irving
30 R.J. Hunter
31 Willie Cauley-Stein
32 Maurice Harkless
33 Ryan Anderson
34 Mike Conley
35 J.J. Redick
36 Gerald Green
37 Steven Adams
38 Chris Paul
39 Chris Andersen
40 Cory Jefferson
41 Langston Galloway
42 Glenn Robinson III
43 Brandon Jennings
44 Elfrid Payton
45 Joe Ingles
46 Frank Kaminsky
47 J.R. Smith
48 Nikola Jokic
49 Al-Farouq Aminu
50 Gorgui Dieng
51 Nikola Mirotic
52 Anthony Morrow
53 Rajon Rondo
54 Archie Goodwin
55 Andrea Bargnani
56 Michael Beasley
57 Jimmy Butler
58 Gerald Henderson
59 Gianni

464 Paul Pierce
465 Leandro Barbosa
466 Kristaps Porzingis
467 Avery Bradley
468 Nikola Vucevic
469 Will Barton
470 Cody Zeller
471 Kosta Koufos
472 Nene
473 Justise Winslow
474 Eric Moreland
475 Kobe Bryant
476 Corey Brewer
477 Mike Dunleavy
478 Luis Scola
479 Kevin Seraphin
480 Rondae Hollis-Jefferson
481 LaMarcus Aldridge


In [63]:
with open(os.path.join(data_dir, 'standard_names', '{}.npy'.format(season)), "rb") as fp:
    standard_names = pickle.load(fp)

FileNotFoundError: [Errno 2] No such file or directory: '/home/kengo/Desktop/NBA/data/standard_names/2015-16.npy'

In [42]:
standardize_names(df_salary, standard_names)

18 items are standardized ...





In [43]:
standardize_names(df_games, standard_names)

7 items are standardized ...





In [18]:
fill_pos(df_salary)




In [45]:
df = pd.merge(df_salary.drop('Team', axis=1), df_games, on=['Name', 'Date'], how='inner')
df = df[df['Pos']!=0].sort_values(by=['Date','Team']).reset_index(drop=True)

In [47]:
df.columns

Index(['Date', 'Pos', 'Name', 'Starter', 'Salary', 'Team', 'FPTS', 'Home', 'W',
       'W_PTS', 'L', 'L_PTS', 'MP', 'FG', 'FGA', 'FG_perc', '3P', '3PA',
       '3P_perc', 'FT', 'FTA', 'FT_perc', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'DD', 'TD', 'USG_perc', 'DRtg', 'ORtg',
       'AST_perc', 'DRB_perc', 'ORB_perc', 'BLK_perc', 'TOV_perc', 'STL_perc',
       'eFG_perc'],
      dtype='object')

In [49]:
columns = ['Name', 'Date', 'Team', 'FPTS', 'Home', 'W', 'W_PTS', 'L', 'L_PTS', 'MP',
           'FG', 'FGA', 'FG_perc', '3P', '3PA', '3P_perc', 'FT', 'FTA', 'FT_perc',
           'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'DD', 'TD', 
           'USG_perc','DRtg','ORtg','AST_perc','DRB_perc','ORB_perc','BLK_perc','TOV_perc','STL_perc','eFG_perc']

columns = columns[:3] + ['Salary', 'Starter'] + columns[3:]

df_games = df_games.loc[:, columns]

df.to_csv(os.path.join(data_dir, 'Dataframes', 'clean','df_{}.csv'.format(season)), index=False)

In [50]:
df.columns

Index(['Date', 'Pos', 'Name', 'Starter', 'Salary', 'Team', 'FPTS', 'Home', 'W',
       'W_PTS', 'L', 'L_PTS', 'MP', 'FG', 'FGA', 'FG_perc', '3P', '3PA',
       '3P_perc', 'FT', 'FTA', 'FT_perc', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'DD', 'TD', 'USG_perc', 'DRtg', 'ORtg',
       'AST_perc', 'DRB_perc', 'ORB_perc', 'BLK_perc', 'TOV_perc', 'STL_perc',
       'eFG_perc'],
      dtype='object')