## Data Preprocessing

This notebook first generates standardized player names from the fantasy salary data by querying Basketball-Reference.com, and then merges the salary and boxscore datasets while adding fantasy stats based on DraftKings rules. Position information is also cleaned up and added from the salary datasets.

In [1]:
import os
import glob
import time
import pickle
import numpy as np
import pandas as pd

from tqdm import tqdm_notebook as tqdm
from datetime import datetime
from urllib.request import urlopen
from bs4 import BeautifulSoup

from utils
from constants import DATA_DIR, SECONDS_SLEEP, DF_VARIABLES

In [2]:
# Raise the pandas display limit so DataFrame previews show up to 40 columns
pd.set_option("display.max_columns", 40)

### Add Fantasy Stats from Boxscores

In [3]:
# Add additional columns for double double (DD) and triple double (TD)
# Double digit in two or three of points, rebounds, assists, steals and blocks
def add_doubles(df):
    """Add binary "DD" and "TD" columns to ``df`` in place.

    A row gets ``DD = 1`` when at least two of PTS, TRB, AST, STL and BLK are
    10 or more, and ``TD = 1`` when at least three are. A triple double is
    therefore also flagged as a double double.

    Parameters
    ----------
    df : pandas.DataFrame
        Boxscore rows containing the columns "PTS", "TRB", "AST", "STL" and
        "BLK". Any index is accepted; rows are handled positionally.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    check_doubles = ["PTS", "TRB", "AST", "STL", "BLK"]

    # Count the double-digit categories per row in one vectorised pass
    # instead of looking rows up by integer label, which only works when the
    # index happens to be 0..n-1.
    doubles_count = (df[check_doubles] >= 10).sum(axis=1)

    # Assign raw arrays so the values are written positionally even when the
    # index contains duplicate labels.
    df["DD"] = (doubles_count >= 2).astype(int).to_numpy()
    df["TD"] = (doubles_count >= 3).astype(int).to_numpy()

### Add Position Information on Salary Data

In [4]:
def generate_name_pos(df):
    """Map each player name to their most frequently listed position.

    Rows whose "Pos" equals 0 are ignored. When several positions tie for the
    most frequent, the first value returned by ``Series.mode`` is used.
    Players without any usable position are left out of the mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "Name" and "Pos" columns.

    Returns
    -------
    dict
        ``{name: position}`` for every name with at least one non-zero "Pos".
    """
    with_position = df.loc[df["Pos"] != 0, ["Name", "Pos"]]
    lookup = {}

    for player, positions in with_position.groupby("Name", sort=False)["Pos"]:
        most_common = positions.mode()
        if not most_common.empty:
            lookup[player] = most_common.iloc[0]

    return lookup

In [5]:
def fill_positions(df):
    """Fill rows whose "Pos" is 0 with the player's usual position, in place.

    The replacement comes from ``generate_name_pos(df)``. Rows for players
    with no entry in that mapping keep ``Pos == 0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "Name" and "Pos" columns. Any index is accepted.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    name_pos = generate_name_pos(df)

    # Look up a candidate position for every row, then only write it where
    # the position is missing (0) and a replacement is actually known.
    replacement = df["Name"].map(name_pos)
    fillable = (df["Pos"] == 0) & replacement.notna()

    # A boolean mask plus a raw array keeps the assignment positional, so it
    # does not depend on the index being 0..n-1 or on labels being unique.
    df.loc[fillable, "Pos"] = replacement[fillable].to_numpy()

In [6]:
def add_binary_positions(df):
    """Add one-hot "PG", "SG", "F" and "C" columns to ``df`` in place.

    Each row is assigned exactly one flag, chosen by the first substring found
    in its "Pos" value in this order: "PG", then "SG", then "C". Anything else
    is flagged as "F". A multi-position value such as "PG/SG" is therefore
    counted as "PG" only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string-valued "Pos" column. Any index is accepted; rows
        are handled positionally.

    Returns
    -------
    None
        ``df`` is modified in place.
    """

    def primary_position(pos):
        # Order matters: it reproduces the PG > SG > C > F precedence.
        if "PG" in pos:
            return "PG"
        if "SG" in pos:
            return "SG"
        if "C" in pos:
            return "C"
        return "F"

    # Iterate over the values rather than integer labels so the function also
    # works on frames whose index is not 0..n-1.
    primary = [primary_position(pos) for pos in df["Pos"]]

    # Column order (PG, SG, F, C) is part of the output and must be kept.
    for column in ("PG", "SG", "F", "C"):
        df[column] = [1 if label == column else 0 for label in primary]

### Name Standardization 

In [7]:
class NameStandardizer:
    """Resolve player names to the spelling used on Basketball-Reference.com."""

    @staticmethod
    def _career_overlaps(career, active_years):
        """Return True when a "YYYY" or "YYYY-YYYY" career span hits ``active_years``."""
        parts = career.split("-")
        if len(parts) == 1:
            return int(parts[0]) in active_years

        start, end = int(parts[0]), int(parts[1])
        return any(start <= year <= end for year in active_years)

    # Use the search function on Basketball-Reference.com to generate standard names
    def parse_name(self, term, active_years):
        """Look up ``term`` on Basketball-Reference.com and return a standard name.

        Parameters
        ----------
        term : str
            Player name as it appears in the source data.
        active_years : iterable of int
            Years used to filter search hits that list a career span in
            parentheses.

        Returns
        -------
        str or float
            * The page's ``<h1>`` text when it is not "Search Results"
              (presumably the search redirected straight to a single page).
            * Otherwise the first candidate from the current-players result
              list that contains ``term``, or the first candidate overall.
            * ``np.nan`` when nothing usable is found.

        Notes
        -----
        Performs a network request on every call (two when the simplified
        retry is triggered).
        """
        # Local import keeps this block self-contained without touching the
        # notebook's import cell.
        from urllib.parse import quote_plus

        search_url = "https://www.basketball-reference.com/search/search.fcgi?hint=&search={term}&pid=&idx="
        # quote_plus turns spaces into "+" and also escapes apostrophes and
        # non-ASCII characters, which urlopen would otherwise reject.
        name_url = search_url.format(term=quote_plus(term))

        # Close the HTTP response deterministically once it has been parsed.
        with urlopen(name_url) as response:
            soup = BeautifulSoup(response, "lxml")

        heading = soup.find("h1")
        if heading is None:
            # Unexpected page layout: treat it as "not found" instead of
            # failing with an AttributeError.
            return np.nan

        # Check if there is ambiguity in the name
        heading_text = heading.get_text()
        if heading_text != "Search Results":
            return heading_text

        players = soup.find("div", id="players", class_="current")
        if players is None:
            if (len(term.split(" ")) > 2) or ("." in term):
                # Parse again without periods and with first two names.
                # The simplified term has neither, so this recurses once.
                new_term = " ".join(term.replace(".", "").split(" ")[:2])
                return self.parse_name(new_term, active_years)
            return np.nan

        candidates = []
        for item in players.find_all("div", class_="search-item-name"):
            name = item.find("a").get_text()

            if "(" not in name:
                candidates.append(name)
                continue

            # Entries look like "Name (YYYY)" or "Name (YYYY-YYYY)"; keep only
            # those whose span overlaps the requested years.
            career = name[name.find("(") + 1 : name.find(")")]
            if self._career_overlaps(career, active_years):
                candidates.append(name[: name.find(" (")])

        if not candidates:
            return np.nan

        # Prefer a candidate that contains the searched term verbatim.
        for candidate in candidates:
            if term in candidate:
                return candidate
        return candidates[0]

    def generate_standard_names(self, df, active_years):
        """Query a standard name for every distinct, non-null name in ``df["Name"]``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with a "Name" column.
        active_years : iterable of int
            Forwarded to :meth:`parse_name`.

        Returns
        -------
        tuple of (list, list, list)
            ``(standard_names, errors, confusions)`` where ``errors`` holds the
            original names whose lookup returned NaN and ``confusions`` holds
            lookup results containing "G-League Stats" or
            "International Stats".
        """
        # Missing names cannot be searched, so they are skipped up front.
        names = list(df["Name"].dropna().unique())
        standard_names = []
        errors, confusions = [], []

        for i, name in enumerate(tqdm(names)):
            standard_name = self.parse_name(name, active_years)

            if name != standard_name:
                print("{} From {} To {}".format(i, name, standard_name))

                # NaN never compares equal to itself, so "== np.nan" cannot
                # detect a failed lookup; pd.isna can.
                if pd.isna(standard_name):
                    errors.append(name)

                elif ("G-League Stats" in standard_name) or (
                    "International Stats" in standard_name
                ):
                    confusions.append(standard_name)

                else:
                    standard_names.append(standard_name)
            else:
                standard_names.append(standard_name)

            # Throttle requests to the site.
            time.sleep(SECONDS_SLEEP)

        return (standard_names, errors, confusions)

    def standardize_names(self, df, standard_names, active_years):
        """Drop incomplete rows and rewrite non-standard names in ``df`` in place.

        Rows containing any NaN are removed and the index is reset. Every
        remaining name that is not in ``standard_names`` is looked up with
        :meth:`parse_name` and replaced by the result.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with a "Name" column. It is modified in place, so callers
            that ignore the return value still see the changes.
        standard_names : iterable of str
            Names that are already in standard form.
        active_years : iterable of int
            Forwarded to :meth:`parse_name`.

        Returns
        -------
        pandas.DataFrame
            The same ``df`` object, returned for convenience.
        """
        # Work on the caller's frame itself. Rebinding ``df`` to the result of
        # ``dropna()`` would only change a local copy and leave the caller's
        # data untouched.
        df.dropna(inplace=True)
        df.reset_index(drop=True, inplace=True)

        known = set(standard_names)
        diff = [name for name in df["Name"].unique() if name not in known]
        print("{} names are standardized ...".format(len(diff)))

        names_conversion = {}

        for name in tqdm(diff):
            names_conversion[name] = self.parse_name(name, active_years)
            time.sleep(SECONDS_SLEEP)

        # Names without a conversion entry are kept as they are.
        df["Name"] = df["Name"].map(lambda name: names_conversion.get(name, name))

        return df

### Generate/Load Standardized Names

In [None]:
# Check if standard names are already generated
# NOTE: the files carry a ".npy" suffix but are plain pickles, not NumPy arrays.
names_dir = os.path.join(DATA_DIR, "Names")
standard_names_path = os.path.join(names_dir, "standard_names.npy")
errors_path = os.path.join(names_dir, "errors.npy")
confusions_path = os.path.join(names_dir, "confusions.npy")

# Both files are read below, so both must exist for the cache to be usable.
if os.path.exists(standard_names_path) and os.path.exists(confusions_path):
    # pickle.load can execute arbitrary code; only load files written by this
    # notebook, never files obtained from an untrusted source.
    with open(standard_names_path, "rb") as fp:
        standard_names = pickle.load(fp)

    with open(confusions_path, "rb") as fp:
        confusions = pickle.load(fp)

else:
    # Make sure the output directory exists before the long scrape, so the
    # results cannot be lost to a missing folder when they are written.
    os.makedirs(names_dir, exist_ok=True)

    # Generate standard names for all players from names shown in salary information from RotoGuru
    df_salary = utils.csv_concatenate(os.path.join(DATA_DIR, "DKSalary"), nested=True)

    # Specify current years to avoid duplication across eras (from 2014 to 2019)
    active_years = [2014 + i for i in range(6)]

    # Takes about 30 mins
    standardizer = NameStandardizer()
    standard_names, errors, confusions = standardizer.generate_standard_names(
        df_salary, active_years
    )

    # Create a file containing standardized names
    with open(standard_names_path, "wb") as fp:
        pickle.dump(standard_names, fp)

    with open(errors_path, "wb") as fp:
        pickle.dump(errors, fp)

    with open(confusions_path, "wb") as fp:
        pickle.dump(confusions, fp)

In [None]:
# Handle edge cases manually as some name searches return only G-League stats but not NBA records
# Cannot be differentiated from players who only played in the G-League at the moment
print(confusions)

# Only add names that are not already present, so re-running this cell does
# not keep growing the list with duplicates.
standard_names = standard_names + [
    name
    for name in ("Derrick Walton", "CJ McCollum", "Sheldon Mac")
    if name not in standard_names
]

### Standardize Names and Merge Datasets

In [None]:
seasons = ["2014-15", "2015-16", "2016-17", "2017-18", "2018-19"]
active_years = [2014 + i for i in range(6)]

standardizer = NameStandardizer()

# Create the output folder up front so to_csv cannot fail on a missing path.
merged_dir = os.path.join(DATA_DIR, "Dataframes", "Merged")
os.makedirs(merged_dir, exist_ok=True)

for season in seasons:
    print("Processing the {} season ...".format(season))

    # Standardize names for salary information
    df_salary = utils.csv_concatenate(os.path.join(DATA_DIR, "DKSalary", season))
    standardizer.standardize_names(df_salary, standard_names, active_years)
    # The helper defined in this notebook is fill_positions; "fill_pos" is not
    # defined anywhere and raises NameError on a fresh kernel.
    fill_positions(df_salary)

    # Standardize names for boxscores
    df_games = utils.csv_concatenate(os.path.join(DATA_DIR, "Boxscores", season))
    df_games["FPTS"] = utils.calculate_FPTS(df_games)
    add_doubles(df_games)
    df_games = df_games.loc[:, DF_VARIABLES]
    standardizer.standardize_names(df_games, standard_names, active_years)

    # Merge two datasets and save to a csv file
    # "Team" is dropped from the salary side so only the boxscore copy remains.
    df = pd.merge(
        df_salary.drop("Team", axis=1), df_games, on=["Name", "Date"], how="inner"
    )
    # Rows whose position is still 0 (unknown) are discarded.
    df = df[df["Pos"] != 0].sort_values(by=["Date", "Team"]).reset_index(drop=True)

    # Add "Value" variable defined as a ratio between FPTS and Salary
    df["Value"] = df["FPTS"] / (df["Salary"] / 1000)
    # A zero salary yields +/-inf; those are mapped to 0.
    df["Value"] = df["Value"].replace(np.inf, 0).replace(-np.inf, 0)

    # Add binary positions for later use in EDA and modelling
    add_binary_positions(df)

    columns = DF_VARIABLES.copy()

    # Insert the extra columns at fixed positions within the DF_VARIABLES order.
    # Inserts are applied in ascending order, so each index already accounts
    # for the columns inserted before it.
    for i, new_column in zip(
        [1, 3, 4, 7, 44, 45, 46, 47],
        ["Pos", "Salary", "Starter", "Value", "PG", "SG", "F", "C"],
    ):
        columns.insert(i, new_column)

    df = df.loc[:, columns]
    df.to_csv(
        os.path.join(merged_dir, "df_{}.csv".format(season)),
        index=False,
    )