# NBA AI - Data Loading and Cleaning

### Imports and Global Settings

In [1]:
import numpy as np
import pandas as pd
import re
import json
from datetime import timedelta

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)

In [2]:
def load_and_clean_nba_data(file_path):
    """
    Loads and cleans NBA game data from the 'NBA_Box_Score_Team-Stats.xlsx' file provided by BIGDATABALL.

    This function is tailored to handle the specific format of the BIGDATABALL NBA dataset. It performs the following steps:
    1. Loads the first sheet of the 'NBA_Box_Score_Team-Stats.xlsx' file into a Pandas DataFrame.
    2. Converts column names to snake case for consistency and easier access.
    3. Converts the 'date' column to a Pandas datetime format for proper time series analysis.
    4. Concatenates player names in the starting lineup columns into a single 'starting_lineup' column.
    5. Drops the original starting lineup columns and other specified columns like 'box_score_url' and 'full_game_odds_url'.

    Parameters:
    - file_path (str): The file path of the 'NBA_Box_Score_Team-Stats.xlsx' file to be loaded.

    Returns:
    - pandas.DataFrame: A cleaned DataFrame containing the NBA game data from BIGDATABALL.
    """

    # Helper function to convert column names to snake case
    def to_snake_case(name):
        name = re.sub(r"\s+", "_", name)
        name = re.sub(r"\W+", "_", name)
        name = re.sub(r"_+", "_", name)
        name = re.sub(r"^_|_$", "", name)
        return name.lower()

    # Load the first sheet into a DataFrame
    df = pd.read_excel(file_path, sheet_name=0)

    # Rename columns to snake case
    df.columns = [to_snake_case(col) for col in df.columns]

    # Convert 'date' column to datetime
    df["date"] = pd.to_datetime(df["date"])

    # Extract the season by finding the last occurrence of four digits
    df["season"] = df["bigdataball_dataset"].str.extract(r"(\d{4})(?!.*\d)")

    # Extract the season type by taking everything after the last four digits
    df["season_type"] = df["bigdataball_dataset"].str.extract(r"(\d{4})\s*(.*)")[1]

    # Filter columns that start with 'unnamed'
    unnamed_columns = [col for col in df.columns if col.startswith("unnamed")]

    # Ensure that the 'starting_lineups' column is included
    columns_to_concatenate = ["starting_lineups"] + unnamed_columns

    # Concatenate the columns to create the full starting lineup
    df["starting_lineup"] = df[columns_to_concatenate].apply(
        lambda row: ",".join(row.dropna().astype(str)), axis=1
    )

    # Drop the original lineup columns
    df.drop(columns=columns_to_concatenate, inplace=True)

    # Remove the 'box_score_url' and 'full_game_odds_url' columns
    df = df.drop(["box_score_url", "full_game_odds_url"], axis=1)

    return df

In [3]:
df_1 = load_and_clean_nba_data("../data/nba_ai/2022-2023_NBA_Box_Score_Team-Stats.xlsx")

In [4]:
df_1.head()

Unnamed: 0,bigdataball_dataset,game_id,date,team,venue,1q,2q,3q,4q,ot1,ot2,ot3,ot4,ot5,f,min,fg,fga,3p,3pa,ft,fta,or,dr,tot,a,pf,st,to,to_to,bl,pts,poss,pace,oeff,deff,team_rest_days,crew_chief,referee_umpire,opening_odds,opening_spread,opening_total,line_movement_1,line_movement_2,line_movement_3,closing_odds,closing_spread,closing_total,moneyline,halftime,season,season_type,starting_lineup
0,NBA 2022-2023 Regular Season,22200001,2022-10-18,Philadelphia,R,29,34,25,29,,,,,,117,240.0,40,80,13,34,24,28,4,27,31,16,25,8,14,14,3,117,98.680535,98.680535,118.564416,127.684756,3+,James Capers,Ray Acosta,213.5o -10,4.0,213.5,216o,216u,o216,216.5o -09,3.0,216.5,127,106.5o -15,2023,-2023 Regular Season,"Tobias Harris,P.J. Tucker,Joel Embiid,Tyrese M..."
1,NBA 2022-2023 Regular Season,22200001,2022-10-18,Boston,H,24,39,35,28,,,,,,126,240.0,46,82,12,35,22,28,6,30,36,24,24,8,10,11,3,126,98.680535,98.680535,127.684756,118.564416,3+,,Brian Forte,-4 -10,-4.0,213.5,-3,-3,-3,-3 -08,-3.0,216.5,-150,+2 -14,2023,-2023 Regular Season,"Jaylen Brown,Jayson Tatum,Al Horford,Derrick W..."
2,NBA 2022-2023 Regular Season,22200002,2022-10-18,LA Lakers,R,22,30,19,38,,,,,,109,240.0,40,94,10,40,19,25,9,39,48,23,18,12,21,22,4,109,114.091809,114.091809,95.537095,107.807915,3+,Tony Brothers,Scott Twardoski,227.5o -09,6.0,227.5,224u,224o,224u,223.5o -10,7.5,223.5,247,113.5o -17,2023,-2023 Regular Season,"Lonnie Walker IV,LeBron James,Anthony Davis,Ru..."
3,NBA 2022-2023 Regular Season,22200002,2022-10-18,Golden State,H,25,34,32,32,,,,,,123,240.0,45,99,16,45,17,23,11,37,48,31,23,11,18,18,4,123,114.091809,114.091809,107.807915,95.537095,3+,,Rodney Mott,-6 -10,-6.0,227.5,-7.5,-7.5,-7.5,-7.5 -09,-7.5,223.5,-306,-2.5 -14,2023,-2023 Regular Season,"Andrew Wiggins,Draymond Green,Kevon Looney,Kla..."
4,NBA 2022-2023 Regular Season,22200003,2022-10-19,Orlando,R,28,27,28,26,,,,,,109,240.0,42,86,11,30,14,19,10,38,48,21,24,5,18,18,5,109,101.130503,101.130503,107.781527,111.736812,3+,Sean Corbin,Mousa Dagher,218o -12,4.0,218.0,217.5u,215.5o,o215,215o -12,3.5,215.0,135,106u 15,2023,-2023 Regular Season,"Paolo Banchero,Franz Wagner,Wendell Carter Jr...."


In [5]:
def merge_and_transform_nba_data(df):
    """
    Transforms the cleaned NBA DataFrame to create a single record for each game with updated handling of referee and odds information.

    This function:
    1. Splits the DataFrame into separate DataFrames for home and road teams.
    2. Renames and prefixes columns to indicate home or road team.
    3. Merges these DataFrames to create a single row per game.
    4. Removes the 'home_venue' and 'road_venue' columns as they are redundant.
    5. Combines the 'main_ref/crew_chief' and 'crew/referee_umpire' columns from both home and road records.
    6. Removes unused betting data columns.

    Parameters:
    df (pandas.DataFrame): The cleaned DataFrame containing NBA data.

    Returns:
    pandas.DataFrame: A transformed DataFrame with one row per game, combined referee information, and betting data.
    """

    # Splitting the DataFrame into home and road teams
    home_df = df[df["venue"] == "H"].copy()
    road_df = df[df["venue"] == "R"].copy()

    # Renaming columns for clarity
    home_df.rename(columns={"team": "home_team"}, inplace=True)
    road_df.rename(columns={"team": "road_team"}, inplace=True)

    # Adding a prefix to all relevant columns
    for col in home_df.columns:
        if col not in [
            "bigdataball_dataset",
            "game_id",
            "date",
            "season",
            "season_type",
            "home_team",
        ]:
            home_df.rename(columns={col: "home_" + col}, inplace=True)

    for col in road_df.columns:
        if col not in [
            "bigdataball_dataset",
            "game_id",
            "date",
            "season",
            "season_type",
            "road_team",
        ]:
            road_df.rename(columns={col: "road_" + col}, inplace=True)

    # Merging the DataFrames on common columns
    merged_df = pd.merge(
        home_df,
        road_df,
        left_on=["bigdataball_dataset", "game_id", "date", "season", "season_type"],
        right_on=["bigdataball_dataset", "game_id", "date", "season", "season_type"],
    )

    # Removing the 'home_venue' and 'road_venue' columns
    merged_df.drop(["home_venue", "road_venue"], axis=1, inplace=True)

    # Check which set of columns is present and set variables accordingly
    if "home_main_ref" in merged_df.columns and "road_main_ref" in merged_df.columns:
        main_ref_cols = ["home_main_ref", "road_main_ref"]
        crew_cols = ["home_crew", "road_crew"]
        main_ref_output = "main_ref"
        crew_output = "crew"
    elif (
        "home_crew_chief" in merged_df.columns
        and "road_crew_chief" in merged_df.columns
    ):
        main_ref_cols = ["home_crew_chief", "road_crew_chief"]
        crew_cols = ["home_referee_umpire", "road_referee_umpire"]
        main_ref_output = "crew_chief"
        crew_output = "referee_umpire"
    else:
        raise ValueError("Expected columns not found in DataFrame")

    # Process main_ref/crew_chief columns
    merged_df[main_ref_output] = merged_df.apply(
        lambda x: x[main_ref_cols[0]]
        if pd.notna(x[main_ref_cols[0]])
        else x[main_ref_cols[1]],
        axis=1,
    )
    merged_df.drop(main_ref_cols, axis=1, inplace=True)

    # Combine crew/referee_umpire columns
    def combine_crew(crew1, crew2):
        all_crew = set(crew1.split(",")) | set(crew2.split(","))
        return ",".join(sorted(all_crew - {""}))

    merged_df[crew_output] = merged_df.apply(
        lambda x: combine_crew(x[crew_cols[0]], x[crew_cols[1]]),
        axis=1,
    )
    merged_df.drop(crew_cols, axis=1, inplace=True)

    # Remove unused betting data columns
    merged_df = merged_df.rename(
        columns={
            "home_opening_total": "opening_total",
            "home_closing_total": "closing_total",
        }
    )

    unused_odds_columns = [
        "home_line_movement_1",
        "home_line_movement_2",
        "home_line_movement_3",
        "road_line_movement_1",
        "road_line_movement_2",
        "road_line_movement_3",
        "home_halftime",
        "road_halftime",
        "home_opening_odds",
        "road_opening_odds",
        "home_closing_odds",
        "road_closing_odds",
        "road_opening_total",
        "road_closing_total",
        "road_opening_spread",
        "road_closing_spread",
    ]
    merged_df.drop(unused_odds_columns, axis=1, inplace=True)

    return merged_df

In [6]:
df_2 = merge_and_transform_nba_data(df_1)

In [7]:
df_2.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,road_ot2,road_ot3,road_ot4,road_ot5,road_f,road_min,road_fg,road_fga,road_3p,road_3pa,road_ft,road_fta,road_or,road_dr,road_tot,road_a,road_pf,road_st,road_to,road_to_to,road_bl,road_pts,road_poss,road_pace,road_oeff,road_deff,road_team_rest_days,road_moneyline,road_starting_lineup,crew_chief,referee_umpire
0,NBA 2022-2023 Regular Season,22200001,2022-10-18,Boston,24,39,35,28,,,,,,126,240.0,46,82,12,35,22,28,6,30,36,24,24,8,10,11,3,126,98.680535,98.680535,127.684756,118.564416,3+,-4.0,213.5,-3.0,216.5,-150,2023,-2023 Regular Season,"Jaylen Brown,Jayson Tatum,Al Horford,Derrick W...",Philadelphia,29,34,25,29,,,,,,117,240.0,40,80,13,34,24,28,4,27,31,16,25,8,14,14,3,117,98.680535,98.680535,118.564416,127.684756,3+,127,"Tobias Harris,P.J. Tucker,Joel Embiid,Tyrese M...",James Capers,"Brian Forte,Ray Acosta"
1,NBA 2022-2023 Regular Season,22200002,2022-10-18,Golden State,25,34,32,32,,,,,,123,240.0,45,99,16,45,17,23,11,37,48,31,23,11,18,18,4,123,114.091809,114.091809,107.807915,95.537095,3+,-6.0,227.5,-7.5,223.5,-306,2023,-2023 Regular Season,"Andrew Wiggins,Draymond Green,Kevon Looney,Kla...",LA Lakers,22,30,19,38,,,,,,109,240.0,40,94,10,40,19,25,9,39,48,23,18,12,21,22,4,109,114.091809,114.091809,95.537095,107.807915,3+,247,"Lonnie Walker IV,LeBron James,Anthony Davis,Ru...",Tony Brothers,"Rodney Mott,Scott Twardoski"
2,NBA 2022-2023 Regular Season,22200003,2022-10-19,Detroit,17,40,34,22,,,,,,113,240.0,40,94,14,38,19,24,12,29,41,31,21,11,12,13,4,113,101.130503,101.130503,111.736812,107.781527,3+,-4.0,218.0,-3.5,215.0,-162,2023,-2023 Regular Season,"Saddiq Bey,Bojan Bogdanovic,Isaiah Stewart,Cad...",Orlando,28,27,28,26,,,,,,109,240.0,42,86,11,30,14,19,10,38,48,21,24,5,18,18,5,109,101.130503,101.130503,107.781527,111.736812,3+,135,"Paolo Banchero,Franz Wagner,Wendell Carter Jr....",Sean Corbin,"David Guthrie,Mousa Dagher"
3,NBA 2022-2023 Regular Season,22200004,2022-10-19,Indiana,25,27,25,30,,,,,,107,240.0,39,97,15,42,14,21,12,30,42,21,20,7,15,15,5,107,103.68746,103.68746,103.194736,109.945793,3+,2.5,227.0,2.5,228.5,114,2023,-2023 Regular Season,"Buddy Hield,Terry Taylor,Jalen Smith,Chris Dua...",Washington,36,24,27,27,,,,,,114,240.0,42,92,11,31,19,24,14,39,53,21,19,5,16,17,10,114,103.68746,103.68746,109.945793,103.194736,3+,-137,"Deni Avdija,Kyle Kuzma,Kristaps Porzingis,Brad...",Scott Foster,"Ashley Moyer-Gleich,Brent Barnaky"
4,NBA 2022-2023 Regular Season,22200005,2022-10-19,Atlanta,26,33,25,33,,,,,,117,240.0,45,90,7,25,20,24,4,34,38,30,18,12,9,9,5,117,102.889037,102.889037,113.71474,103.995531,3+,-9.5,233.5,-10.5,234.5,-505,2023,-2023 Regular Season,"De'Andre Hunter,John Collins,Clint Capela,Dejo...",Houston,20,30,30,27,,,,,,107,240.0,42,98,9,35,14,15,15,39,54,25,20,4,15,16,3,107,102.889037,102.889037,103.995531,113.71474,3+,378,"Eric Gordon,Jabari Smith Jr.,Bruno Fernando,Ja...",Ed Malloy,"Ben Taylor,Jenna Reneau"


In [8]:
def add_sequence_data(df):
    # Calculate the 'Day of Season'
    df["day_of_season"] = (df["date"] - df["date"].min()).dt.days + 1

    df = df.sort_values(by="date")

    # Function to calculate the game number
    def calculate_game_number(row, team_column, df):
        return len(
            df[
                (
                    (df["home_team"] == row[team_column])
                    | (df["road_team"] == row[team_column])
                )
                & (df["date"] <= row["date"])
            ]
        )

    # Calculate game numbers
    df["home_team_game_num"] = df.apply(
        calculate_game_number, team_column="home_team", df=df, axis=1
    )
    df["road_team_game_num"] = df.apply(
        calculate_game_number, team_column="road_team", df=df, axis=1
    )

    return df

In [9]:
df_3 = add_sequence_data(df_2)

In [10]:
df_3.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,road_ot2,road_ot3,road_ot4,road_ot5,road_f,road_min,road_fg,road_fga,road_3p,road_3pa,road_ft,road_fta,road_or,road_dr,road_tot,road_a,road_pf,road_st,road_to,road_to_to,road_bl,road_pts,road_poss,road_pace,road_oeff,road_deff,road_team_rest_days,road_moneyline,road_starting_lineup,crew_chief,referee_umpire,day_of_season,home_team_game_num,road_team_game_num
0,NBA 2022-2023 Regular Season,22200001,2022-10-18,Boston,24,39,35,28,,,,,,126,240.0,46,82,12,35,22,28,6,30,36,24,24,8,10,11,3,126,98.680535,98.680535,127.684756,118.564416,3+,-4.0,213.5,-3.0,216.5,-150,2023,-2023 Regular Season,"Jaylen Brown,Jayson Tatum,Al Horford,Derrick W...",Philadelphia,29,34,25,29,,,,,,117,240.0,40,80,13,34,24,28,4,27,31,16,25,8,14,14,3,117,98.680535,98.680535,118.564416,127.684756,3+,127,"Tobias Harris,P.J. Tucker,Joel Embiid,Tyrese M...",James Capers,"Brian Forte,Ray Acosta",1,1,1
1,NBA 2022-2023 Regular Season,22200002,2022-10-18,Golden State,25,34,32,32,,,,,,123,240.0,45,99,16,45,17,23,11,37,48,31,23,11,18,18,4,123,114.091809,114.091809,107.807915,95.537095,3+,-6.0,227.5,-7.5,223.5,-306,2023,-2023 Regular Season,"Andrew Wiggins,Draymond Green,Kevon Looney,Kla...",LA Lakers,22,30,19,38,,,,,,109,240.0,40,94,10,40,19,25,9,39,48,23,18,12,21,22,4,109,114.091809,114.091809,95.537095,107.807915,3+,247,"Lonnie Walker IV,LeBron James,Anthony Davis,Ru...",Tony Brothers,"Rodney Mott,Scott Twardoski",1,1,1
13,NBA 2022-2023 Regular Season,22200014,2022-10-19,Sacramento,23,32,29,24,,,,,,108,240.0,39,85,17,44,13,19,4,37,41,27,25,8,15,16,5,108,101.731855,101.731855,106.161438,113.042271,3+,-1.5,223.5,-3.0,229.5,-149,2023,-2023 Regular Season,"Harrison Barnes,KZ Okpala,Domantas Sabonis,Kev...",Portland,32,19,33,31,,,,,,115,240.0,39,88,11,28,26,33,11,33,44,20,17,11,11,11,2,115,101.731855,101.731855,113.042271,106.161438,3+,124,"Josh Hart,Jerami Grant,Jusuf Nurkic,Anfernee S...",Courtney Kirkland,"Brandon Adair,Justin Van Duyne",2,1,1
12,NBA 2022-2023 Regular Season,22200013,2022-10-19,Phoenix,24,21,31,31,,,,,,107,240.0,40,85,8,22,19,22,8,32,40,25,29,4,12,12,5,107,95.829038,95.829038,111.657178,109.570128,3+,-5.5,216.0,-4.0,218.0,-180,2023,-2023 Regular Season,"Mikal Bridges,Cameron Johnson,Deandre Ayton,De...",Dallas,32,30,19,24,,,,,,105,240.0,35,75,14,35,21,34,5,35,40,17,22,6,12,12,4,105,95.829038,95.829038,109.570128,111.657178,3+,149,"Reggie Bullock,Dorian Finney-Smith,JaVale McGe...",Derek Richardson,"Eric Lewis,Gediminas Petraitis",2,1,1
11,NBA 2022-2023 Regular Season,22200012,2022-10-19,Utah,37,38,19,29,,,,,,123,240.0,42,83,16,38,23,31,11,32,43,27,21,10,19,20,1,123,101.120258,101.120258,121.637348,100.869996,3+,7.5,218.0,7.0,225.5,213,2023,-2023 Regular Season,"Lauri Markkanen,Kelly Olynyk,Jarred Vanderbilt...",Denver,30,23,27,22,,,,,,102,240.0,40,83,5,22,17,18,10,25,35,21,23,10,21,21,3,102,101.120258,101.120258,100.869996,121.637348,3+,-261,"Michael Porter Jr.,Aaron Gordon,Nikola Jokic,K...",Tony Brothers,"Kevin Cutler,Lauren Holtkamp",2,1,1


In [11]:
def add_four_factors(df):
    """
    Add the Four Factors columns to an NBA game statistics DataFrame.

    This function calculates the Four Factors (Effective Field Goal Percentage,
    Turnover Rate, Offensive Rebound Rate, Free Throw Rate) for both home and road teams
    and adds these as new columns to the provided DataFrame.

    Parameters:
    df (pd.DataFrame): A DataFrame containing NBA game statistics.

    Returns:
    pd.DataFrame: The original DataFrame with added columns for the Four Factors
                  for both home and road teams.
    """

    # Define the column mappings for home and road teams
    stats_mapping = {
        "home": {
            "fgm": "home_fg",
            "fga": "home_fga",
            "3pm": "home_3p",
            "ftm": "home_ft",
            "fta": "home_fta",
            "orb": "home_or",
            "drb": "home_dr",
            "tov": "home_to_to",
        },
        "road": {
            "fgm": "road_fg",
            "fga": "road_fga",
            "3pm": "road_3p",
            "ftm": "road_ft",
            "fta": "road_fta",
            "orb": "road_or",
            "drb": "road_dr",
            "tov": "road_to_to",
        },
    }

    # Function to calculate the Four Factors for a given team type
    def calculate_four_factors(df, team_type):
        factors = {}
        stats = stats_mapping[team_type]

        # eFG%
        factors["eFG%"] = (df[stats["fgm"]] + 0.5 * df[stats["3pm"]]) / df[stats["fga"]]

        # TOV%
        factors["TOV%"] = df[stats["tov"]] / (
            df[stats["fga"]] + 0.44 * df[stats["fta"]] + df[stats["tov"]]
        )

        # ORB%
        factors["ORB%"] = df[stats["orb"]] / (
            df[stats["orb"]] + df[stats["drb"]].shift(-1)
        )

        # FT%
        factors["FT%"] = df[stats["ftm"]] / df[stats["fga"]]

        return pd.DataFrame(factors)

    # Calculate Four Factors for both home and road teams
    home_factors = calculate_four_factors(df, "home")
    road_factors = calculate_four_factors(df, "road")

    # Combine the results and add them to the original DataFrame
    four_factors = pd.concat(
        [home_factors.add_prefix("home_"), road_factors.add_prefix("road_")], axis=1
    )
    return pd.concat([df, four_factors], axis=1)

In [12]:
df_4 = add_four_factors(df_3)

In [13]:
df_4.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,road_ot2,road_ot3,road_ot4,road_ot5,road_f,road_min,road_fg,road_fga,road_3p,road_3pa,road_ft,road_fta,road_or,road_dr,road_tot,road_a,road_pf,road_st,road_to,road_to_to,road_bl,road_pts,road_poss,road_pace,road_oeff,road_deff,road_team_rest_days,road_moneyline,road_starting_lineup,crew_chief,referee_umpire,day_of_season,home_team_game_num,road_team_game_num,home_eFG%,home_TOV%,home_ORB%,home_FT%,road_eFG%,road_TOV%,road_ORB%,road_FT%
0,NBA 2022-2023 Regular Season,22200001,2022-10-18,Boston,24,39,35,28,,,,,,126,240.0,46,82,12,35,22,28,6,30,36,24,24,8,10,11,3,126,98.680535,98.680535,127.684756,118.564416,3+,-4.0,213.5,-3.0,216.5,-150,2023,-2023 Regular Season,"Jaylen Brown,Jayson Tatum,Al Horford,Derrick W...",Philadelphia,29,34,25,29,,,,,,117,240.0,40,80,13,34,24,28,4,27,31,16,25,8,14,14,3,117,98.680535,98.680535,118.564416,127.684756,3+,127,"Tobias Harris,P.J. Tucker,Joel Embiid,Tyrese M...",James Capers,"Brian Forte,Ray Acosta",1,1,1,0.634146,0.104444,0.139535,0.268293,0.58125,0.131678,0.093023,0.3
1,NBA 2022-2023 Regular Season,22200002,2022-10-18,Golden State,25,34,32,32,,,,,,123,240.0,45,99,16,45,17,23,11,37,48,31,23,11,18,18,4,123,114.091809,114.091809,107.807915,95.537095,3+,-6.0,227.5,-7.5,223.5,-306,2023,-2023 Regular Season,"Andrew Wiggins,Draymond Green,Kevon Looney,Kla...",LA Lakers,22,30,19,38,,,,,,109,240.0,40,94,10,40,19,25,9,39,48,23,18,12,21,22,4,109,114.091809,114.091809,95.537095,107.807915,3+,247,"Lonnie Walker IV,LeBron James,Anthony Davis,Ru...",Tony Brothers,"Rodney Mott,Scott Twardoski",1,1,1,0.535354,0.141598,0.229167,0.171717,0.478723,0.173228,0.214286,0.202128
13,NBA 2022-2023 Regular Season,22200014,2022-10-19,Sacramento,23,32,29,24,,,,,,108,240.0,39,85,17,44,13,19,4,37,41,27,25,8,15,16,5,108,101.731855,101.731855,106.161438,113.042271,3+,-1.5,223.5,-3.0,229.5,-149,2023,-2023 Regular Season,"Harrison Barnes,KZ Okpala,Domantas Sabonis,Kev...",Portland,32,19,33,31,,,,,,115,240.0,39,88,11,28,26,33,11,33,44,20,17,11,11,11,2,115,101.731855,101.731855,113.042271,106.161438,3+,124,"Josh Hart,Jerami Grant,Jusuf Nurkic,Anfernee S...",Courtney Kirkland,"Brandon Adair,Justin Van Duyne",2,1,1,0.558824,0.146306,0.111111,0.152941,0.505682,0.096899,0.23913,0.295455
12,NBA 2022-2023 Regular Season,22200013,2022-10-19,Phoenix,24,21,31,31,,,,,,107,240.0,40,85,8,22,19,22,8,32,40,25,29,4,12,12,5,107,95.829038,95.829038,111.657178,109.570128,3+,-5.5,216.0,-4.0,218.0,-180,2023,-2023 Regular Season,"Mikal Bridges,Cameron Johnson,Deandre Ayton,De...",Dallas,32,30,19,24,,,,,,105,240.0,35,75,14,35,21,34,5,35,40,17,22,6,12,12,4,105,95.829038,95.829038,109.570128,111.657178,3+,149,"Reggie Bullock,Dorian Finney-Smith,JaVale McGe...",Derek Richardson,"Eric Lewis,Gediminas Petraitis",2,1,1,0.517647,0.112486,0.2,0.223529,0.56,0.117693,0.166667,0.28
11,NBA 2022-2023 Regular Season,22200012,2022-10-19,Utah,37,38,19,29,,,,,,123,240.0,42,83,16,38,23,31,11,32,43,27,21,10,19,20,1,123,101.120258,101.120258,121.637348,100.869996,3+,7.5,218.0,7.0,225.5,213,2023,-2023 Regular Season,"Lauri Markkanen,Kelly Olynyk,Jarred Vanderbilt...",Denver,30,23,27,22,,,,,,102,240.0,40,83,5,22,17,18,10,25,35,21,23,10,21,21,3,102,101.120258,101.120258,100.869996,121.637348,3+,-261,"Michael Porter Jr.,Aaron Gordon,Nikola Jokic,K...",Tony Brothers,"Kevin Cutler,Lauren Holtkamp",2,1,1,0.60241,0.171468,0.211538,0.277108,0.512048,0.187634,0.196078,0.204819


In [14]:
def add_game_outcome(df):
    """
    Add columns indicating the winning and losing teams for each game in an NBA dataset.

    This function uses the home and road team points to determine the winner and loser of each game.
    It adds two new columns, 'winner' and 'loser', to the dataset.

    Parameters:
    df (pd.DataFrame): A DataFrame containing NBA game statistics, including home and road team points.

    Returns:
    pd.DataFrame: The original DataFrame with two new columns 'winner' and 'loser'.
    """

    # Determine the winning and losing teams based on points
    df["winner"] = df.apply(
        lambda x: x["home_team"] if x["home_pts"] > x["road_pts"] else x["road_team"],
        axis=1,
    )
    df["loser"] = df.apply(
        lambda x: x["road_team"] if x["home_pts"] > x["road_pts"] else x["home_team"],
        axis=1,
    )

    return df

In [15]:
df_5 = add_game_outcome(df_4)

In [16]:
df_5[["home_team", "road_team", "home_pts", "road_pts", "winner", "loser"]].head()

Unnamed: 0,home_team,road_team,home_pts,road_pts,winner,loser
0,Boston,Philadelphia,126,117,Boston,Philadelphia
1,Golden State,LA Lakers,123,109,Golden State,LA Lakers
13,Sacramento,Portland,108,115,Portland,Sacramento
12,Phoenix,Dallas,107,105,Phoenix,Dallas
11,Utah,Denver,123,102,Utah,Denver


In [17]:
def add_win_loss_info(df):
    """
    Add cumulative and last two weeks' statistics for wins, losses,
    and winning percentage for both home and road teams in an NBA dataset.
    """
    # Initialization
    wins, losses = {}, {}

    # Define the columns to be added
    columns = [
        "home_wins",
        "home_losses",
        "home_win_pct",
        "road_wins",
        "road_losses",
        "road_win_pct",
        "home_wins_l2w",
        "home_losses_l2w",
        "home_win_pct_l2w",
        "road_wins_l2w",
        "road_losses_l2w",
        "road_win_pct_l2w",
    ]
    for col in columns:
        df[col] = 0

    # Iterate through the DataFrame
    for index, row in df.iterrows():
        date = pd.to_datetime(row["date"])
        home_team, road_team = row["home_team"], row["road_team"]
        home_win = row["winner"] == home_team

        # Update cumulative stats
        df.at[index, "home_wins"] = wins.get(home_team, 0)
        df.at[index, "home_losses"] = losses.get(home_team, 0)
        df.at[index, "road_wins"] = wins.get(road_team, 0)
        df.at[index, "road_losses"] = losses.get(road_team, 0)

        # Update cumulative winning percentages
        if wins.get(home_team, 0) + losses.get(home_team, 0) > 0:
            df.at[index, "home_win_pct"] = wins[home_team] / (
                wins[home_team] + losses[home_team]
            )
        if wins.get(road_team, 0) + losses.get(road_team, 0) > 0:
            df.at[index, "road_win_pct"] = wins[road_team] / (
                wins[road_team] + losses[road_team]
            )

        # Filter for last two weeks' games
        l2w_start_date = date - timedelta(days=14)
        l2w_games = df[(df["date"] >= l2w_start_date) & (df["date"] < date)]

        # Calculate last two weeks' wins and losses
        l2w_home_wins = len(
            l2w_games[
                (l2w_games["home_team"] == home_team)
                & (l2w_games["winner"] == home_team)
            ]
        )
        l2w_home_losses = len(
            l2w_games[
                (l2w_games["home_team"] == home_team)
                & (l2w_games["winner"] != home_team)
            ]
        )
        l2w_road_wins = len(
            l2w_games[
                (l2w_games["road_team"] == road_team)
                & (l2w_games["winner"] == road_team)
            ]
        )
        l2w_road_losses = len(
            l2w_games[
                (l2w_games["road_team"] == road_team)
                & (l2w_games["winner"] != road_team)
            ]
        )

        # Update last two weeks' stats in the DataFrame
        df.at[index, "home_wins_l2w"] = l2w_home_wins
        df.at[index, "home_losses_l2w"] = l2w_home_losses
        df.at[index, "road_wins_l2w"] = l2w_road_wins
        df.at[index, "road_losses_l2w"] = l2w_road_losses
        df.at[index, "home_win_pct_l2w"] = (
            l2w_home_wins / (l2w_home_wins + l2w_home_losses)
            if l2w_home_wins + l2w_home_losses > 0
            else 0
        )
        df.at[index, "road_win_pct_l2w"] = (
            l2w_road_wins / (l2w_road_wins + l2w_road_losses)
            if l2w_road_wins + l2w_road_losses > 0
            else 0
        )

        # Update cumulative wins and losses for next iteration
        if home_team not in wins:
            wins[home_team] = 0
        if home_team not in losses:
            losses[home_team] = 0
        if road_team not in wins:
            wins[road_team] = 0
        if road_team not in losses:
            losses[road_team] = 0

        wins[home_team] += int(home_win)
        losses[home_team] += int(not home_win)
        wins[road_team] += int(not home_win)
        losses[road_team] += int(home_win)

    return df

In [18]:
df_6 = add_win_loss_info(df_5)

In [19]:
df_6.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,...,road_fg,road_fga,road_3p,road_3pa,road_ft,road_fta,road_or,road_dr,road_tot,road_a,road_pf,road_st,road_to,road_to_to,road_bl,road_pts,road_poss,road_pace,road_oeff,road_deff,road_team_rest_days,road_moneyline,road_starting_lineup,crew_chief,referee_umpire,day_of_season,home_team_game_num,road_team_game_num,home_eFG%,home_TOV%,home_ORB%,home_FT%,road_eFG%,road_TOV%,road_ORB%,road_FT%,winner,loser,home_wins,home_losses,home_win_pct,road_wins,road_losses,road_win_pct,home_wins_l2w,home_losses_l2w,home_win_pct_l2w,road_wins_l2w,road_losses_l2w,road_win_pct_l2w
0,NBA 2022-2023 Regular Season,22200001,2022-10-18,Boston,24,39,35,28,,,,,,126,240.0,46,82,12,35,22,28,6,30,36,24,24,8,10,11,3,126,98.680535,98.680535,127.684756,118.564416,3+,-4.0,213.5,-3.0,216.5,-150,2023,-2023 Regular Season,"Jaylen Brown,Jayson Tatum,Al Horford,Derrick W...",Philadelphia,29,34,25,29,,...,40,80,13,34,24,28,4,27,31,16,25,8,14,14,3,117,98.680535,98.680535,118.564416,127.684756,3+,127,"Tobias Harris,P.J. Tucker,Joel Embiid,Tyrese M...",James Capers,"Brian Forte,Ray Acosta",1,1,1,0.634146,0.104444,0.139535,0.268293,0.58125,0.131678,0.093023,0.3,Boston,Philadelphia,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
1,NBA 2022-2023 Regular Season,22200002,2022-10-18,Golden State,25,34,32,32,,,,,,123,240.0,45,99,16,45,17,23,11,37,48,31,23,11,18,18,4,123,114.091809,114.091809,107.807915,95.537095,3+,-6.0,227.5,-7.5,223.5,-306,2023,-2023 Regular Season,"Andrew Wiggins,Draymond Green,Kevon Looney,Kla...",LA Lakers,22,30,19,38,,...,40,94,10,40,19,25,9,39,48,23,18,12,21,22,4,109,114.091809,114.091809,95.537095,107.807915,3+,247,"Lonnie Walker IV,LeBron James,Anthony Davis,Ru...",Tony Brothers,"Rodney Mott,Scott Twardoski",1,1,1,0.535354,0.141598,0.229167,0.171717,0.478723,0.173228,0.214286,0.202128,Golden State,LA Lakers,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
13,NBA 2022-2023 Regular Season,22200014,2022-10-19,Sacramento,23,32,29,24,,,,,,108,240.0,39,85,17,44,13,19,4,37,41,27,25,8,15,16,5,108,101.731855,101.731855,106.161438,113.042271,3+,-1.5,223.5,-3.0,229.5,-149,2023,-2023 Regular Season,"Harrison Barnes,KZ Okpala,Domantas Sabonis,Kev...",Portland,32,19,33,31,,...,39,88,11,28,26,33,11,33,44,20,17,11,11,11,2,115,101.731855,101.731855,113.042271,106.161438,3+,124,"Josh Hart,Jerami Grant,Jusuf Nurkic,Anfernee S...",Courtney Kirkland,"Brandon Adair,Justin Van Duyne",2,1,1,0.558824,0.146306,0.111111,0.152941,0.505682,0.096899,0.23913,0.295455,Portland,Sacramento,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
12,NBA 2022-2023 Regular Season,22200013,2022-10-19,Phoenix,24,21,31,31,,,,,,107,240.0,40,85,8,22,19,22,8,32,40,25,29,4,12,12,5,107,95.829038,95.829038,111.657178,109.570128,3+,-5.5,216.0,-4.0,218.0,-180,2023,-2023 Regular Season,"Mikal Bridges,Cameron Johnson,Deandre Ayton,De...",Dallas,32,30,19,24,,...,35,75,14,35,21,34,5,35,40,17,22,6,12,12,4,105,95.829038,95.829038,109.570128,111.657178,3+,149,"Reggie Bullock,Dorian Finney-Smith,JaVale McGe...",Derek Richardson,"Eric Lewis,Gediminas Petraitis",2,1,1,0.517647,0.112486,0.2,0.223529,0.56,0.117693,0.166667,0.28,Phoenix,Dallas,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
11,NBA 2022-2023 Regular Season,22200012,2022-10-19,Utah,37,38,19,29,,,,,,123,240.0,42,83,16,38,23,31,11,32,43,27,21,10,19,20,1,123,101.120258,101.120258,121.637348,100.869996,3+,7.5,218.0,7.0,225.5,213,2023,-2023 Regular Season,"Lauri Markkanen,Kelly Olynyk,Jarred Vanderbilt...",Denver,30,23,27,22,,...,40,83,5,22,17,18,10,25,35,21,23,10,21,21,3,102,101.120258,101.120258,100.869996,121.637348,3+,-261,"Michael Porter Jr.,Aaron Gordon,Nikola Jokic,K...",Tony Brothers,"Kevin Cutler,Lauren Holtkamp",2,1,1,0.60241,0.171468,0.211538,0.277108,0.512048,0.187634,0.196078,0.204819,Utah,Denver,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0


In [20]:
def calculate_stat_average(df, current_date, team, stat, last_2_weeks=False):
    """
    Calculate the average of a given statistic for a team up to a specified date.
    If last_2_weeks is True, the calculation is restricted to the last two weeks.

    :param df: DataFrame containing the NBA data.
    :param current_date: The date of the current game.
    :param team: The team for which the statistic is calculated.
    :param stat: The statistic abbreviation (e.g., 'pts' for points).
    :param last_2_weeks: Boolean, if True, calculate the average for the last two weeks.
    :return: Average of the specified statistic.
    """
    # Filter for games involving the team before the current date
    relevant_games = df[
        (df["date"] < current_date)
        & ((df["home_team"] == team) | (df["road_team"] == team))
    ]

    # Consider only the last 2 weeks if required
    if last_2_weeks:
        two_weeks_ago = current_date - timedelta(days=14)
        relevant_games = relevant_games[relevant_games["date"] >= two_weeks_ago]

    # Calculate the total of the statistic
    total_stat = relevant_games.apply(
        lambda row: row[f"home_{stat}"]
        if row["home_team"] == team
        else row[f"road_{stat}"],
        axis=1,
    ).sum()

    # Calculate the average
    num_games = len(relevant_games)
    return total_stat / num_games if num_games != 0 else 0


def add_stats_columns(df, stats):
    """
    Adds columns for multiple statistics. For each stat, it creates columns for home_avg_stat,
    road_avg_stat, home_avg_stat_l2w, and road_avg_stat_l2w.

    :param df: DataFrame containing the NBA data.
    :param stats: List of statistic abbreviations (e.g., ['pts', 'or']).
    :return: DataFrame with added columns for each statistic.
    """

    def apply_stats(row):
        date = row["date"]
        home_team = row["home_team"]
        road_team = row["road_team"]

        for stat in stats:
            # Define new column names
            home_avg_stat = f"home_avg_{stat}"
            road_avg_stat = f"road_avg_{stat}"
            home_avg_l2w = f"home_avg_{stat}_l2w"
            road_avg_l2w = f"road_avg_{stat}_l2w"

            # Calculate and assign the averages
            row[home_avg_stat] = calculate_stat_average(df, date, home_team, stat)
            row[road_avg_stat] = calculate_stat_average(df, date, road_team, stat)
            row[home_avg_l2w] = calculate_stat_average(
                df, date, home_team, stat, last_2_weeks=True
            )
            row[road_avg_l2w] = calculate_stat_average(
                df, date, road_team, stat, last_2_weeks=True
            )

        return row

    # Apply the function to each row of the DataFrame
    return df.apply(apply_stats, axis=1)

In [21]:
stat_list = [
    "1q",
    "2q",
    "3q",
    "4q",
    "ot1",
    "ot2",
    "ot3",
    "ot4",
    "ot5",
    "f",
    "min",
    "fg",
    "fga",
    "3p",
    "3pa",
    "ft",
    "fta",
    "or",
    "dr",
    "tot",
    "a",
    "pf",
    "st",
    "to",
    "to_to",
    "bl",
    "pts",
    "poss",
    "pace",
    "oeff",
    "deff",
    "eFG%",
    "TOV%",
    "ORB%",
    "FT%",
]

df_7 = add_stats_columns(df_6, stat_list)

In [22]:
df_7.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,...,home_avg_st_l2w,road_avg_st_l2w,home_avg_to,road_avg_to,home_avg_to_l2w,road_avg_to_l2w,home_avg_to_to,road_avg_to_to,home_avg_to_to_l2w,road_avg_to_to_l2w,home_avg_bl,road_avg_bl,home_avg_bl_l2w,road_avg_bl_l2w,home_avg_pts,road_avg_pts,home_avg_pts_l2w,road_avg_pts_l2w,home_avg_poss,road_avg_poss,home_avg_poss_l2w,road_avg_poss_l2w,home_avg_pace,road_avg_pace,home_avg_pace_l2w,road_avg_pace_l2w,home_avg_oeff,road_avg_oeff,home_avg_oeff_l2w,road_avg_oeff_l2w,home_avg_deff,road_avg_deff,home_avg_deff_l2w,road_avg_deff_l2w,home_avg_eFG%,road_avg_eFG%,home_avg_eFG%_l2w,road_avg_eFG%_l2w,home_avg_TOV%,road_avg_TOV%,home_avg_TOV%_l2w,road_avg_TOV%_l2w,home_avg_ORB%,road_avg_ORB%,home_avg_ORB%_l2w,road_avg_ORB%_l2w,home_avg_FT%,road_avg_FT%,home_avg_FT%_l2w,road_avg_FT%_l2w
0,NBA 2022-2023 Regular Season,22200001,2022-10-18,Boston,24,39,35,28,,,,,,126,240.0,46,82,12,35,22,28,6,30,36,24,24,8,10,11,3,126,98.680535,98.680535,127.684756,118.564416,3+,-4.0,213.5,-3.0,216.5,-150,2023,-2023 Regular Season,"Jaylen Brown,Jayson Tatum,Al Horford,Derrick W...",Philadelphia,29,34,25,29,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,NBA 2022-2023 Regular Season,22200002,2022-10-18,Golden State,25,34,32,32,,,,,,123,240.0,45,99,16,45,17,23,11,37,48,31,23,11,18,18,4,123,114.091809,114.091809,107.807915,95.537095,3+,-6.0,227.5,-7.5,223.5,-306,2023,-2023 Regular Season,"Andrew Wiggins,Draymond Green,Kevon Looney,Kla...",LA Lakers,22,30,19,38,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,NBA 2022-2023 Regular Season,22200014,2022-10-19,Sacramento,23,32,29,24,,,,,,108,240.0,39,85,17,44,13,19,4,37,41,27,25,8,15,16,5,108,101.731855,101.731855,106.161438,113.042271,3+,-1.5,223.5,-3.0,229.5,-149,2023,-2023 Regular Season,"Harrison Barnes,KZ Okpala,Domantas Sabonis,Kev...",Portland,32,19,33,31,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,NBA 2022-2023 Regular Season,22200013,2022-10-19,Phoenix,24,21,31,31,,,,,,107,240.0,40,85,8,22,19,22,8,32,40,25,29,4,12,12,5,107,95.829038,95.829038,111.657178,109.570128,3+,-5.5,216.0,-4.0,218.0,-180,2023,-2023 Regular Season,"Mikal Bridges,Cameron Johnson,Deandre Ayton,De...",Dallas,32,30,19,24,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,NBA 2022-2023 Regular Season,22200012,2022-10-19,Utah,37,38,19,29,,,,,,123,240.0,42,83,16,38,23,31,11,32,43,27,21,10,19,20,1,123,101.120258,101.120258,121.637348,100.869996,3+,7.5,218.0,7.0,225.5,213,2023,-2023 Regular Season,"Lauri Markkanen,Kelly Olynyk,Jarred Vanderbilt...",Denver,30,23,27,22,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
def calculate_pts_allowed_average(df, current_date, team, last_2_weeks=False):
    """
    Calculate the average points allowed by a team up to a specified date.
    Points allowed are the points scored by the opposing team.

    :param df: DataFrame containing the NBA data.
    :param current_date: The date of the current game.
    :param team: The team for which the points allowed is calculated.
    :param last_2_weeks: Boolean, if True, calculate the average for the last two weeks.
    :return: Average points allowed.
    """
    # Filter for games involving the team before the current date
    relevant_games = df[
        (df["date"] < current_date)
        & ((df["home_team"] == team) | (df["road_team"] == team))
    ]

    # Consider only the last 2 weeks if required
    if last_2_weeks:
        two_weeks_ago = current_date - timedelta(days=14)
        relevant_games = relevant_games[relevant_games["date"] >= two_weeks_ago]

    # Calculate the total points allowed by the team
    total_points_allowed = relevant_games.apply(
        lambda row: row["road_pts"] if row["home_team"] == team else row["home_pts"],
        axis=1,
    ).sum()

    # Calculate the average points allowed
    num_games = len(relevant_games)
    return total_points_allowed / num_games if num_games != 0 else 0


def add_pts_allowed_columns(df):
    """
    Adds columns for average points allowed for home and road teams, both overall and for the last two weeks.

    :param df: DataFrame containing the NBA data.
    :return: DataFrame with added columns for points allowed.
    """

    def apply_pts_allowed(row):
        date = row["date"]
        home_team = row["home_team"]
        road_team = row["road_team"]

        # Define new column names
        home_avg_pts_allowed = "home_avg_pts_allowed"
        road_avg_pts_allowed = "road_avg_pts_allowed"
        home_avg_pts_allowed_l2w = "home_avg_pts_allowed_l2w"
        road_avg_pts_allowed_l2w = "road_avg_pts_allowed_l2w"

        # Calculate and assign the averages for points allowed
        row[home_avg_pts_allowed] = calculate_pts_allowed_average(df, date, home_team)
        row[road_avg_pts_allowed] = calculate_pts_allowed_average(df, date, road_team)
        row[home_avg_pts_allowed_l2w] = calculate_pts_allowed_average(
            df, date, home_team, last_2_weeks=True
        )
        row[road_avg_pts_allowed_l2w] = calculate_pts_allowed_average(
            df, date, road_team, last_2_weeks=True
        )

        return row

    # Apply the function to each row of the DataFrame
    return df.apply(apply_pts_allowed, axis=1)

In [24]:
df_8 = add_pts_allowed_columns(df_7)

In [25]:
df_8.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,...,home_avg_to_l2w,road_avg_to_l2w,home_avg_to_to,road_avg_to_to,home_avg_to_to_l2w,road_avg_to_to_l2w,home_avg_bl,road_avg_bl,home_avg_bl_l2w,road_avg_bl_l2w,home_avg_pts,road_avg_pts,home_avg_pts_l2w,road_avg_pts_l2w,home_avg_poss,road_avg_poss,home_avg_poss_l2w,road_avg_poss_l2w,home_avg_pace,road_avg_pace,home_avg_pace_l2w,road_avg_pace_l2w,home_avg_oeff,road_avg_oeff,home_avg_oeff_l2w,road_avg_oeff_l2w,home_avg_deff,road_avg_deff,home_avg_deff_l2w,road_avg_deff_l2w,home_avg_eFG%,road_avg_eFG%,home_avg_eFG%_l2w,road_avg_eFG%_l2w,home_avg_TOV%,road_avg_TOV%,home_avg_TOV%_l2w,road_avg_TOV%_l2w,home_avg_ORB%,road_avg_ORB%,home_avg_ORB%_l2w,road_avg_ORB%_l2w,home_avg_FT%,road_avg_FT%,home_avg_FT%_l2w,road_avg_FT%_l2w,home_avg_pts_allowed,road_avg_pts_allowed,home_avg_pts_allowed_l2w,road_avg_pts_allowed_l2w
0,NBA 2022-2023 Regular Season,22200001,2022-10-18,Boston,24,39,35,28,,,,,,126,240.0,46,82,12,35,22,28,6,30,36,24,24,8,10,11,3,126,98.680535,98.680535,127.684756,118.564416,3+,-4.0,213.5,-3.0,216.5,-150,2023,-2023 Regular Season,"Jaylen Brown,Jayson Tatum,Al Horford,Derrick W...",Philadelphia,29,34,25,29,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,NBA 2022-2023 Regular Season,22200002,2022-10-18,Golden State,25,34,32,32,,,,,,123,240.0,45,99,16,45,17,23,11,37,48,31,23,11,18,18,4,123,114.091809,114.091809,107.807915,95.537095,3+,-6.0,227.5,-7.5,223.5,-306,2023,-2023 Regular Season,"Andrew Wiggins,Draymond Green,Kevon Looney,Kla...",LA Lakers,22,30,19,38,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,NBA 2022-2023 Regular Season,22200014,2022-10-19,Sacramento,23,32,29,24,,,,,,108,240.0,39,85,17,44,13,19,4,37,41,27,25,8,15,16,5,108,101.731855,101.731855,106.161438,113.042271,3+,-1.5,223.5,-3.0,229.5,-149,2023,-2023 Regular Season,"Harrison Barnes,KZ Okpala,Domantas Sabonis,Kev...",Portland,32,19,33,31,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,NBA 2022-2023 Regular Season,22200013,2022-10-19,Phoenix,24,21,31,31,,,,,,107,240.0,40,85,8,22,19,22,8,32,40,25,29,4,12,12,5,107,95.829038,95.829038,111.657178,109.570128,3+,-5.5,216.0,-4.0,218.0,-180,2023,-2023 Regular Season,"Mikal Bridges,Cameron Johnson,Deandre Ayton,De...",Dallas,32,30,19,24,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,NBA 2022-2023 Regular Season,22200012,2022-10-19,Utah,37,38,19,29,,,,,,123,240.0,42,83,16,38,23,31,11,32,43,27,21,10,19,20,1,123,101.120258,101.120258,121.637348,100.869996,3+,7.5,218.0,7.0,225.5,213,2023,-2023 Regular Season,"Lauri Markkanen,Kelly Olynyk,Jarred Vanderbilt...",Denver,30,23,27,22,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
def add_targets(df):
    """
    Add various target columns to an NBA dataset for betting and game outcome analysis.

    This function calculates several targets based on game statistics and betting lines.

    Parameters:
    df (pd.DataFrame): DataFrame containing NBA game statistics, including points, spreads, and totals.

    Returns:
    pd.DataFrame: The original DataFrame with added target columns for analysis.

    The function adds the following columns:
    - REG_TARGET: Point differential of the game (home points minus road points).
    - CLS_TARGET: Boolean indicating if the home team beat the opening spread.
    - CLS_TARGET_closing_spread: Boolean indicating if the home team beat the closing spread.
    - REG_TARGET_OU: Total points scored in the game (home points plus road points).
    - CLS_TARGET_OU_OPEN: Boolean indicating if the total points exceeded the opening total.
    - CLS_TARGET_OU_CLOSE: Boolean indicating if the total points exceeded the closing total.
    """

    # REG_TARGET: Point differential (home points - road points)
    # It represents the margin of victory or defeat for the home team.
    df["REG_TARGET"] = df["home_pts"] - df["road_pts"]

    # CLS_TARGET: Boolean indicating if the home team covered the opening spread.
    # True if home team's win margin is greater than the negative of the opening spread.
    # It's used to determine if the home team performed better than the pre-game expectations.
    df["CLS_TARGET"] = df["REG_TARGET"] > -df["home_opening_spread"]

    # CLS_TARGET_closing_spread: Similar to CLS_TARGET but using the closing spread.
    # It reflects the home team's performance against the final betting line before the game.
    df["CLS_TARGET_closing_spread"] = df["REG_TARGET"] > -df["home_closing_spread"]

    # REG_TARGET_OU: Sum of home and road points, indicating total points scored in the game.
    df["REG_TARGET_OU"] = df["home_pts"] + df["road_pts"]

    # CLS_TARGET_OU_OPEN: Boolean indicating if total points scored exceeded the opening total line.
    # It shows whether the game was higher-scoring than initially expected by bookmakers.
    df["CLS_TARGET_OU_OPEN"] = df["REG_TARGET_OU"] > df["opening_total"]

    # CLS_TARGET_OU_CLOSE: Similar to CLS_TARGET_OU_OPEN but with the closing total line.
    # It reflects whether the game's total score surpassed the final total points line set before the game.
    df["CLS_TARGET_OU_CLOSE"] = df["REG_TARGET_OU"] > df["closing_total"]

    return df

In [27]:
df_9 = add_targets(df_8)

  df["REG_TARGET"] = df["home_pts"] - df["road_pts"]
  df["CLS_TARGET"] = df["REG_TARGET"] > -df["home_opening_spread"]
  df["CLS_TARGET_closing_spread"] = df["REG_TARGET"] > -df["home_closing_spread"]
  df["REG_TARGET_OU"] = df["home_pts"] + df["road_pts"]
  df["CLS_TARGET_OU_OPEN"] = df["REG_TARGET_OU"] > df["opening_total"]
  df["CLS_TARGET_OU_CLOSE"] = df["REG_TARGET_OU"] > df["closing_total"]


In [28]:
df_9.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,...,home_avg_bl,road_avg_bl,home_avg_bl_l2w,road_avg_bl_l2w,home_avg_pts,road_avg_pts,home_avg_pts_l2w,road_avg_pts_l2w,home_avg_poss,road_avg_poss,home_avg_poss_l2w,road_avg_poss_l2w,home_avg_pace,road_avg_pace,home_avg_pace_l2w,road_avg_pace_l2w,home_avg_oeff,road_avg_oeff,home_avg_oeff_l2w,road_avg_oeff_l2w,home_avg_deff,road_avg_deff,home_avg_deff_l2w,road_avg_deff_l2w,home_avg_eFG%,road_avg_eFG%,home_avg_eFG%_l2w,road_avg_eFG%_l2w,home_avg_TOV%,road_avg_TOV%,home_avg_TOV%_l2w,road_avg_TOV%_l2w,home_avg_ORB%,road_avg_ORB%,home_avg_ORB%_l2w,road_avg_ORB%_l2w,home_avg_FT%,road_avg_FT%,home_avg_FT%_l2w,road_avg_FT%_l2w,home_avg_pts_allowed,road_avg_pts_allowed,home_avg_pts_allowed_l2w,road_avg_pts_allowed_l2w,REG_TARGET,CLS_TARGET,CLS_TARGET_closing_spread,REG_TARGET_OU,CLS_TARGET_OU_OPEN,CLS_TARGET_OU_CLOSE
0,NBA 2022-2023 Regular Season,22200001,2022-10-18,Boston,24,39,35,28,,,,,,126,240.0,46,82,12,35,22,28,6,30,36,24,24,8,10,11,3,126,98.680535,98.680535,127.684756,118.564416,3+,-4.0,213.5,-3.0,216.5,-150,2023,-2023 Regular Season,"Jaylen Brown,Jayson Tatum,Al Horford,Derrick W...",Philadelphia,29,34,25,29,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,True,True,243,True,True
1,NBA 2022-2023 Regular Season,22200002,2022-10-18,Golden State,25,34,32,32,,,,,,123,240.0,45,99,16,45,17,23,11,37,48,31,23,11,18,18,4,123,114.091809,114.091809,107.807915,95.537095,3+,-6.0,227.5,-7.5,223.5,-306,2023,-2023 Regular Season,"Andrew Wiggins,Draymond Green,Kevon Looney,Kla...",LA Lakers,22,30,19,38,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14,True,True,232,True,True
13,NBA 2022-2023 Regular Season,22200014,2022-10-19,Sacramento,23,32,29,24,,,,,,108,240.0,39,85,17,44,13,19,4,37,41,27,25,8,15,16,5,108,101.731855,101.731855,106.161438,113.042271,3+,-1.5,223.5,-3.0,229.5,-149,2023,-2023 Regular Season,"Harrison Barnes,KZ Okpala,Domantas Sabonis,Kev...",Portland,32,19,33,31,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-7,False,False,223,False,False
12,NBA 2022-2023 Regular Season,22200013,2022-10-19,Phoenix,24,21,31,31,,,,,,107,240.0,40,85,8,22,19,22,8,32,40,25,29,4,12,12,5,107,95.829038,95.829038,111.657178,109.570128,3+,-5.5,216.0,-4.0,218.0,-180,2023,-2023 Regular Season,"Mikal Bridges,Cameron Johnson,Deandre Ayton,De...",Dallas,32,30,19,24,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,False,False,212,False,False
11,NBA 2022-2023 Regular Season,22200012,2022-10-19,Utah,37,38,19,29,,,,,,123,240.0,42,83,16,38,23,31,11,32,43,27,21,10,19,20,1,123,101.120258,101.120258,121.637348,100.869996,3+,7.5,218.0,7.0,225.5,213,2023,-2023 Regular Season,"Lauri Markkanen,Kelly Olynyk,Jarred Vanderbilt...",Denver,30,23,27,22,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,True,True,225,True,False


In [29]:
def clean_odds_columns(df):
    """
    Cleans and converts betting odds columns in the dataframe to numeric format.

    The function handles specific cases in the odds columns:
    - Converts variations of 'even' or 'pk' (regardless of capitalization) to 0 in
      'home_opening_spread' and 'home_closing_spread' columns.
    - Converts values like '+XXX' or '-XXX' to numeric in 'home_moneyline' and 'road_moneyline' columns,
      also handling variations of 'even'.

    Parameters:
    df (DataFrame): The input dataframe with the odds columns.

    Returns:
    DataFrame: The dataframe with cleaned and numeric odds columns.
    """
    # Handling 'home_opening_spread' and 'home_closing_spread'
    # Replace variations of 'even' and 'pk' with 0, accounting for different capitalizations
    df["home_opening_spread"] = (
        df["home_opening_spread"]
        .replace(["even", "pk", "Even", "EVEN", "Pk", "PK"], 0)
        .astype(float)
    )
    df["home_closing_spread"] = (
        df["home_closing_spread"]
        .replace(["even", "pk", "Even", "EVEN", "Pk", "PK"], 0)
        .astype(float)
    )

    # Function to convert moneyline values to numeric
    def moneyline_to_numeric(value):
        if str(value).lower() in ["even", "pk"]:
            return 0
        elif isinstance(value, str) and (
            value.startswith("+") or value.startswith("-")
        ):
            return int(value)
        else:
            return pd.to_numeric(value, errors="coerce")

    # Apply the conversion function to 'home_moneyline' and 'road_moneyline'
    df["home_moneyline"] = df["home_moneyline"].apply(moneyline_to_numeric)
    df["road_moneyline"] = df["road_moneyline"].apply(moneyline_to_numeric)

    return df

In [30]:
df_10 = clean_odds_columns(df_9)

In [31]:
def encode_rest_days(df):
    """
    Encode 'home_team_rest_days' and 'road_team_rest_days' in the DataFrame with ordinal values.

    This function maps the rest day categories to ordinal numbers based on the amount of rest.
    '3+' indicating the most rest is mapped to the highest ordinal number, and '4IN5-B2B'
    indicating the least rest is mapped to the lowest ordinal number.

    Parameters:
    df (DataFrame): DataFrame containing 'home_team_rest_days' and 'road_team_rest_days' columns.

    Returns:
    DataFrame: Modified DataFrame with encoded rest day columns.
    """
    # Mapping from rest days categories to ordinal values
    rest_days_mapping = {
        "3+": 7,  # Most Rest
        2: 6,  # 2nd Most Rest
        1: 5,  # 3rd Most Rest
        "3IN4": 4,  # 4th Most Rest
        "B2B": 3,  # 5th Most Rest
        "3IN4-B2B": 2,  # 6th Most Rest
        "4IN5-B2B": 1,  # 7th Most Rest
    }

    # Apply the mapping to the DataFrame
    df["home_team_rest"] = df["home_team_rest_days"].map(rest_days_mapping)
    df["road_team_rest"] = df["road_team_rest_days"].map(rest_days_mapping)

    return df

In [32]:
df_11 = encode_rest_days(df_10)

  df["home_team_rest"] = df["home_team_rest_days"].map(rest_days_mapping)
  df["road_team_rest"] = df["road_team_rest_days"].map(rest_days_mapping)


In [33]:
def encode_lineups_to_vectors(df, existing_mapping=None):
    """
    Convert 'home_starting_lineup' and 'road_starting_lineup' in a DataFrame to 5-hot encoded vectors with an
    additional flag for unknown players, using an existing mapping if provided.

    Parameters:
    df (DataFrame): DataFrame containing 'home_starting_lineup' and 'road_starting_lineup' columns.
    existing_mapping (dict, optional): Mapping from player names to indices. If None, a new mapping is created.

    Returns:
    DataFrame, dict: Modified DataFrame with two new columns 'home_lineup_vector' and 'road_lineup_vector' containing
                     the 5-hot encoded vectors, and the mapping used for the encoding.
    """
    if existing_mapping is None:
        # Extract all unique player names from the lineups
        all_lineups = (
            df["home_starting_lineup"].tolist() + df["road_starting_lineup"].tolist()
        )
        all_players = set(
            player for lineup in all_lineups for player in lineup.split(",")
        )
        all_players.add("unknown")  # Add 'unknown' player

        # Create a mapping from player names to indices
        player_to_index = {player: i for i, player in enumerate(sorted(all_players))}
    else:
        player_to_index = existing_mapping

    def lineup_to_vector(lineup):
        vector = np.zeros(len(player_to_index))
        unknown = True

        for player in lineup.split(","):
            if player in player_to_index:
                vector[player_to_index[player]] = 1
                unknown = False

        if unknown:
            vector[player_to_index["unknown"]] = 1

        return vector

    df["home_lineup_vector"] = df["home_starting_lineup"].apply(lineup_to_vector)
    df["road_lineup_vector"] = df["road_starting_lineup"].apply(lineup_to_vector)

    return df, player_to_index

In [34]:
# Load the mapping from the file
with open("mapping.json", "r") as file:
    mapping = json.load(file)

In [35]:
# Usage during original training
# df_12, mapping = encode_lineups_to_vectors(df_11)

# Usage during testing (using the saved mapping)
df_12, _ = encode_lineups_to_vectors(df_11, existing_mapping=mapping)

  df["home_lineup_vector"] = df["home_starting_lineup"].apply(lineup_to_vector)
  df["road_lineup_vector"] = df["road_starting_lineup"].apply(lineup_to_vector)


In [36]:
# Convert the mapping to a JSON string and save it to a file
# with open('mapping.json', 'w') as file:
#     json.dump(mapping, file)

In [None]:
df_12.to_csv("../data/nba_ai/cleaned_data_2022-2023.csv", index=False)