# NBA AI - Data Loading and Cleaning

### Imports and Global Settings

In [1]:
import pandas as pd
import re
from datetime import timedelta

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)

In [2]:
def load_and_clean_nba_data(file_path):
    """
    Loads and cleans NBA game data from the 'NBA_Box_Score_Team-Stats.xlsx' file provided by BIGDATABALL.

    This function is tailored to handle the specific format of the BIGDATABALL NBA dataset. It performs the following steps:
    1. Loads the first sheet of the 'NBA_Box_Score_Team-Stats.xlsx' file into a Pandas DataFrame.
    2. Converts column names to snake case for consistency and easier access.
    3. Converts the 'date' column to a Pandas datetime format for proper time series analysis.
    4. Concatenates player names in the starting lineup columns into a single 'starting_lineup' column.
    5. Drops the original starting lineup columns and other specified columns like 'box_score_url' and 'full_game_odds_url'.

    Parameters:
    - file_path (str): The file path of the 'NBA_Box_Score_Team-Stats.xlsx' file to be loaded.

    Returns:
    - pandas.DataFrame: A cleaned DataFrame containing the NBA game data from BIGDATABALL.
    """

    # Helper function to convert column names to snake case
    def to_snake_case(name):
        name = re.sub(r"\s+", "_", name)
        name = re.sub(r"\W+", "_", name)
        name = re.sub(r"_+", "_", name)
        name = re.sub(r"^_|_$", "", name)
        return name.lower()

    # Load the first sheet into a DataFrame
    df = pd.read_excel(file_path, sheet_name=0)

    # Rename columns to snake case
    df.columns = [to_snake_case(col) for col in df.columns]

    # Convert 'date' column to datetime
    df["date"] = pd.to_datetime(df["date"])

    # Extract the season by finding the last occurrence of four digits
    df["season"] = df["bigdataball_dataset"].str.extract(r"(\d{4})(?!.*\d)")

    # Extract the season type by taking everything after the last four digits
    df["season_type"] = df["bigdataball_dataset"].str.extract(r"(\d{4})\s*(.*)")[1]

    # Filter columns that start with 'unnamed'
    unnamed_columns = [col for col in df.columns if col.startswith("unnamed")]

    # Ensure that the 'starting_lineups' column is included
    columns_to_concatenate = ["starting_lineups"] + unnamed_columns

    # Concatenate the columns to create the full starting lineup
    df["starting_lineup"] = df[columns_to_concatenate].apply(
        lambda row: ",".join(row.dropna().astype(str)), axis=1
    )

    # Drop the original lineup columns
    df.drop(columns=columns_to_concatenate, inplace=True)

    # Remove the 'box_score_url' and 'full_game_odds_url' columns
    df = df.drop(["box_score_url", "full_game_odds_url"], axis=1)

    return df

In [3]:
df_1 = load_and_clean_nba_data("../data/nba_ai/2021-2022_NBA_Box_Score_Team-Stats.xlsx")

In [4]:
df_1.head()

Unnamed: 0,bigdataball_dataset,game_id,date,team,venue,1q,2q,3q,4q,ot1,ot2,ot3,ot4,ot5,f,min,fg,fga,3p,3pa,ft,fta,or,dr,tot,a,pf,st,to,to_to,bl,pts,poss,pace,oeff,deff,team_rest_days,main_ref,crew,opening_odds,opening_spread,opening_total,line_movement_1,line_movement_2,line_movement_3,closing_odds,closing_spread,closing_total,moneyline,halftime,season,season_type,starting_lineup
0,NBA 2021-2022 Regular Season,22100001,2021-10-19,Brooklyn,R,25,34,26,19,,,,,,104,240,37,84,17,32,13,23,5,39,44,19,17,3,12,13,9,104,102.843098,102.843098,101.12492,123.489085,3+,Josh Tiven,Jacyn Goble,240.5o -10,1.5,240.5,234.5u,234u,234u,234o -09,2.0,234.0,105,-2 -10,2022,-2022 Regular Season,"Kevin Durant,Blake Griffin,Nic Claxton,Joe Har..."
1,NBA 2021-2022 Regular Season,22100001,2021-10-19,Milwaukee,H,37,29,31,30,,,,,,127,240,48,105,17,45,14,18,13,41,54,25,19,8,7,8,9,127,102.843098,102.843098,123.489085,101.12492,3+,,Natalie Sago,-1.5 -12,-1.5,240.5,-1.5,-1,-1.5,-2 -08,-2.0,234.0,-126,114.5u -15,2022,-2022 Regular Season,"Khris Middleton,Giannis Antetokounmpo,Brook Lo..."
2,NBA 2021-2022 Regular Season,22100002,2021-10-19,Golden State,R,32,21,30,38,,,,,,121,240,41,93,14,39,25,30,9,41,50,30,18,9,17,17,2,121,113.282595,113.282595,106.812525,100.633288,3+,Sean Wright,Ray Acosta,230.5o -12,5.5,230.5,226o,226.5o,o226.5,226.5o -12,3.0,226.5,130,PK -05,2022,-2022 Regular Season,"Andrew Wiggins,Draymond Green,Kevon Looney,Jor..."
3,NBA 2021-2022 Regular Season,22100002,2021-10-19,LA Lakers,H,34,25,26,29,,,,,,114,240,45,95,15,42,9,19,5,40,45,21,25,7,17,18,4,114,113.282595,113.282595,100.633288,106.812525,3+,,Mark Lindsay,-5.5 -08,-5.5,230.5,-3,-3,-3,-3 -10,-3.0,226.5,-154,111.5o -20,2022,-2022 Regular Season,"LeBron James,Anthony Davis,DeAndre Jordan,Kent..."
4,NBA 2021-2022 Regular Season,22100003,2021-10-20,Indiana,R,38,37,13,34,,,,,,122,240,42,90,17,47,21,24,8,43,51,29,24,2,16,17,10,122,109.188208,109.188208,111.733678,112.649527,3+,Rodney Mott,Nate Green,+2 -08,2.0,228.5,'+1,+0.5,-1,-1.5 -12,-1.5,222.5,-122,+4.5 -17,2022,-2022 Regular Season,"Justin Holiday,Domantas Sabonis,Myles Turner,C..."


In [5]:
def merge_and_transform_nba_data(df):
    """
    Transforms the cleaned NBA DataFrame to create a single record for each game with updated handling of referee and odds information.

    This function:
    1. Splits the DataFrame into separate DataFrames for home and road teams.
    2. Renames and prefixes columns to indicate home or road team.
    3. Merges these DataFrames to create a single row per game.
    4. Removes the 'home_venue' and 'road_venue' columns as they are redundant.
    5. Combines the 'main_ref/crew_chief' and 'crew/referee_umpire' columns from both home and road records.
    6. Removes unused betting data columns.

    Parameters:
    df (pandas.DataFrame): The cleaned DataFrame containing NBA data.

    Returns:
    pandas.DataFrame: A transformed DataFrame with one row per game, combined referee information, and betting data.
    """

    # Splitting the DataFrame into home and road teams
    home_df = df[df["venue"] == "H"].copy()
    road_df = df[df["venue"] == "R"].copy()

    # Renaming columns for clarity
    home_df.rename(columns={"team": "home_team"}, inplace=True)
    road_df.rename(columns={"team": "road_team"}, inplace=True)

    # Adding a prefix to all relevant columns
    for col in home_df.columns:
        if col not in [
            "bigdataball_dataset",
            "game_id",
            "date",
            "season",
            "season_type",
            "home_team",
        ]:
            home_df.rename(columns={col: "home_" + col}, inplace=True)

    for col in road_df.columns:
        if col not in [
            "bigdataball_dataset",
            "game_id",
            "date",
            "season",
            "season_type",
            "road_team",
        ]:
            road_df.rename(columns={col: "road_" + col}, inplace=True)

    # Merging the DataFrames on common columns
    merged_df = pd.merge(
        home_df,
        road_df,
        left_on=["bigdataball_dataset", "game_id", "date", "season", "season_type"],
        right_on=["bigdataball_dataset", "game_id", "date", "season", "season_type"],
    )

    # Removing the 'home_venue' and 'road_venue' columns
    merged_df.drop(["home_venue", "road_venue"], axis=1, inplace=True)

    # Check which set of columns is present and set variables accordingly
    if "home_main_ref" in merged_df.columns and "road_main_ref" in merged_df.columns:
        main_ref_cols = ["home_main_ref", "road_main_ref"]
        crew_cols = ["home_crew", "road_crew"]
        main_ref_output = "main_ref"
        crew_output = "crew"
    elif (
        "home_crew_chief" in merged_df.columns
        and "road_crew_chief" in merged_df.columns
    ):
        main_ref_cols = ["home_crew_chief", "road_crew_chief"]
        crew_cols = ["home_referee_umpire", "road_referee_umpire"]
        main_ref_output = "crew_chief"
        crew_output = "referee_umpire"
    else:
        raise ValueError("Expected columns not found in DataFrame")

    # Process main_ref/crew_chief columns
    merged_df[main_ref_output] = merged_df.apply(
        lambda x: x[main_ref_cols[0]]
        if pd.notna(x[main_ref_cols[0]])
        else x[main_ref_cols[1]],
        axis=1,
    )
    merged_df.drop(main_ref_cols, axis=1, inplace=True)

    # Combine crew/referee_umpire columns
    def combine_crew(crew1, crew2):
        all_crew = set(crew1.split(",")) | set(crew2.split(","))
        return ",".join(sorted(all_crew - {""}))

    merged_df[crew_output] = merged_df.apply(
        lambda x: combine_crew(x[crew_cols[0]], x[crew_cols[1]]),
        axis=1,
    )
    merged_df.drop(crew_cols, axis=1, inplace=True)

    # Remove unused betting data columns
    merged_df = merged_df.rename(
        columns={
            "home_opening_total": "opening_total",
            "home_closing_total": "closing_total",
        }
    )

    unused_odds_columns = [
        "home_line_movement_1",
        "home_line_movement_2",
        "home_line_movement_3",
        "road_line_movement_1",
        "road_line_movement_2",
        "road_line_movement_3",
        "home_halftime",
        "road_halftime",
        "home_opening_odds",
        "road_opening_odds",
        "home_closing_odds",
        "road_closing_odds",
        "road_opening_total",
        "road_closing_total",
        "road_opening_spread",
        "road_closing_spread",
    ]
    merged_df.drop(unused_odds_columns, axis=1, inplace=True)

    return merged_df

In [6]:
df_2 = merge_and_transform_nba_data(df_1)

In [7]:
df_2.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,road_ot2,road_ot3,road_ot4,road_ot5,road_f,road_min,road_fg,road_fga,road_3p,road_3pa,road_ft,road_fta,road_or,road_dr,road_tot,road_a,road_pf,road_st,road_to,road_to_to,road_bl,road_pts,road_poss,road_pace,road_oeff,road_deff,road_team_rest_days,road_moneyline,road_starting_lineup,main_ref,crew
0,NBA 2021-2022 Regular Season,22100001,2021-10-19,Milwaukee,37,29,31,30,,,,,,127,240,48,105,17,45,14,18,13,41,54,25,19,8,7,8,9,127,102.843098,102.843098,123.489085,101.12492,3+,-1.5,240.5,-2.0,234.0,-126,2022,-2022 Regular Season,"Khris Middleton,Giannis Antetokounmpo,Brook Lo...",Brooklyn,25,34,26,19,,,,,,104,240,37,84,17,32,13,23,5,39,44,19,17,3,12,13,9,104,102.843098,102.843098,101.12492,123.489085,3+,105,"Kevin Durant,Blake Griffin,Nic Claxton,Joe Har...",Josh Tiven,"Jacyn Goble,Natalie Sago"
1,NBA 2021-2022 Regular Season,22100002,2021-10-19,LA Lakers,34,25,26,29,,,,,,114,240,45,95,15,42,9,19,5,40,45,21,25,7,17,18,4,114,113.282595,113.282595,100.633288,106.812525,3+,-5.5,230.5,-3.0,226.5,-154,2022,-2022 Regular Season,"LeBron James,Anthony Davis,DeAndre Jordan,Kent...",Golden State,32,21,30,38,,,,,,121,240,41,93,14,39,25,30,9,41,50,30,18,9,17,17,2,121,113.282595,113.282595,106.812525,100.633288,3+,130,"Andrew Wiggins,Draymond Green,Kevon Looney,Jor...",Sean Wright,"Mark Lindsay,Ray Acosta"
2,NBA 2021-2022 Regular Season,22100003,2021-10-20,Charlotte,27,32,33,31,,,,,,123,240,46,107,13,31,18,27,12,34,46,29,21,9,8,8,5,123,109.188208,109.188208,112.649527,111.733678,3+,-2.0,228.5,1.5,222.5,102,2022,-2022 Regular Season,"Gordon Hayward,Miles Bridges,Mason Plumlee,Kel...",Indiana,38,37,13,34,,,,,,122,240,42,90,17,47,21,24,8,43,51,29,24,2,16,17,10,122,109.188208,109.188208,111.733678,112.649527,3+,-122,"Justin Holiday,Domantas Sabonis,Myles Turner,C...",Rodney Mott,"Nate Green,Scott Wall"
3,NBA 2021-2022 Regular Season,22100004,2021-10-20,Detroit,20,24,25,19,,,,,,88,240,36,90,6,28,10,13,11,36,47,17,16,7,16,17,5,88,99.0012,99.0012,88.887811,94.948344,3+,2.5,220.5,5.5,218.0,182,2022,-2022 Regular Season,"Saddiq Bey,Jerami Grant,Isaiah Stewart,Frank J...",Chicago,14,26,31,23,,,,,,94,240,37,86,7,23,13,15,9,39,48,18,19,8,17,17,5,94,99.0012,99.0012,94.948344,88.887811,3+,-227,"DeMar DeRozan,Patrick Williams,Nikola Vucevic,...",Derek Richardson,"Mousa Dagher,Tyler Ford"
4,NBA 2021-2022 Regular Season,22100005,2021-10-20,New York,29,25,32,30,12.0,10.0,,,,138,290,51,105,17,45,19,27,7,48,55,27,22,9,17,19,10,138,126.497589,104.68766,109.092988,105.930872,3+,-1.0,217.5,-2.5,219.5,-138,2022,-2022 Regular Season,"RJ Barrett,Julius Randle,Mitchell Robinson,Eva...",Boston,35,23,24,34,12.0,6.0,,,,134,290,48,117,21,57,17,23,15,41,56,34,24,13,18,18,9,134,126.497589,104.68766,105.930872,109.092988,3+,116,"Jayson Tatum,Grant Williams,Robert Williams II...",Scott Foster,"Ed Malloy,Lauren Holtkamp"


In [8]:
def add_sequence_data(df):
    # Calculate the 'Day of Season'
    df["day_of_season"] = (df["date"] - df["date"].min()).dt.days + 1

    df = df.sort_values(by="date")

    # Function to calculate the game number
    def calculate_game_number(row, team_column, df):
        return len(
            df[
                (
                    (df["home_team"] == row[team_column])
                    | (df["road_team"] == row[team_column])
                )
                & (df["date"] <= row["date"])
            ]
        )

    # Calculate game numbers
    df["home_team_game_num"] = df.apply(
        calculate_game_number, team_column="home_team", df=df, axis=1
    )
    df["road_team_game_num"] = df.apply(
        calculate_game_number, team_column="road_team", df=df, axis=1
    )

    return df

In [9]:
df_3 = add_sequence_data(df_2)

In [10]:
df_3.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,road_ot2,road_ot3,road_ot4,road_ot5,road_f,road_min,road_fg,road_fga,road_3p,road_3pa,road_ft,road_fta,road_or,road_dr,road_tot,road_a,road_pf,road_st,road_to,road_to_to,road_bl,road_pts,road_poss,road_pace,road_oeff,road_deff,road_team_rest_days,road_moneyline,road_starting_lineup,main_ref,crew,day_of_season,home_team_game_num,road_team_game_num
0,NBA 2021-2022 Regular Season,22100001,2021-10-19,Milwaukee,37,29,31,30,,,,,,127,240,48,105,17,45,14,18,13,41,54,25,19,8,7,8,9,127,102.843098,102.843098,123.489085,101.12492,3+,-1.5,240.5,-2.0,234.0,-126,2022,-2022 Regular Season,"Khris Middleton,Giannis Antetokounmpo,Brook Lo...",Brooklyn,25,34,26,19,,,,,,104,240,37,84,17,32,13,23,5,39,44,19,17,3,12,13,9,104,102.843098,102.843098,101.12492,123.489085,3+,105,"Kevin Durant,Blake Griffin,Nic Claxton,Joe Har...",Josh Tiven,"Jacyn Goble,Natalie Sago",1,1,1
1,NBA 2021-2022 Regular Season,22100002,2021-10-19,LA Lakers,34,25,26,29,,,,,,114,240,45,95,15,42,9,19,5,40,45,21,25,7,17,18,4,114,113.282595,113.282595,100.633288,106.812525,3+,-5.5,230.5,-3.0,226.5,-154,2022,-2022 Regular Season,"LeBron James,Anthony Davis,DeAndre Jordan,Kent...",Golden State,32,21,30,38,,,,,,121,240,41,93,14,39,25,30,9,41,50,30,18,9,17,17,2,121,113.282595,113.282595,106.812525,100.633288,3+,130,"Andrew Wiggins,Draymond Green,Kevon Looney,Jor...",Sean Wright,"Mark Lindsay,Ray Acosta",1,1,1
12,NBA 2021-2022 Regular Season,22100013,2021-10-20,Portland,23,26,36,36,,,,,,121,240,45,93,12,35,19,22,9,40,49,25,22,5,12,13,5,121,105.079957,105.079957,115.150408,118.005377,3+,-5.5,231.5,-6.5,234.0,-255,2022,-2022 Regular Season,"Norman Powell,Robert Covington,Jusuf Nurkic,CJ...",Sacramento,24,38,38,24,,,,,,124,240,42,92,17,41,23,29,7,36,43,24,22,6,10,10,4,124,105.079957,105.079957,118.005377,115.150408,3+,208,"Harrison Barnes,Maurice Harkless,Richaun Holme...",Sean Wright,"Nick Buchert,Phenizee Ransom",2,1,1
11,NBA 2021-2022 Regular Season,22100012,2021-10-20,Phoenix,20,38,24,16,,,,,,98,240,36,87,14,37,12,17,11,34,45,23,18,9,18,18,3,98,99.68525,99.68525,98.309429,110.347318,3+,-6.5,224.5,-6.0,224.0,-240,2022,-2022 Regular Season,"Mikal Bridges,Jae Crowder,Deandre Ayton,Devin ...",Denver,26,25,34,25,,,,,,110,240,44,83,17,39,5,9,6,40,46,25,20,9,17,19,1,110,99.68525,99.68525,110.347318,98.309429,3+,190,"Michael Porter Jr.,Aaron Gordon,Nikola Jokic,W...",Leon Wood,"Kevin Cutler,Marc Davis",2,1,1
10,NBA 2021-2022 Regular Season,22100011,2021-10-20,Utah,27,27,29,24,,,,,,107,240,40,91,14,47,13,15,12,41,53,18,19,6,10,10,5,107,94.965313,94.965313,112.672718,90.559381,3+,-11.5,222.0,-13.5,221.5,-1428,2022,-2022 Regular Season,"Bojan Bogdanovic,Royce O'Neale,Rudy Gobert,Don...",Oklahoma City,18,24,21,23,,,,,,86,240,34,91,7,35,11,18,15,35,50,19,15,4,14,15,2,86,94.965313,94.965313,90.559381,112.672718,3+,790,"Luguentz Dort,Darius Bazley,Derrick Favors,Jos...",Zach Zarba,"Mark Lindsay,Ray Acosta",2,1,1


In [11]:
def add_four_factors(df):
    """
    Add the Four Factors columns to an NBA game statistics DataFrame.

    This function calculates the Four Factors (Effective Field Goal Percentage,
    Turnover Rate, Offensive Rebound Rate, Free Throw Rate) for both home and road teams
    and adds these as new columns to the provided DataFrame.

    Parameters:
    df (pd.DataFrame): A DataFrame containing NBA game statistics.

    Returns:
    pd.DataFrame: The original DataFrame with added columns for the Four Factors
                  for both home and road teams.
    """

    # Define the column mappings for home and road teams
    stats_mapping = {
        "home": {
            "fgm": "home_fg",
            "fga": "home_fga",
            "3pm": "home_3p",
            "ftm": "home_ft",
            "fta": "home_fta",
            "orb": "home_or",
            "drb": "home_dr",
            "tov": "home_to_to",
        },
        "road": {
            "fgm": "road_fg",
            "fga": "road_fga",
            "3pm": "road_3p",
            "ftm": "road_ft",
            "fta": "road_fta",
            "orb": "road_or",
            "drb": "road_dr",
            "tov": "road_to_to",
        },
    }

    # Function to calculate the Four Factors for a given team type
    def calculate_four_factors(df, team_type):
        factors = {}
        stats = stats_mapping[team_type]

        # eFG%
        factors["eFG%"] = (df[stats["fgm"]] + 0.5 * df[stats["3pm"]]) / df[stats["fga"]]

        # TOV%
        factors["TOV%"] = df[stats["tov"]] / (
            df[stats["fga"]] + 0.44 * df[stats["fta"]] + df[stats["tov"]]
        )

        # ORB%
        factors["ORB%"] = df[stats["orb"]] / (
            df[stats["orb"]] + df[stats["drb"]].shift(-1)
        )

        # FT%
        factors["FT%"] = df[stats["ftm"]] / df[stats["fga"]]

        return pd.DataFrame(factors)

    # Calculate Four Factors for both home and road teams
    home_factors = calculate_four_factors(df, "home")
    road_factors = calculate_four_factors(df, "road")

    # Combine the results and add them to the original DataFrame
    four_factors = pd.concat(
        [home_factors.add_prefix("home_"), road_factors.add_prefix("road_")], axis=1
    )
    return pd.concat([df, four_factors], axis=1)

In [12]:
df_4 = add_four_factors(df_3)

In [13]:
df_4.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,road_ot2,road_ot3,road_ot4,road_ot5,road_f,road_min,road_fg,road_fga,road_3p,road_3pa,road_ft,road_fta,road_or,road_dr,road_tot,road_a,road_pf,road_st,road_to,road_to_to,road_bl,road_pts,road_poss,road_pace,road_oeff,road_deff,road_team_rest_days,road_moneyline,road_starting_lineup,main_ref,crew,day_of_season,home_team_game_num,road_team_game_num,home_eFG%,home_TOV%,home_ORB%,home_FT%,road_eFG%,road_TOV%,road_ORB%,road_FT%
0,NBA 2021-2022 Regular Season,22100001,2021-10-19,Milwaukee,37,29,31,30,,,,,,127,240,48,105,17,45,14,18,13,41,54,25,19,8,7,8,9,127,102.843098,102.843098,123.489085,101.12492,3+,-1.5,240.5,-2.0,234.0,-126,2022,-2022 Regular Season,"Khris Middleton,Giannis Antetokounmpo,Brook Lo...",Brooklyn,25,34,26,19,,,,,,104,240,37,84,17,32,13,23,5,39,44,19,17,3,12,13,9,104,102.843098,102.843098,101.12492,123.489085,3+,105,"Kevin Durant,Blake Griffin,Nic Claxton,Joe Har...",Josh Tiven,"Jacyn Goble,Natalie Sago",1,1,1,0.538095,0.066159,0.245283,0.133333,0.541667,0.121359,0.108696,0.154762
1,NBA 2021-2022 Regular Season,22100002,2021-10-19,LA Lakers,34,25,26,29,,,,,,114,240,45,95,15,42,9,19,5,40,45,21,25,7,17,18,4,114,113.282595,113.282595,100.633288,106.812525,3+,-5.5,230.5,-3.0,226.5,-154,2022,-2022 Regular Season,"LeBron James,Anthony Davis,DeAndre Jordan,Kent...",Golden State,32,21,30,38,,,,,,121,240,41,93,14,39,25,30,9,41,50,30,18,9,17,17,2,121,113.282595,113.282595,106.812525,100.633288,3+,130,"Andrew Wiggins,Draymond Green,Kevon Looney,Jor...",Sean Wright,"Mark Lindsay,Ray Acosta",1,1,1,0.552632,0.148319,0.111111,0.094737,0.516129,0.137987,0.2,0.268817
12,NBA 2021-2022 Regular Season,22100013,2021-10-20,Portland,23,26,36,36,,,,,,121,240,45,93,12,35,19,22,9,40,49,25,22,5,12,13,5,121,105.079957,105.079957,115.150408,118.005377,3+,-5.5,231.5,-6.5,234.0,-255,2022,-2022 Regular Season,"Norman Powell,Robert Covington,Jusuf Nurkic,CJ...",Sacramento,24,38,38,24,,,,,,124,240,42,92,17,41,23,29,7,36,43,24,22,6,10,10,4,124,105.079957,105.079957,118.005377,115.150408,3+,208,"Harrison Barnes,Maurice Harkless,Richaun Holme...",Sean Wright,"Nick Buchert,Phenizee Ransom",2,1,1,0.548387,0.112379,0.209302,0.204301,0.548913,0.087138,0.148936,0.25
11,NBA 2021-2022 Regular Season,22100012,2021-10-20,Phoenix,20,38,24,16,,,,,,98,240,36,87,14,37,12,17,11,34,45,23,18,9,18,18,3,98,99.68525,99.68525,98.309429,110.347318,3+,-6.5,224.5,-6.0,224.0,-240,2022,-2022 Regular Season,"Mikal Bridges,Jae Crowder,Deandre Ayton,Devin ...",Denver,26,25,34,25,,,,,,110,240,44,83,17,39,5,9,6,40,46,25,20,9,17,19,1,110,99.68525,99.68525,110.347318,98.309429,3+,190,"Michael Porter Jr.,Aaron Gordon,Nikola Jokic,W...",Leon Wood,"Kevin Cutler,Marc Davis",2,1,1,0.494253,0.160028,0.211538,0.137931,0.63253,0.179313,0.146341,0.060241
10,NBA 2021-2022 Regular Season,22100011,2021-10-20,Utah,27,27,29,24,,,,,,107,240,40,91,14,47,13,15,12,41,53,18,19,6,10,10,5,107,94.965313,94.965313,112.672718,90.559381,3+,-11.5,222.0,-13.5,221.5,-1428,2022,-2022 Regular Season,"Bojan Bogdanovic,Royce O'Neale,Rudy Gobert,Don...",Oklahoma City,18,24,21,23,,,,,,86,240,34,91,7,35,11,18,15,35,50,19,15,4,14,15,2,86,94.965313,94.965313,90.559381,112.672718,3+,790,"Luguentz Dort,Darius Bazley,Derrick Favors,Jos...",Zach Zarba,"Mark Lindsay,Ray Acosta",2,1,1,0.516484,0.092937,0.26087,0.142857,0.412088,0.131671,0.267857,0.120879


In [14]:
def add_game_outcome(df):
    """
    Add columns indicating the winning and losing teams for each game in an NBA dataset.

    This function uses the home and road team points to determine the winner and loser of each game.
    It adds two new columns, 'winner' and 'loser', to the dataset.

    Parameters:
    df (pd.DataFrame): A DataFrame containing NBA game statistics, including home and road team points.

    Returns:
    pd.DataFrame: The original DataFrame with two new columns 'winner' and 'loser'.
    """

    # Determine the winning and losing teams based on points
    df["winner"] = df.apply(
        lambda x: x["home_team"] if x["home_pts"] > x["road_pts"] else x["road_team"],
        axis=1,
    )
    df["loser"] = df.apply(
        lambda x: x["road_team"] if x["home_pts"] > x["road_pts"] else x["home_team"],
        axis=1,
    )

    return df

In [15]:
df_5 = add_game_outcome(df_4)

In [16]:
df_5[["home_team", "road_team", "home_pts", "road_pts", "winner", "loser"]].head()

Unnamed: 0,home_team,road_team,home_pts,road_pts,winner,loser
0,Milwaukee,Brooklyn,127,104,Milwaukee,Brooklyn
1,LA Lakers,Golden State,114,121,Golden State,LA Lakers
12,Portland,Sacramento,121,124,Sacramento,Portland
11,Phoenix,Denver,98,110,Denver,Phoenix
10,Utah,Oklahoma City,107,86,Utah,Oklahoma City


In [17]:
def add_win_loss_info(df):
    """
    Add cumulative and last two weeks' statistics for wins, losses,
    and winning percentage for both home and road teams in an NBA dataset.
    """
    # Initialization
    wins, losses = {}, {}

    # Define the columns to be added
    columns = [
        "home_wins",
        "home_losses",
        "home_win_pct",
        "road_wins",
        "road_losses",
        "road_win_pct",
        "home_wins_l2w",
        "home_losses_l2w",
        "home_win_pct_l2w",
        "road_wins_l2w",
        "road_losses_l2w",
        "road_win_pct_l2w",
    ]
    for col in columns:
        df[col] = 0

    # Iterate through the DataFrame
    for index, row in df.iterrows():
        date = pd.to_datetime(row["date"])
        home_team, road_team = row["home_team"], row["road_team"]
        home_win = row["winner"] == home_team

        # Update cumulative stats
        df.at[index, "home_wins"] = wins.get(home_team, 0)
        df.at[index, "home_losses"] = losses.get(home_team, 0)
        df.at[index, "road_wins"] = wins.get(road_team, 0)
        df.at[index, "road_losses"] = losses.get(road_team, 0)

        # Update cumulative winning percentages
        if wins.get(home_team, 0) + losses.get(home_team, 0) > 0:
            df.at[index, "home_win_pct"] = wins[home_team] / (
                wins[home_team] + losses[home_team]
            )
        if wins.get(road_team, 0) + losses.get(road_team, 0) > 0:
            df.at[index, "road_win_pct"] = wins[road_team] / (
                wins[road_team] + losses[road_team]
            )

        # Filter for last two weeks' games
        l2w_start_date = date - timedelta(days=14)
        l2w_games = df[(df["date"] >= l2w_start_date) & (df["date"] < date)]

        # Calculate last two weeks' wins and losses
        l2w_home_wins = len(
            l2w_games[
                (l2w_games["home_team"] == home_team)
                & (l2w_games["winner"] == home_team)
            ]
        )
        l2w_home_losses = len(
            l2w_games[
                (l2w_games["home_team"] == home_team)
                & (l2w_games["winner"] != home_team)
            ]
        )
        l2w_road_wins = len(
            l2w_games[
                (l2w_games["road_team"] == road_team)
                & (l2w_games["winner"] == road_team)
            ]
        )
        l2w_road_losses = len(
            l2w_games[
                (l2w_games["road_team"] == road_team)
                & (l2w_games["winner"] != road_team)
            ]
        )

        # Update last two weeks' stats in the DataFrame
        df.at[index, "home_wins_l2w"] = l2w_home_wins
        df.at[index, "home_losses_l2w"] = l2w_home_losses
        df.at[index, "road_wins_l2w"] = l2w_road_wins
        df.at[index, "road_losses_l2w"] = l2w_road_losses
        df.at[index, "home_win_pct_l2w"] = (
            l2w_home_wins / (l2w_home_wins + l2w_home_losses)
            if l2w_home_wins + l2w_home_losses > 0
            else 0
        )
        df.at[index, "road_win_pct_l2w"] = (
            l2w_road_wins / (l2w_road_wins + l2w_road_losses)
            if l2w_road_wins + l2w_road_losses > 0
            else 0
        )

        # Update cumulative wins and losses for next iteration
        if home_team not in wins:
            wins[home_team] = 0
        if home_team not in losses:
            losses[home_team] = 0
        if road_team not in wins:
            wins[road_team] = 0
        if road_team not in losses:
            losses[road_team] = 0

        wins[home_team] += int(home_win)
        losses[home_team] += int(not home_win)
        wins[road_team] += int(not home_win)
        losses[road_team] += int(home_win)

    return df

In [18]:
df_6 = add_win_loss_info(df_5)

In [19]:
df_6.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,...,road_fg,road_fga,road_3p,road_3pa,road_ft,road_fta,road_or,road_dr,road_tot,road_a,road_pf,road_st,road_to,road_to_to,road_bl,road_pts,road_poss,road_pace,road_oeff,road_deff,road_team_rest_days,road_moneyline,road_starting_lineup,main_ref,crew,day_of_season,home_team_game_num,road_team_game_num,home_eFG%,home_TOV%,home_ORB%,home_FT%,road_eFG%,road_TOV%,road_ORB%,road_FT%,winner,loser,home_wins,home_losses,home_win_pct,road_wins,road_losses,road_win_pct,home_wins_l2w,home_losses_l2w,home_win_pct_l2w,road_wins_l2w,road_losses_l2w,road_win_pct_l2w
0,NBA 2021-2022 Regular Season,22100001,2021-10-19,Milwaukee,37,29,31,30,,,,,,127,240,48,105,17,45,14,18,13,41,54,25,19,8,7,8,9,127,102.843098,102.843098,123.489085,101.12492,3+,-1.5,240.5,-2.0,234.0,-126,2022,-2022 Regular Season,"Khris Middleton,Giannis Antetokounmpo,Brook Lo...",Brooklyn,25,34,26,19,,...,37,84,17,32,13,23,5,39,44,19,17,3,12,13,9,104,102.843098,102.843098,101.12492,123.489085,3+,105,"Kevin Durant,Blake Griffin,Nic Claxton,Joe Har...",Josh Tiven,"Jacyn Goble,Natalie Sago",1,1,1,0.538095,0.066159,0.245283,0.133333,0.541667,0.121359,0.108696,0.154762,Milwaukee,Brooklyn,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
1,NBA 2021-2022 Regular Season,22100002,2021-10-19,LA Lakers,34,25,26,29,,,,,,114,240,45,95,15,42,9,19,5,40,45,21,25,7,17,18,4,114,113.282595,113.282595,100.633288,106.812525,3+,-5.5,230.5,-3.0,226.5,-154,2022,-2022 Regular Season,"LeBron James,Anthony Davis,DeAndre Jordan,Kent...",Golden State,32,21,30,38,,...,41,93,14,39,25,30,9,41,50,30,18,9,17,17,2,121,113.282595,113.282595,106.812525,100.633288,3+,130,"Andrew Wiggins,Draymond Green,Kevon Looney,Jor...",Sean Wright,"Mark Lindsay,Ray Acosta",1,1,1,0.552632,0.148319,0.111111,0.094737,0.516129,0.137987,0.2,0.268817,Golden State,LA Lakers,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
12,NBA 2021-2022 Regular Season,22100013,2021-10-20,Portland,23,26,36,36,,,,,,121,240,45,93,12,35,19,22,9,40,49,25,22,5,12,13,5,121,105.079957,105.079957,115.150408,118.005377,3+,-5.5,231.5,-6.5,234.0,-255,2022,-2022 Regular Season,"Norman Powell,Robert Covington,Jusuf Nurkic,CJ...",Sacramento,24,38,38,24,,...,42,92,17,41,23,29,7,36,43,24,22,6,10,10,4,124,105.079957,105.079957,118.005377,115.150408,3+,208,"Harrison Barnes,Maurice Harkless,Richaun Holme...",Sean Wright,"Nick Buchert,Phenizee Ransom",2,1,1,0.548387,0.112379,0.209302,0.204301,0.548913,0.087138,0.148936,0.25,Sacramento,Portland,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
11,NBA 2021-2022 Regular Season,22100012,2021-10-20,Phoenix,20,38,24,16,,,,,,98,240,36,87,14,37,12,17,11,34,45,23,18,9,18,18,3,98,99.68525,99.68525,98.309429,110.347318,3+,-6.5,224.5,-6.0,224.0,-240,2022,-2022 Regular Season,"Mikal Bridges,Jae Crowder,Deandre Ayton,Devin ...",Denver,26,25,34,25,,...,44,83,17,39,5,9,6,40,46,25,20,9,17,19,1,110,99.68525,99.68525,110.347318,98.309429,3+,190,"Michael Porter Jr.,Aaron Gordon,Nikola Jokic,W...",Leon Wood,"Kevin Cutler,Marc Davis",2,1,1,0.494253,0.160028,0.211538,0.137931,0.63253,0.179313,0.146341,0.060241,Denver,Phoenix,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
10,NBA 2021-2022 Regular Season,22100011,2021-10-20,Utah,27,27,29,24,,,,,,107,240,40,91,14,47,13,15,12,41,53,18,19,6,10,10,5,107,94.965313,94.965313,112.672718,90.559381,3+,-11.5,222.0,-13.5,221.5,-1428,2022,-2022 Regular Season,"Bojan Bogdanovic,Royce O'Neale,Rudy Gobert,Don...",Oklahoma City,18,24,21,23,,...,34,91,7,35,11,18,15,35,50,19,15,4,14,15,2,86,94.965313,94.965313,90.559381,112.672718,3+,790,"Luguentz Dort,Darius Bazley,Derrick Favors,Jos...",Zach Zarba,"Mark Lindsay,Ray Acosta",2,1,1,0.516484,0.092937,0.26087,0.142857,0.412088,0.131671,0.267857,0.120879,Utah,Oklahoma City,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0


In [20]:
def calculate_stat_average(df, current_date, team, stat, last_2_weeks=False):
    """
    Calculate the average of a given statistic for a team up to a specified date.
    If last_2_weeks is True, the calculation is restricted to the last two weeks.

    :param df: DataFrame containing the NBA data.
    :param current_date: The date of the current game.
    :param team: The team for which the statistic is calculated.
    :param stat: The statistic abbreviation (e.g., 'pts' for points).
    :param last_2_weeks: Boolean, if True, calculate the average for the last two weeks.
    :return: Average of the specified statistic.
    """
    # Filter for games involving the team before the current date
    relevant_games = df[
        (df["date"] < current_date)
        & ((df["home_team"] == team) | (df["road_team"] == team))
    ]

    # Consider only the last 2 weeks if required
    if last_2_weeks:
        two_weeks_ago = current_date - timedelta(days=14)
        relevant_games = relevant_games[relevant_games["date"] >= two_weeks_ago]

    # Calculate the total of the statistic
    total_stat = relevant_games.apply(
        lambda row: row[f"home_{stat}"]
        if row["home_team"] == team
        else row[f"road_{stat}"],
        axis=1,
    ).sum()

    # Calculate the average
    num_games = len(relevant_games)
    return total_stat / num_games if num_games != 0 else 0


def add_stats_columns(df, stats):
    """
    Adds columns for multiple statistics. For each stat, it creates columns for home_avg_stat,
    road_avg_stat, home_avg_stat_l2w, and road_avg_stat_l2w.

    :param df: DataFrame containing the NBA data.
    :param stats: List of statistic abbreviations (e.g., ['pts', 'or']).
    :return: DataFrame with added columns for each statistic.
    """

    def apply_stats(row):
        date = row["date"]
        home_team = row["home_team"]
        road_team = row["road_team"]

        for stat in stats:
            # Define new column names
            home_avg_stat = f"home_avg_{stat}"
            road_avg_stat = f"road_avg_{stat}"
            home_avg_l2w = f"home_avg_{stat}_l2w"
            road_avg_l2w = f"road_avg_{stat}_l2w"

            # Calculate and assign the averages
            row[home_avg_stat] = calculate_stat_average(df, date, home_team, stat)
            row[road_avg_stat] = calculate_stat_average(df, date, road_team, stat)
            row[home_avg_l2w] = calculate_stat_average(
                df, date, home_team, stat, last_2_weeks=True
            )
            row[road_avg_l2w] = calculate_stat_average(
                df, date, road_team, stat, last_2_weeks=True
            )

        return row

    # Apply the function to each row of the DataFrame
    return df.apply(apply_stats, axis=1)

In [21]:
stat_list = [
    "1q",
    "2q",
    "3q",
    "4q",
    "ot1",
    "ot2",
    "ot3",
    "ot4",
    "ot5",
    "f",
    "min",
    "fg",
    "fga",
    "3p",
    "3pa",
    "ft",
    "fta",
    "or",
    "dr",
    "tot",
    "a",
    "pf",
    "st",
    "to",
    "to_to",
    "bl",
    "pts",
    "poss",
    "pace",
    "oeff",
    "deff",
    "eFG%",
    "TOV%",
    "ORB%",
    "FT%",
]

df_7 = add_stats_columns(df_6, stat_list)

In [22]:
df_7.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,...,home_avg_st_l2w,road_avg_st_l2w,home_avg_to,road_avg_to,home_avg_to_l2w,road_avg_to_l2w,home_avg_to_to,road_avg_to_to,home_avg_to_to_l2w,road_avg_to_to_l2w,home_avg_bl,road_avg_bl,home_avg_bl_l2w,road_avg_bl_l2w,home_avg_pts,road_avg_pts,home_avg_pts_l2w,road_avg_pts_l2w,home_avg_poss,road_avg_poss,home_avg_poss_l2w,road_avg_poss_l2w,home_avg_pace,road_avg_pace,home_avg_pace_l2w,road_avg_pace_l2w,home_avg_oeff,road_avg_oeff,home_avg_oeff_l2w,road_avg_oeff_l2w,home_avg_deff,road_avg_deff,home_avg_deff_l2w,road_avg_deff_l2w,home_avg_eFG%,road_avg_eFG%,home_avg_eFG%_l2w,road_avg_eFG%_l2w,home_avg_TOV%,road_avg_TOV%,home_avg_TOV%_l2w,road_avg_TOV%_l2w,home_avg_ORB%,road_avg_ORB%,home_avg_ORB%_l2w,road_avg_ORB%_l2w,home_avg_FT%,road_avg_FT%,home_avg_FT%_l2w,road_avg_FT%_l2w
0,NBA 2021-2022 Regular Season,22100001,2021-10-19,Milwaukee,37,29,31,30,,,,,,127,240,48,105,17,45,14,18,13,41,54,25,19,8,7,8,9,127,102.843098,102.843098,123.489085,101.12492,3+,-1.5,240.5,-2.0,234.0,-126,2022,-2022 Regular Season,"Khris Middleton,Giannis Antetokounmpo,Brook Lo...",Brooklyn,25,34,26,19,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,NBA 2021-2022 Regular Season,22100002,2021-10-19,LA Lakers,34,25,26,29,,,,,,114,240,45,95,15,42,9,19,5,40,45,21,25,7,17,18,4,114,113.282595,113.282595,100.633288,106.812525,3+,-5.5,230.5,-3.0,226.5,-154,2022,-2022 Regular Season,"LeBron James,Anthony Davis,DeAndre Jordan,Kent...",Golden State,32,21,30,38,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,NBA 2021-2022 Regular Season,22100013,2021-10-20,Portland,23,26,36,36,,,,,,121,240,45,93,12,35,19,22,9,40,49,25,22,5,12,13,5,121,105.079957,105.079957,115.150408,118.005377,3+,-5.5,231.5,-6.5,234.0,-255,2022,-2022 Regular Season,"Norman Powell,Robert Covington,Jusuf Nurkic,CJ...",Sacramento,24,38,38,24,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,NBA 2021-2022 Regular Season,22100012,2021-10-20,Phoenix,20,38,24,16,,,,,,98,240,36,87,14,37,12,17,11,34,45,23,18,9,18,18,3,98,99.68525,99.68525,98.309429,110.347318,3+,-6.5,224.5,-6.0,224.0,-240,2022,-2022 Regular Season,"Mikal Bridges,Jae Crowder,Deandre Ayton,Devin ...",Denver,26,25,34,25,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,NBA 2021-2022 Regular Season,22100011,2021-10-20,Utah,27,27,29,24,,,,,,107,240,40,91,14,47,13,15,12,41,53,18,19,6,10,10,5,107,94.965313,94.965313,112.672718,90.559381,3+,-11.5,222.0,-13.5,221.5,-1428,2022,-2022 Regular Season,"Bojan Bogdanovic,Royce O'Neale,Rudy Gobert,Don...",Oklahoma City,18,24,21,23,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
def calculate_pts_allowed_average(df, current_date, team, last_2_weeks=False):
    """
    Calculate the average points allowed by a team up to a specified date.
    Points allowed are the points scored by the opposing team.

    :param df: DataFrame containing the NBA data.
    :param current_date: The date of the current game.
    :param team: The team for which the points allowed is calculated.
    :param last_2_weeks: Boolean, if True, calculate the average for the last two weeks.
    :return: Average points allowed.
    """
    # Filter for games involving the team before the current date
    relevant_games = df[
        (df["date"] < current_date)
        & ((df["home_team"] == team) | (df["road_team"] == team))
    ]

    # Consider only the last 2 weeks if required
    if last_2_weeks:
        two_weeks_ago = current_date - timedelta(days=14)
        relevant_games = relevant_games[relevant_games["date"] >= two_weeks_ago]

    # Calculate the total points allowed by the team
    total_points_allowed = relevant_games.apply(
        lambda row: row["road_pts"] if row["home_team"] == team else row["home_pts"],
        axis=1,
    ).sum()

    # Calculate the average points allowed
    num_games = len(relevant_games)
    return total_points_allowed / num_games if num_games != 0 else 0


def add_pts_allowed_columns(df):
    """
    Adds columns for average points allowed for home and road teams, both overall and for the last two weeks.

    :param df: DataFrame containing the NBA data.
    :return: DataFrame with added columns for points allowed.
    """

    def apply_pts_allowed(row):
        date = row["date"]
        home_team = row["home_team"]
        road_team = row["road_team"]

        # Define new column names
        home_avg_pts_allowed = "home_avg_pts_allowed"
        road_avg_pts_allowed = "road_avg_pts_allowed"
        home_avg_pts_allowed_l2w = "home_avg_pts_allowed_l2w"
        road_avg_pts_allowed_l2w = "road_avg_pts_allowed_l2w"

        # Calculate and assign the averages for points allowed
        row[home_avg_pts_allowed] = calculate_pts_allowed_average(df, date, home_team)
        row[road_avg_pts_allowed] = calculate_pts_allowed_average(df, date, road_team)
        row[home_avg_pts_allowed_l2w] = calculate_pts_allowed_average(
            df, date, home_team, last_2_weeks=True
        )
        row[road_avg_pts_allowed_l2w] = calculate_pts_allowed_average(
            df, date, road_team, last_2_weeks=True
        )

        return row

    # Apply the function to each row of the DataFrame
    return df.apply(apply_pts_allowed, axis=1)

In [24]:
df_8 = add_pts_allowed_columns(df_7)

In [25]:
df_8.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,...,home_avg_to_l2w,road_avg_to_l2w,home_avg_to_to,road_avg_to_to,home_avg_to_to_l2w,road_avg_to_to_l2w,home_avg_bl,road_avg_bl,home_avg_bl_l2w,road_avg_bl_l2w,home_avg_pts,road_avg_pts,home_avg_pts_l2w,road_avg_pts_l2w,home_avg_poss,road_avg_poss,home_avg_poss_l2w,road_avg_poss_l2w,home_avg_pace,road_avg_pace,home_avg_pace_l2w,road_avg_pace_l2w,home_avg_oeff,road_avg_oeff,home_avg_oeff_l2w,road_avg_oeff_l2w,home_avg_deff,road_avg_deff,home_avg_deff_l2w,road_avg_deff_l2w,home_avg_eFG%,road_avg_eFG%,home_avg_eFG%_l2w,road_avg_eFG%_l2w,home_avg_TOV%,road_avg_TOV%,home_avg_TOV%_l2w,road_avg_TOV%_l2w,home_avg_ORB%,road_avg_ORB%,home_avg_ORB%_l2w,road_avg_ORB%_l2w,home_avg_FT%,road_avg_FT%,home_avg_FT%_l2w,road_avg_FT%_l2w,home_avg_pts_allowed,road_avg_pts_allowed,home_avg_pts_allowed_l2w,road_avg_pts_allowed_l2w
0,NBA 2021-2022 Regular Season,22100001,2021-10-19,Milwaukee,37,29,31,30,,,,,,127,240,48,105,17,45,14,18,13,41,54,25,19,8,7,8,9,127,102.843098,102.843098,123.489085,101.12492,3+,-1.5,240.5,-2.0,234.0,-126,2022,-2022 Regular Season,"Khris Middleton,Giannis Antetokounmpo,Brook Lo...",Brooklyn,25,34,26,19,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,NBA 2021-2022 Regular Season,22100002,2021-10-19,LA Lakers,34,25,26,29,,,,,,114,240,45,95,15,42,9,19,5,40,45,21,25,7,17,18,4,114,113.282595,113.282595,100.633288,106.812525,3+,-5.5,230.5,-3.0,226.5,-154,2022,-2022 Regular Season,"LeBron James,Anthony Davis,DeAndre Jordan,Kent...",Golden State,32,21,30,38,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,NBA 2021-2022 Regular Season,22100013,2021-10-20,Portland,23,26,36,36,,,,,,121,240,45,93,12,35,19,22,9,40,49,25,22,5,12,13,5,121,105.079957,105.079957,115.150408,118.005377,3+,-5.5,231.5,-6.5,234.0,-255,2022,-2022 Regular Season,"Norman Powell,Robert Covington,Jusuf Nurkic,CJ...",Sacramento,24,38,38,24,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,NBA 2021-2022 Regular Season,22100012,2021-10-20,Phoenix,20,38,24,16,,,,,,98,240,36,87,14,37,12,17,11,34,45,23,18,9,18,18,3,98,99.68525,99.68525,98.309429,110.347318,3+,-6.5,224.5,-6.0,224.0,-240,2022,-2022 Regular Season,"Mikal Bridges,Jae Crowder,Deandre Ayton,Devin ...",Denver,26,25,34,25,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,NBA 2021-2022 Regular Season,22100011,2021-10-20,Utah,27,27,29,24,,,,,,107,240,40,91,14,47,13,15,12,41,53,18,19,6,10,10,5,107,94.965313,94.965313,112.672718,90.559381,3+,-11.5,222.0,-13.5,221.5,-1428,2022,-2022 Regular Season,"Bojan Bogdanovic,Royce O'Neale,Rudy Gobert,Don...",Oklahoma City,18,24,21,23,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
def add_targets(df):
    """
    Add various target columns to an NBA dataset for betting and game outcome analysis.

    This function calculates several targets based on game statistics and betting lines.

    Parameters:
    df (pd.DataFrame): DataFrame containing NBA game statistics, including points, spreads, and totals.

    Returns:
    pd.DataFrame: The original DataFrame with added target columns for analysis.

    The function adds the following columns:
    - REG_TARGET: Point differential of the game (home points minus road points).
    - CLS_TARGET: Boolean indicating if the home team beat the opening spread.
    - CLS_TARGET_closing_spread: Boolean indicating if the home team beat the closing spread.
    - REG_TARGET_OU: Total points scored in the game (home points plus road points).
    - CLS_TARGET_OU_OPEN: Boolean indicating if the total points exceeded the opening total.
    - CLS_TARGET_OU_CLOSE: Boolean indicating if the total points exceeded the closing total.
    """

    # REG_TARGET: Point differential (home points - road points)
    # It represents the margin of victory or defeat for the home team.
    df["REG_TARGET"] = df["home_pts"] - df["road_pts"]

    # CLS_TARGET: Boolean indicating if the home team covered the opening spread.
    # True if home team's win margin is greater than the negative of the opening spread.
    # It's used to determine if the home team performed better than the pre-game expectations.
    df["CLS_TARGET"] = df["REG_TARGET"] > -df["home_opening_spread"]

    # CLS_TARGET_closing_spread: Similar to CLS_TARGET but using the closing spread.
    # It reflects the home team's performance against the final betting line before the game.
    df["CLS_TARGET_closing_spread"] = df["REG_TARGET"] > -df["home_closing_spread"]

    # REG_TARGET_OU: Sum of home and road points, indicating total points scored in the game.
    df["REG_TARGET_OU"] = df["home_pts"] + df["road_pts"]

    # CLS_TARGET_OU_OPEN: Boolean indicating if total points scored exceeded the opening total line.
    # It shows whether the game was higher-scoring than initially expected by bookmakers.
    df["CLS_TARGET_OU_OPEN"] = df["REG_TARGET_OU"] > df["opening_total"]

    # CLS_TARGET_OU_CLOSE: Similar to CLS_TARGET_OU_OPEN but with the closing total line.
    # It reflects whether the game's total score surpassed the final total points line set before the game.
    df["CLS_TARGET_OU_CLOSE"] = df["REG_TARGET_OU"] > df["closing_total"]

    return df

In [27]:
df_9 = add_targets(df_8)

  df["REG_TARGET"] = df["home_pts"] - df["road_pts"]
  df["CLS_TARGET"] = df["REG_TARGET"] > -df["home_opening_spread"]
  df["CLS_TARGET_closing_spread"] = df["REG_TARGET"] > -df["home_closing_spread"]
  df["REG_TARGET_OU"] = df["home_pts"] + df["road_pts"]
  df["CLS_TARGET_OU_OPEN"] = df["REG_TARGET_OU"] > df["opening_total"]
  df["CLS_TARGET_OU_CLOSE"] = df["REG_TARGET_OU"] > df["closing_total"]


In [28]:
df_9.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,season,season_type,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,...,home_avg_bl,road_avg_bl,home_avg_bl_l2w,road_avg_bl_l2w,home_avg_pts,road_avg_pts,home_avg_pts_l2w,road_avg_pts_l2w,home_avg_poss,road_avg_poss,home_avg_poss_l2w,road_avg_poss_l2w,home_avg_pace,road_avg_pace,home_avg_pace_l2w,road_avg_pace_l2w,home_avg_oeff,road_avg_oeff,home_avg_oeff_l2w,road_avg_oeff_l2w,home_avg_deff,road_avg_deff,home_avg_deff_l2w,road_avg_deff_l2w,home_avg_eFG%,road_avg_eFG%,home_avg_eFG%_l2w,road_avg_eFG%_l2w,home_avg_TOV%,road_avg_TOV%,home_avg_TOV%_l2w,road_avg_TOV%_l2w,home_avg_ORB%,road_avg_ORB%,home_avg_ORB%_l2w,road_avg_ORB%_l2w,home_avg_FT%,road_avg_FT%,home_avg_FT%_l2w,road_avg_FT%_l2w,home_avg_pts_allowed,road_avg_pts_allowed,home_avg_pts_allowed_l2w,road_avg_pts_allowed_l2w,REG_TARGET,CLS_TARGET,CLS_TARGET_closing_spread,REG_TARGET_OU,CLS_TARGET_OU_OPEN,CLS_TARGET_OU_CLOSE
0,NBA 2021-2022 Regular Season,22100001,2021-10-19,Milwaukee,37,29,31,30,,,,,,127,240,48,105,17,45,14,18,13,41,54,25,19,8,7,8,9,127,102.843098,102.843098,123.489085,101.12492,3+,-1.5,240.5,-2.0,234.0,-126,2022,-2022 Regular Season,"Khris Middleton,Giannis Antetokounmpo,Brook Lo...",Brooklyn,25,34,26,19,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23,True,True,231,False,False
1,NBA 2021-2022 Regular Season,22100002,2021-10-19,LA Lakers,34,25,26,29,,,,,,114,240,45,95,15,42,9,19,5,40,45,21,25,7,17,18,4,114,113.282595,113.282595,100.633288,106.812525,3+,-5.5,230.5,-3.0,226.5,-154,2022,-2022 Regular Season,"LeBron James,Anthony Davis,DeAndre Jordan,Kent...",Golden State,32,21,30,38,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-7,False,False,235,True,True
12,NBA 2021-2022 Regular Season,22100013,2021-10-20,Portland,23,26,36,36,,,,,,121,240,45,93,12,35,19,22,9,40,49,25,22,5,12,13,5,121,105.079957,105.079957,115.150408,118.005377,3+,-5.5,231.5,-6.5,234.0,-255,2022,-2022 Regular Season,"Norman Powell,Robert Covington,Jusuf Nurkic,CJ...",Sacramento,24,38,38,24,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3,False,False,245,True,True
11,NBA 2021-2022 Regular Season,22100012,2021-10-20,Phoenix,20,38,24,16,,,,,,98,240,36,87,14,37,12,17,11,34,45,23,18,9,18,18,3,98,99.68525,99.68525,98.309429,110.347318,3+,-6.5,224.5,-6.0,224.0,-240,2022,-2022 Regular Season,"Mikal Bridges,Jae Crowder,Deandre Ayton,Devin ...",Denver,26,25,34,25,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12,False,False,208,False,False
10,NBA 2021-2022 Regular Season,22100011,2021-10-20,Utah,27,27,29,24,,,,,,107,240,40,91,14,47,13,15,12,41,53,18,19,6,10,10,5,107,94.965313,94.965313,112.672718,90.559381,3+,-11.5,222.0,-13.5,221.5,-1428,2022,-2022 Regular Season,"Bojan Bogdanovic,Royce O'Neale,Rudy Gobert,Don...",Oklahoma City,18,24,21,23,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,True,True,193,False,False


In [29]:
df_9.to_csv("../data/nba_ai/cleaned_data_2021-2022.csv", index=False)