### Production Features Pipeline

In [1]:
# Web-scraping backend: "SCRAPINGANT" or "SELENIUM".
# SCRAPINGANT requires a subscription but includes a proxy server.
# To use Selenium instead, set WEBSCRAPER = "SELENIUM".
WEBSCRAPER = "SCRAPINGANT"

In [2]:
import os

import pandas as pd
import numpy as np

import hopsworks

from datetime import datetime, timedelta
from pytz import timezone

import json

import time

from pathlib import Path  # for Windows/Linux compatibility

# Move to the project root when the notebook is started from the notebooks folder, so that the
# `src` imports below and sibling folders resolve.
# The `src` check keeps this cell idempotent: re-running it, or starting the kernel from the
# project root, must not climb one more directory level.
if not Path("src").is_dir():
    os.chdir("..")


from src.utils.webscraping import (
    get_new_games,
    activate_web_driver,
    get_todays_matchups,
)

from src.data.cleaning import (
    process_games,
    add_TARGET,
)

from src.data.build_features import (
    process_features,
)

from src.utils.hopsworks_utils import (
    save_feature_names,
    convert_feature_names,
)

from src.utils.constants import (
    FEATURE_GROUP_VERSION,
)

# Data folder, relative to the current working directory.
DATAPATH = Path("data")

**Load API keys**

In [3]:
from dotenv import load_dotenv

# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# The Hopsworks key is always required.
try:
    HOPSWORKS_API_KEY = os.environ["HOPSWORKS_API_KEY"]
except KeyError as err:
    # Catch only the missing-key case; a bare except would also swallow KeyboardInterrupt etc.
    raise Exception("Set environment variable HOPSWORKS_API_KEY") from err

# if scrapingant is chosen then set the api key, otherwise load the selenium webdriver
# Both branches define `driver` and `SCRAPINGANT_API_KEY`, which later cells pass to the scrapers.
if WEBSCRAPER == "SCRAPINGANT":
    try:
        SCRAPINGANT_API_KEY = os.environ["SCRAPINGANT_API_KEY"]
    except KeyError as err:
        raise Exception("Set environment variable SCRAPINGANT_API_KEY") from err
    driver = None

elif WEBSCRAPER == "SELENIUM":
    driver = activate_web_driver()
    SCRAPINGANT_API_KEY = ""

else:
    # Fail here with a clear message instead of a NameError on `driver` in a later cell.
    raise ValueError(
        f"Unknown WEBSCRAPER {WEBSCRAPER!r}; expected 'SCRAPINGANT' or 'SELENIUM'"
    )

**Scrape New Completed Games and Format Them**

In [4]:
# Scrape games completed since the last run.
df_new = get_new_games(SCRAPINGANT_API_KEY, driver)

if not df_new.empty:
    # Season of the most recent scraped game; reused when building today's prediction rows.
    # NOTE(review): SEASON is only bound on this branch. The cell that builds today's rows
    # reads SEASON, so a day with scheduled games but no newly completed games would hit a
    # NameError there - confirm whether that case can occur.
    SEASON = df_new["SEASON"].max()

    print(SEASON)
    print(df_new.head())
else:
    print("No new games to process")

Current month is 03
Scraping https://www.nba.com/stats/teams/boxscores?SeasonType=Regular+Season&DateFrom=03/15/25&DateTo=03/22/25
USING SCARAPER API


2024
  GAME_DATE_EST  HOME_TEAM_WINS  PTS_home  FG_PCT_home  FG3_PCT_home  \
0    2025-03-21               1       128         54.0          40.0   
1    2025-03-21               1       128         57.7          50.0   
2    2025-03-21               1       123         52.2          45.2   
3    2025-03-21               0        99         45.2          40.0   
4    2025-03-21               1       123         52.5          35.3   

   FT_PCT_home  REB_home  AST_home  HOME_TEAM_ID   GAME_ID  PTS_away  \
0         80.0        38        30    1610612757  22401022       109   
1         80.8        43        29    1610612746  22401023       108   
2        100.0        46        33    1610612756  22401021       112   
3         75.0        38        24    1610612762  22401020       121   
4         75.0        44        25    1610612742  22401019       117   

   FG_PCT_away  FG3_PCT_away  FT_PCT_away  REB_away  AST_away  \
0         50.0          34.6         85.7        31        24   



**Retrieve today's games**

In [5]:
# Today's games on the NBA schedule: the team ids of each matchup plus the matching game ids.
# `matchups` is None when nothing is scheduled.
matchups, game_ids = get_todays_matchups(SCRAPINGANT_API_KEY, driver)

if matchups is not None:
    print(matchups)
    print(game_ids)
else:
    print("No games today")

[['1610612751', '1610612754'], ['1610612744', '1610612737'], ['1610612764', '1610612752'], ['1610612749', '1610612758'], ['1610612741', '1610612747']]
['22401024', '22401025', '22401026', '22401027', '22401028']


**Close Webdriver**

In [6]:
# Only the Selenium path created a browser session (driver is None for SCRAPINGANT).
# quit() ends the whole WebDriver session - every window plus the driver process - whereas
# close() only closes the current window and can leave the driver process running.
if WEBSCRAPER == "SELENIUM":
    driver.quit()

**Check if anything is going on in the season**

In [7]:
# The workflow only has work to do if there are newly completed games or games scheduled today.
# UPDATE_WF gates the feature-store cells further down.
UPDATE_WF = not (df_new.empty and matchups is None)

if not UPDATE_WF:
    print("No new games to process")
    exit()

**Create Rows for Today's Games with Empty Stats**

In [8]:
# Build placeholder rows for today's scheduled games, laid out like the scraped results in df_new.
# NOTE(review): SEASON is only assigned by the scraping cell when df_new is non-empty, so this
# cell raises NameError on a day with scheduled games but no newly completed ones - confirm
# whether that case can occur and how the season should be derived then.

if matchups is None:
    print("No games going on. Nothing to do.")
    exit()

else:

    # Resolve the date once so every row carries the same value.
    # "America/New_York" follows daylight saving time; pytz's "EST" is a fixed UTC-5 zone that
    # lags Eastern wall-clock time by an hour in summer and so yields yesterday's date shortly
    # after midnight.
    today_eastern = datetime.now(timezone("America/New_York")).strftime("%Y-%m-%d")

    # Each matchup is indexed as [visitor, home]; game_ids is aligned with matchups by position.
    todays_rows = [
        {
            "HOME_TEAM_ID": matchup[1],
            "VISITOR_TEAM_ID": matchup[0],
            "GAME_DATE_EST": today_eastern,
            "GAME_ID": int(game_ids[i]),
            "SEASON": SEASON,
        }
        for i, matchup in enumerate(matchups)
    ]
    todays_games = pd.DataFrame(todays_rows)

    # Build the frame in one step instead of concatenating row by row (quadratic copying).
    # Column layout matches df_new (its columns first, any extra keys after); columns without
    # a value yet come out as NaN.
    extra_columns = [col for col in todays_games.columns if col not in df_new.columns]
    df_today = todays_games.reindex(columns=[*df_new.columns, *extra_columns])

    # blank rows will be filled with 0 to prevent issues with feature engineering
    df_today = df_today.fillna(0)

    print(df_today)

  GAME_DATE_EST  HOME_TEAM_WINS  PTS_home  FG_PCT_home  FG3_PCT_home  \
0    2025-03-22             0.0       0.0          0.0           0.0   
1    2025-03-22             0.0       0.0          0.0           0.0   
2    2025-03-22             0.0       0.0          0.0           0.0   
3    2025-03-22             0.0       0.0          0.0           0.0   
4    2025-03-22             0.0       0.0          0.0           0.0   

   FT_PCT_home  REB_home  AST_home HOME_TEAM_ID   GAME_ID  PTS_away  \
0          0.0       0.0       0.0   1610612754  22401024       0.0   
1          0.0       0.0       0.0   1610612737  22401025       0.0   
2          0.0       0.0       0.0   1610612752  22401026       0.0   
3          0.0       0.0       0.0   1610612758  22401027       0.0   
4          0.0       0.0       0.0   1610612747  22401028       0.0   

   FG_PCT_away  FG3_PCT_away  FT_PCT_away  REB_away  AST_away VISITOR_TEAM_ID  \
0          0.0           0.0          0.0       0.0       0

**Access Feature Store**

In [9]:
if UPDATE_WF:

    project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)

    # Hopsworks has been unreliable, so the feature-store lookup is retried on KeyError:
    # up to `tries` attempts, pausing 30 seconds between them, then a ValueError is raised.
    tries = 5
    attempt = 0

    while True:
        attempt += 1
        try:
            fs = project.get_feature_store()
            break
        except KeyError:
            if attempt >= tries:
                raise ValueError("HOPSWORKS failed to connect")
            time.sleep(30)

2025-03-22 12:19:41,700 INFO: Initializing external client


2025-03-22 12:19:41,700 INFO: Base URL: https://c.app.hopsworks.ai:443


2025-03-22 12:19:42,375 INFO: Python Engine initialized.



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/90261


**Access Feature Group**

In [10]:
# Hopsworks has been unreliable, so the feature-group lookup is retried on KeyError:
# up to `tries` attempts, pausing 30 seconds between them, then a ValueError is raised.
tries = 5

if UPDATE_WF:
    attempt = 0

    while True:
        attempt += 1
        try:
            rolling_stats_fg = fs.get_feature_group(
                name="rolling_stats",
                version=FEATURE_GROUP_VERSION,
            )
            break
        except KeyError:
            if attempt >= tries:
                raise ValueError("HOPSWORKS failed to connect")
            time.sleep(30)

**Query Old Data Needed for Feature Engineering of New Data**

To generate features like rolling averages for the new games, older data from previous games is needed since some of the rolling averages might extend back 15 or 20 games or so.

In [11]:
# Raw (non-engineered) columns to pull back from the feature group; names are lowercase
# as stored by Hopsworks.
BASE_FEATURES = [
    "game_date_est",
    "game_id",
    "home_team_id",
    "visitor_team_id",
    "season",
    "pts_home",
    "fg_pct_home",
    "ft_pct_home",
    "fg3_pct_home",
    "ast_home",
    "reb_home",
    "pts_away",
    "fg_pct_away",
    "ft_pct_away",
    "fg3_pct_away",
    "ast_away",
    "reb_away",
    "home_team_wins",
]

if UPDATE_WF:
    ds_query = rolling_stats_fg.select(BASE_FEATURES)

    # Two layers of retries, each limited to `tries` attempts with a 10 second pause:
    #   inner loop - the read itself raised KeyError
    #   outer loop - the read succeeded but returned an empty frame
    tries = 5

    for outer_attempt in range(tries):
        for inner_attempt in range(tries):
            try:
                df_old = ds_query.read()
                break
            except KeyError:
                if inner_attempt == tries - 1:
                    raise ValueError("HOPSWORKS failed to connect")
                time.sleep(10)

        if not df_old.empty:
            break

        if outer_attempt == tries - 1:
            raise ValueError("HOPSWORKS failed to return data")
        time.sleep(10)

Reading data from Hopsworks, using Hopsworks Feature Query Service.   

Reading data from Hopsworks, using Hopsworks Feature Query Service..   

Reading data from Hopsworks, using Hopsworks Feature Query Service...   

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.46s) 


**Convert Feature Names back to original mixed case**

In [12]:
# hopsworks converts all feature names to lowercase, and for code reuse, we need to convert them back
# (Two bare expressions that used to sit here - `df_old` and a filter on PTS_home == 0 - were
# evaluated and discarded: only the last expression of a cell is displayed, so they had no effect.)
if UPDATE_WF:
    df_old = convert_feature_names(df_old)
    print(df_old)

                  GAME_DATE_EST   GAME_ID  HOME_TEAM_ID  VISITOR_TEAM_ID  \
0     2018-01-05 00:00:00+00:00  21700570    1610612738       1610612750   
1     2006-12-11 00:00:00+00:00  20600301    1610612753       1610612756   
2     2007-11-10 00:00:00+00:00  20700084    1610612758       1610612750   
3     2005-01-11 00:00:00+00:00  20400504    1610612760       1610612746   
4     2016-10-29 00:00:00+00:00  21600028    1610612752       1610612763   
...                         ...       ...           ...              ...   
26258 2025-01-23 00:00:00+00:00  22400983    1610612746       1610612764   
26259 2015-04-10 00:00:00+00:00  21401175    1610612753       1610612761   
26260 2005-01-15 00:00:00+00:00  20400537    1610612745       1610612759   
26261 2012-03-07 00:00:00+00:00  21100575    1610612749       1610612741   
26262 2009-01-23 00:00:00+00:00  20800635    1610612750       1610612740   

       SEASON  PTS_home  FG_PCT_home  FT_PCT_home  FG3_PCT_home  AST_home  \
0        2

**Update Yesterday's Matchup Predictions with New Final Results**

In [13]:
# Rows with 0 home points are the placeholders that were used for yesterday's predictions.
# Rather than updating them in place, drop them and append the freshly scraped final
# results in df_new.
# NOTE(review): this reassigns df_old from itself, so re-running the cell appends df_new again.
if UPDATE_WF:

    has_final_score = df_old["PTS_home"] != 0
    df_old = pd.concat([df_old[has_final_score], df_new], ignore_index=True)
    print(df_old)

                   GAME_DATE_EST   GAME_ID  HOME_TEAM_ID  VISITOR_TEAM_ID  \
0      2018-01-05 00:00:00+00:00  21700570    1610612738       1610612750   
1      2006-12-11 00:00:00+00:00  20600301    1610612753       1610612756   
2      2007-11-10 00:00:00+00:00  20700084    1610612758       1610612750   
3      2005-01-11 00:00:00+00:00  20400504    1610612760       1610612746   
4      2016-10-29 00:00:00+00:00  21600028    1610612752       1610612763   
...                          ...       ...           ...              ...   
26278        2025-03-19 00:00:00  22401003    1610612759       1610612752   
26279        2025-03-19 00:00:00  22401001    1610612750       1610612740   
26280        2025-03-19 00:00:00  22401002    1610612760       1610612755   
26281        2025-03-19 00:00:00  22401000    1610612748       1610612765   
26282        2025-03-19 00:00:00  22400998    1610612754       1610612742   

       SEASON  PTS_home  FG_PCT_home  FT_PCT_home  FG3_PCT_home  AST_home  

**Add Today's Matchups for Feature Engineering**

In [14]:
if UPDATE_WF:

    # Append today's placeholder rows (when there are any) so features get computed for them.
    if matchups is not None:
        df_combined = pd.concat([df_old, df_today], ignore_index=True)
        print(df_combined)
    else:
        print("No games today")
        df_combined = df_old

                   GAME_DATE_EST   GAME_ID HOME_TEAM_ID VISITOR_TEAM_ID  \
0      2018-01-05 00:00:00+00:00  21700570   1610612738      1610612750   
1      2006-12-11 00:00:00+00:00  20600301   1610612753      1610612756   
2      2007-11-10 00:00:00+00:00  20700084   1610612758      1610612750   
3      2005-01-11 00:00:00+00:00  20400504   1610612760      1610612746   
4      2016-10-29 00:00:00+00:00  21600028   1610612752      1610612763   
...                          ...       ...          ...             ...   
26283                 2025-03-22  22401024   1610612754      1610612751   
26284                 2025-03-22  22401025   1610612737      1610612744   
26285                 2025-03-22  22401026   1610612752      1610612764   
26286                 2025-03-22  22401027   1610612758      1610612749   
26287                 2025-03-22  22401028   1610612747      1610612741   

       SEASON  PTS_home  FG_PCT_home  FT_PCT_home  FG3_PCT_home  AST_home  \
0        2017      91.

**Data Processing**

In [15]:
if UPDATE_WF:
    # Run the project's cleaning step, then add the TARGET column (converted to int16 in the
    # feature-engineering cell).
    df_combined = add_TARGET(process_games(df_combined))
    print(df_combined)

                   GAME_DATE_EST   GAME_ID HOME_TEAM_ID VISITOR_TEAM_ID  \
0      2018-01-05 00:00:00+00:00  21700570   1610612738      1610612750   
1      2006-12-11 00:00:00+00:00  20600301   1610612753      1610612756   
2      2007-11-10 00:00:00+00:00  20700084   1610612758      1610612750   
3      2005-01-11 00:00:00+00:00  20400504   1610612760      1610612746   
4      2016-10-29 00:00:00+00:00  21600028   1610612752      1610612763   
...                          ...       ...          ...             ...   
26283                 2025-03-22  22401024   1610612754      1610612751   
26284                 2025-03-22  22401025   1610612737      1610612744   
26285                 2025-03-22  22401026   1610612752      1610612764   
26286                 2025-03-22  22401027   1610612758      1610612749   
26287                 2025-03-22  22401028   1610612747      1610612741   

       SEASON  PTS_home  FG_PCT_home  FT_PCT_home  FG3_PCT_home  AST_home  \
0        2017      91.

**Feature Engineering**

In [16]:
# Feature engineering adds:
#   - rolling averages of key stats
#   - win/lose streaks
#   - home/away streaks
#   - specific matchup (team X vs team Y) rolling averages and streaks
if UPDATE_WF:
    df_combined = process_features(df_combined)

    # fix type conversion issues with hopsworks
    for int_col in ("TARGET", "HOME_TEAM_WINS"):
        df_combined[int_col] = df_combined[int_col].astype("int16")

    print(df_combined)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


['HOME_PTS_home_AVG_LAST_3_HOME', 'HOME_PTS_home_AVG_LAST_7_HOME', 'HOME_PTS_home_AVG_LAST_10_HOME', 'HOME_FG_PCT_home_AVG_LAST_3_HOME', 'HOME_FG_PCT_home_AVG_LAST_7_HOME', 'HOME_FG_PCT_home_AVG_LAST_10_HOME', 'HOME_FT_PCT_home_AVG_LAST_3_HOME', 'HOME_FT_PCT_home_AVG_LAST_7_HOME', 'HOME_FT_PCT_home_AVG_LAST_10_HOME', 'HOME_FG3_PCT_home_AVG_LAST_3_HOME', 'HOME_FG3_PCT_home_AVG_LAST_7_HOME', 'HOME_FG3_PCT_home_AVG_LAST_10_HOME', 'HOME_AST_home_AVG_LAST_3_HOME', 'HOME_AST_home_AVG_LAST_7_HOME', 'HOME_AST_home_AVG_LAST_10_HOME', 'HOME_REB_home_AVG_LAST_3_HOME', 'HOME_REB_home_AVG_LAST_7_HOME', 'HOME_REB_home_AVG_LAST_10_HOME', 'HOME_TEAM_ID', 'GAME_DATE_EST']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


['VISITOR_TEAM_WINS_AVG_LAST_3_VISITOR', 'VISITOR_TEAM_WINS_AVG_LAST_7_VISITOR', 'VISITOR_TEAM_WINS_AVG_LAST_10_VISITOR', 'VISITOR_PTS_away_AVG_LAST_3_VISITOR', 'VISITOR_PTS_away_AVG_LAST_7_VISITOR', 'VISITOR_PTS_away_AVG_LAST_10_VISITOR', 'VISITOR_FG_PCT_away_AVG_LAST_3_VISITOR', 'VISITOR_FG_PCT_away_AVG_LAST_7_VISITOR', 'VISITOR_FG_PCT_away_AVG_LAST_10_VISITOR', 'VISITOR_FT_PCT_away_AVG_LAST_3_VISITOR', 'VISITOR_FT_PCT_away_AVG_LAST_7_VISITOR', 'VISITOR_FT_PCT_away_AVG_LAST_10_VISITOR', 'VISITOR_FG3_PCT_away_AVG_LAST_3_VISITOR', 'VISITOR_FG3_PCT_away_AVG_LAST_7_VISITOR', 'VISITOR_FG3_PCT_away_AVG_LAST_10_VISITOR', 'VISITOR_AST_away_AVG_LAST_3_VISITOR', 'VISITOR_AST_away_AVG_LAST_7_VISITOR', 'VISITOR_AST_away_AVG_LAST_10_VISITOR', 'VISITOR_REB_away_AVG_LAST_3_VISITOR', 'VISITOR_REB_away_AVG_LAST_7_VISITOR', 'VISITOR_REB_away_AVG_LAST_10_VISITOR', 'VISITOR_TEAM_ID', 'GAME_DATE_EST']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


['PTS_AVG_LAST_3_ALL', 'PTS_AVG_LAST_7_ALL', 'PTS_AVG_LAST_10_ALL', 'PTS_AVG_LAST_15_ALL', 'FG_PCT_AVG_LAST_3_ALL', 'FG_PCT_AVG_LAST_7_ALL', 'FG_PCT_AVG_LAST_10_ALL', 'FG_PCT_AVG_LAST_15_ALL', 'FT_PCT_AVG_LAST_3_ALL', 'FT_PCT_AVG_LAST_7_ALL', 'FT_PCT_AVG_LAST_10_ALL', 'FT_PCT_AVG_LAST_15_ALL', 'FG3_PCT_AVG_LAST_3_ALL', 'FG3_PCT_AVG_LAST_7_ALL', 'FG3_PCT_AVG_LAST_10_ALL', 'FG3_PCT_AVG_LAST_15_ALL', 'AST_AVG_LAST_3_ALL', 'AST_AVG_LAST_7_ALL', 'AST_AVG_LAST_10_ALL', 'AST_AVG_LAST_15_ALL', 'REB_AVG_LAST_3_ALL', 'REB_AVG_LAST_7_ALL', 'REB_AVG_LAST_10_ALL', 'REB_AVG_LAST_15_ALL', 'TEAM1', 'GAME_DATE_EST']


      GAME_DATE_EST   GAME_ID  HOME_TEAM_ID  VISITOR_TEAM_ID  SEASON  \
0        2003-10-28  20300003    1610612747       1610612742    2003   
1        2003-10-28  20300001    1610612755       1610612748    2003   
2        2003-10-28  20300002    1610612759       1610612756    2003   
3        2003-10-29  20300006    1610612740       1610612737    2003   
4        2003-10-29  20300010    1610612741       1610612764    2003   
...             ...       ...           ...              ...     ...   
26273    2025-03-22  22401025    1610612737       1610612744    2024   
26274    2025-03-22  22401028    1610612747       1610612741    2024   
26275    2025-03-22  22401026    1610612752       1610612764    2024   
26276    2025-03-22  22401024    1610612754       1610612751    2024   
26277    2025-03-22  22401027    1610612758       1610612749    2024   

       PTS_home  FG_PCT_home  FT_PCT_home  FG3_PCT_home  AST_home  ...  \
0           109     0.505859     0.600098      0.350098      

**Insert New Data into Feature Group**

In [17]:
# Replace the "rolling_stats" feature group with the freshly engineered df_combined.
#
# Hopsworks has been unreliable (errors, or no data returned), so the whole
# delete / create / insert sequence is retried on KeyError: up to `tries` attempts with a
# 30 second pause between them, then a ValueError is raised.
#
# Overwriting the feature group doesn't work for some reason; the temporary solution is to
# delete the FG and create it again with the new data.
#
# NOTE(review): delete() runs before the replacement data is written and sits inside the
# retry loop. If a later step raises KeyError, the next attempt calls delete() again on
# whatever rolling_stats_fg refers to at that point - confirm this is safe.
# NOTE(review): primary_key uses upper-case names while event_time is lower-case; presumably
# Hopsworks normalizes feature-name case - verify.
tries = 5

if UPDATE_WF:
    for i in range(tries):

        try:
            rolling_stats_fg.delete()

            rolling_stats_fg = fs.create_feature_group(
                name="rolling_stats",
                version=FEATURE_GROUP_VERSION,
                description="Rolling averages and current win/lose streaks",
                primary_key=["GAME_DATE_EST", "HOME_TEAM_ID"],
                event_time="game_date_est",
            )

            # wait_for_job=False: do not block the notebook on the materialization job
            rolling_stats_fg.insert(df_combined, write_options={"wait_for_job": False})

        except KeyError as e:
            if i < tries - 1:  # i is zero indexed; retry unless this was the last attempt
                time.sleep(30)
                continue
            else:
                raise ValueError("HOPSWORKS failed to connect")
        break





Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/90261/fs/90180/fg/1422834


Uploading Dataframe: 0.00% |          | Rows 0/26278 | Elapsed Time: 00:00 | Remaining Time: ?

Uploading Dataframe: 6.95% |▋         | Rows 1827/26278 | Elapsed Time: 00:01 | Remaining Time: 00:13

Uploading Dataframe: 13.96% |█▍        | Rows 3668/26278 | Elapsed Time: 00:02 | Remaining Time: 00:12

Uploading Dataframe: 21.38% |██▏       | Rows 5619/26278 | Elapsed Time: 00:03 | Remaining Time: 00:10

Uploading Dataframe: 28.86% |██▉       | Rows 7584/26278 | Elapsed Time: 00:04 | Remaining Time: 00:09

Uploading Dataframe: 36.22% |███▌      | Rows 9518/26278 | Elapsed Time: 00:05 | Remaining Time: 00:08

Uploading Dataframe: 43.65% |████▎     | Rows 11471/26278 | Elapsed Time: 00:06 | Remaining Time: 00:07

Uploading Dataframe: 51.06% |█████     | Rows 13418/26278 | Elapsed Time: 00:07 | Remaining Time: 00:06

Uploading Dataframe: 58.54% |█████▊    | Rows 15384/26278 | Elapsed Time: 00:08 | Remaining Time: 00:05

Uploading Dataframe: 65.95% |██████▌   | Rows 17330/26278 | Elapsed Time: 00:09 | Remaining Time: 00:04

Uploading Dataframe: 73.26% |███████▎  | Rows 19252/26278 | Elapsed Time: 00:10 | Remaining Time: 00:03

Uploading Dataframe: 80.63% |████████  | Rows 21189/26278 | Elapsed Time: 00:11 | Remaining Time: 00:02

Uploading Dataframe: 88.09% |████████▊ | Rows 23147/26278 | Elapsed Time: 00:12 | Remaining Time: 00:01

Uploading Dataframe: 95.44% |█████████▌| Rows 25081/26278 | Elapsed Time: 00:13 | Remaining Time: 00:00

Uploading Dataframe: 100.00% |██████████| Rows 26278/26278 | Elapsed Time: 00:13 | Remaining Time: 00:00




Launching job: rolling_stats_2_offline_fg_materialization


Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/90261/jobs/named/rolling_stats_2_offline_fg_materialization/executions
