### Production Features Pipeline

In [1]:
# Select the web scraper backend: 'SCRAPINGANT' or 'SELENIUM'.
# SCRAPINGANT requires a subscription but includes a proxy server.
# This value is compared against both names further down to decide whether a
# ScrapingAnt API key is read or a Selenium webdriver is started.

WEBSCRAPER = 'SCRAPINGANT'
# WEBSCRAPER = 'SELENIUM'

In [2]:
import os

import pandas as pd
import numpy as np

import hopsworks

from datetime import datetime, timedelta
from pytz import timezone

import json

import time

from pathlib import Path  #for Windows/Linux compatibility

# Make the project root the working directory so the `src` package and sibling
# folders can be reached when the notebook is run from the notebooks folder.
# Only step up one level when `src` is not already visible from here: an
# unconditional chdir('..') climbs one directory further every time this cell
# is re-run (or when the notebook is started from the project root), which
# breaks the `src` imports below.
if not Path('src').is_dir():
    os.chdir('..')

 
from src.utils.webscraping import (
    get_new_games,
    activate_web_driver,
    get_todays_matchups,
)

from src.data.cleaning import (
    process_games,
    add_TARGET,
)

from src.data.build_features import (
    process_features,
)

from src.utils.hopsworks_utils import (
    save_feature_names,
    convert_feature_names,
)

from src.utils.constants import (
    FEATURE_GROUP_VERSION,
)

# Relative path to the data folder; it resolves against the current working
# directory, which is adjusted by the os.chdir call earlier in this cell.
DATAPATH = Path(r'data')

**Load API keys**

In [3]:
from dotenv import load_dotenv

# load_dotenv() runs before any key is read from os.environ below
load_dotenv()


def _require_env(name):
    """Return the value of the environment variable `name`.

    Raises:
        Exception: if the variable is not set; the message names the variable.
    """
    try:
        return os.environ[name]
    except KeyError as err:
        # Catch only the missing-key case: a bare `except:` would also trap
        # KeyboardInterrupt / SystemExit and hide unrelated failures.
        raise Exception(f'Set environment variable {name}') from err


HOPSWORKS_API_KEY = _require_env('HOPSWORKS_API_KEY')

# ScrapingAnt needs an API key and no local driver;
# Selenium needs a webdriver and an empty key.
if WEBSCRAPER == 'SCRAPINGANT':
    SCRAPINGANT_API_KEY = _require_env('SCRAPINGANT_API_KEY')
    driver = None

elif WEBSCRAPER == 'SELENIUM':
    driver = activate_web_driver('chromium')
    SCRAPINGANT_API_KEY = ""

else:
    # Fail here instead of leaving `driver` / `SCRAPINGANT_API_KEY` undefined,
    # which would only surface later as a NameError in the scraping cells.
    raise ValueError(
        f"Unknown WEBSCRAPER {WEBSCRAPER!r}; expected 'SCRAPINGANT' or 'SELENIUM'"
    )
    



**Scrape New Completed Games and Format Them**

In [4]:
df_new = get_new_games(SCRAPINGANT_API_KEY, driver)

if not df_new.empty:

    # Remember the highest SEASON value among the newly scraped games;
    # it will be used when constructing rows for prediction.
    # NOTE: SEASON is left undefined when df_new is empty.
    SEASON = df_new['SEASON'].max()

    df_new.head()

else:
    print('No new games to process')

Current month is 01
Scraping https://www.nba.com/stats/teams/boxscores?SeasonType=Regular+Season&DateFrom=12/26/24&DateTo=01/02/25


No new games to process


**Retrieve Today's Games**

In [5]:
# Look up the games on today's NBA schedule.
# The helper hands back the matchups together with their game ids;
# `matchups` is None when nothing is scheduled.
matchups, game_ids = get_todays_matchups(SCRAPINGANT_API_KEY, driver)

if matchups is not None:
    print(matchups)
    print(game_ids)
else:
    print('No games today')



Wednesday, January 1
Thursday, January 2
Friday, January 3
Saturday, January 4
Sunday, January 5
Monday, January 6
Tuesday, January 7
Wednesday, January 8
Thursday, January 9
Friday, January 10
Saturday, January 11
Sunday, January 12
Monday, January 13
Tuesday, January 14
Wednesday, January 15
Thursday, January 16
Friday, January 17
Saturday, January 18
Sunday, January 19
Monday, January 20
Tuesday, January 21
Wednesday, January 22
Thursday, January 23
Friday, January 24
Saturday, January 25
Sunday, January 26
Monday, January 27
Tuesday, January 28
Wednesday, January 29
Thursday, January 30
Friday, January 31
No games today


**Close Webdriver**

In [6]:
# Scraping is finished, so shut the Selenium session down completely.
# quit() closes every window and ends the driver session, whereas close()
# only closes the current window and can leave the driver process running.
if WEBSCRAPER == 'SELENIUM':
    driver.quit()

**Check if anything is going on in the season**

In [7]:
# The remaining workflow is skipped only when there are neither newly
# completed games nor any games scheduled for today.
UPDATE_WF = not (df_new.empty and (matchups is None))

if not UPDATE_WF:
    print('No new games to process')
    

No new games to process


**Create Rows for Today's Games with Empty Stats**

In [8]:
# Reformat today's matchups into rows that share the columns of df_new.
# Stats columns are not known yet and are filled with 0 below.

if matchups is None:
    print('No games going on. Nothing to do.')

else:

    # SEASON is only assigned when new completed games were scraped. Surface
    # that situation with an explicit message instead of a bare NameError.
    # NOTE(review): a proper fallback (e.g. the latest season already stored in
    # the feature group) needs data that is not loaded at this point.
    try:
        current_season = SEASON
    except NameError:
        raise RuntimeError(
            "SEASON is not defined because no new completed games were scraped; "
            "cannot build rows for today's matchups"
        ) from None

    # Compute the date once so every row carries the same value.
    # NOTE(review): pytz 'EST' is a fixed UTC-5 offset without daylight saving;
    # confirm this matches what the scraping helpers use.
    today_est = datetime.now(timezone('EST')).strftime("%Y-%m-%d")

    # each matchup is indexed as (visitor, home)
    todays_rows = [
        {
            'HOME_TEAM_ID': matchup[1],
            'VISITOR_TEAM_ID': matchup[0],
            'GAME_DATE_EST': today_est,
            'GAME_ID': int(game_ids[i]),
            'SEASON': current_season,
        }
        for i, matchup in enumerate(matchups)
    ]

    # One concat against an empty copy of df_new (keeps its columns) instead of
    # re-copying the growing frame once per game.
    df_today = pd.concat(
        [df_new.drop(df_new.index), pd.DataFrame(todays_rows)],
        ignore_index=True,
    )

    # blank columns are filled with 0 to prevent issues with feature engineering
    df_today = df_today.fillna(0)



No games going on. Nothing to do.


**Access Feature Store**

In [9]:
if UPDATE_WF:

    project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)

    # Hopsworks has been unreliable, so the feature-store lookup is attempted
    # up to `tries` times with a 30 second pause in between. Only a KeyError
    # triggers a retry; any other exception propagates immediately.
    tries = 5

    for attempt in range(1, tries + 1):
        try:
            fs = project.get_feature_store()
            break
        except KeyError:
            if attempt == tries:
                # out of attempts
                raise ValueError('HOPSWORKS failed to connect')
            time.sleep(30)



**Access Feature Group**

In [10]:
# Hopsworks has been unreliable, so the feature-group lookup is attempted up to
# `tries` times with a 30 second pause in between. Only a KeyError triggers a
# retry; any other exception propagates immediately.
tries = 5

if UPDATE_WF:
    for attempt in range(1, tries + 1):
        try:
            rolling_stats_fg = fs.get_feature_group(
                name="rolling_stats",
                version=FEATURE_GROUP_VERSION,
            )
            break
        except KeyError:
            if attempt == tries:
                # out of attempts
                raise ValueError('HOPSWORKS failed to connect')
            time.sleep(30)



**Query Old Data Needed for Feature Engineering of New Data**

Features such as rolling averages for the new games depend on earlier games, because some rolling windows extend back roughly 15 to 20 games. The older data is therefore queried from the feature group before feature engineering.

In [11]:
# Columns requested from the feature group (lowercase names).
BASE_FEATURES = [
    # game identifiers
    'game_date_est', 'game_id', 'home_team_id', 'visitor_team_id', 'season',
    # home team box score
    'pts_home', 'fg_pct_home', 'ft_pct_home', 'fg3_pct_home', 'ast_home', 'reb_home',
    # away team box score
    'pts_away', 'fg_pct_away', 'ft_pct_away', 'fg3_pct_away', 'ast_away', 'reb_away',
    # outcome
    'home_team_wins',
]

if UPDATE_WF:
    ds_query = rolling_stats_fg.select(BASE_FEATURES)

    # Two layers of retries, each limited to `tries` attempts with a 10 second
    # pause: the inner loop repeats a read that raised KeyError, the outer loop
    # repeats the whole read when it came back empty.
    tries = 5
    final_attempt = tries - 1

    for fetch_no in range(tries):

        for read_no in range(tries):
            try:
                df_old = ds_query.read()
                break
            except KeyError:
                if read_no == final_attempt:
                    raise ValueError('HOPSWORKS failed to connect')
                time.sleep(10)

        if not df_old.empty:
            break

        if fetch_no == final_attempt:
            raise ValueError('HOPSWORKS failed to return data')
        time.sleep(10)



**Convert Feature Names back to original mixed case**

In [12]:
# Hopsworks converts all feature names to lowercase; for code reuse they are
# converted back here.
# The leftover inspection expressions were removed: inside an `if` block they
# are never rendered, and one of them ran a full boolean filter whose result
# was discarded.
if UPDATE_WF:
    df_old = convert_feature_names(df_old)

**Update Yesterday's Matchup Predictions with New Final Results**

In [13]:
# Rows still pending a final result are the ones that were used for
# prediction yesterday; they are recognisable by 0 points for the home team.
# They are dropped, and the newly scraped rows are appended in their place.
if UPDATE_WF:
    has_final_score = df_old['PTS_home'] != 0
    df_old = pd.concat([df_old[has_final_score], df_new], ignore_index=True)

**Add Today's Matchups for Feature Engineering**

In [14]:
# Append today's placeholder rows (when there are any) to the historical games
# so both go through feature engineering together.
if UPDATE_WF:

    if matchups is not None:
        df_combined = pd.concat([df_old, df_today], ignore_index=True)
    else:
        print('No games today')
        df_combined = df_old

**Data Processing**

In [15]:
# Run the cleaning step first, then add the TARGET column on its result.
if UPDATE_WF:
    df_combined = add_TARGET(process_games(df_combined))

**Feature Engineering**

In [16]:
# Feature engineering adds:
#   - rolling averages of key stats
#   - win/lose streaks
#   - home/away streaks
#   - specific matchup (team X vs team Y) rolling averages and streaks
if UPDATE_WF:
    df_combined = process_features(df_combined)

    # fix type conversion issues with hopsworks: both label columns go to int16
    for label_col in ('TARGET', 'HOME_TEAM_WINS'):
        df_combined[label_col] = df_combined[label_col].astype('int16')


**Insert New Data into Feature Group**

In [17]:
# Hopsworks has been unreliable, so the insert is attempted up to `tries` times
# with a 30 second pause in between. Only a KeyError triggers a retry; any
# other exception propagates immediately.
tries = 5

if UPDATE_WF:
    for attempt in range(1, tries + 1):
        try:
            rolling_stats_fg.insert(
                df_combined,
                overwrite=True,
                write_options={"wait_for_job": False},
            )
            break
        except KeyError:
            if attempt == tries:
                # out of attempts
                raise ValueError('HOPSWORKS failed to connect')
            time.sleep(30)

