### Production Features Pipeline

In [1]:
# select web scraper; 'SCRAPINGANT' or 'SELENIUM'
# SCRAPINGANT requires a subscription but includes a proxy server

# WEBSCRAPER = 'SCRAPINGANT'
WEBSCRAPER = "SELENIUM"

In [2]:
import os

import pandas as pd
import numpy as np

import hopsworks

from datetime import datetime, timedelta
from pytz import timezone

import json

import time

from pathlib import Path  # for Windows/Linux compatibility

# change working directory to project root when running from notebooks folder to make it easier to import modules
# and to access sibling folders
print(os.path.basename(os.getcwd()))
if os.path.basename(os.getcwd()) == "notebooks":

    INDIRECT = False

    os.chdir(
        ".."
    ) 
print(os.path.basename(os.getcwd()))

from src.utils.webscraping import (
    get_new_games,
    activate_web_driver,
    get_todays_matchups,
)

from src.data.cleaning import (
    process_games,
    add_TARGET,
)

from src.data.build_features import (
    process_features,
)

from src.utils.hopsworks_utils import (
    save_feature_names,
    convert_feature_names,
)

from src.utils.constants import (
    FEATURE_GROUP_VERSION,
)

DATAPATH = Path(r"data")

notebooks
BasketBall-prediction


**Load API keys**

In [3]:
from dotenv import load_dotenv

load_dotenv()

try:
    HOPSWORKS_API_KEY = os.environ["HOPSWORKS_API_KEY"]
except:
    raise Exception("Set environment variable HOPSWORKS_API_KEY")

# if scrapingant is chosen then set the api key, otherwise load the selenium webdriver
if WEBSCRAPER == "SCRAPINGANT":
    try:
        SCRAPINGANT_API_KEY = os.environ["SCRAPINGANT_API_KEY"]
    except:
        raise Exception("Set environment variable SCRAPINGANT_API_KEY")
    driver = None

elif WEBSCRAPER == "SELENIUM":
    driver = activate_web_driver()
    SCRAPINGANT_API_KEY = ""

**Scrape New Completed Games and Format Them**

In [4]:
df_new = get_new_games(SCRAPINGANT_API_KEY, driver)

if df_new.empty:
    print("No new games to process")
else:

    # get the SEASON of the last game in the database
    # this will used when constructing rows for prediction
    SEASON = df_new["SEASON"].max()

    print(df_new.head())


Current month is 03
Scraping https://www.nba.com/stats/teams/boxscores?SeasonType=Regular+Season&DateFrom=03/14/25&DateTo=03/21/25


No new games to process


**Retrieve todays games**

In [5]:
# retrieve list of teams playing today

# get today's games on NBA schedule
matchups, game_ids = get_todays_matchups(SCRAPINGANT_API_KEY, driver)

if matchups is None:
    print("No games today")
else:
    print(matchups)
    print(game_ids)

Friday, March 21
[['1610612753', '1610612764'], ['1610612745', '1610612748'], ['1610612740', '1610612750'], ['1610612766', '1610612760'], ['1610612755', '1610612759'], ['1610612765', '1610612742'], ['1610612738', '1610612762'], ['1610612739', '1610612756'], ['1610612743', '1610612757'], ['1610612763', '1610612746']]
['22401015', '22401016', '22401017', '22401018', '22401019', '22401020', '22401021', '22401022', '22401023']


**Close Webdriver**

In [6]:
if WEBSCRAPER == "SELENIUM":
    driver.close()

**Check if anything is going on in the season**

In [7]:
UPDATE_WF = True

if (df_new.empty) and (matchups is None):
    print("No new games to process")

    UPDATE_WF = False
    exit()

**Access Feature Store**

In [8]:
print(HOPSWORKS_API_KEY[:5])

NJgan
