In [1]:
# Name of the Hopsworks project; passed as `project=` to hopsworks.login() further down.
HOPSWORKS_PROJECT_NAME = 'nyc_taxiride_demand'

In [2]:
import sys
sys.path.append('../')

import os
from dotenv import load_dotenv
from src.paths import PARENT_DIR

# Pull the key-value pairs stored in the parent directory's .env file
# into the process environment.
dotenv_path = PARENT_DIR / '.env'
load_dotenv(dotenv_path)

# Subscript access (rather than .get) raises KeyError right here when the
# key is not set, instead of failing later at login time.
HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']

In [3]:
from datetime import datetime
import pandas as pd
from src.data import load_raw_data

# Download raw ride data for every year from `from_year` up to and including
# the current calendar year.
from_year = 2022
to_year = datetime.now().year
print(f"Downloading raw data from {from_year} to {to_year}")

# Collect one frame per year and concatenate once at the end. Calling
# pd.concat inside the loop re-copies every previously accumulated row on each
# iteration, which is wasteful for a dataset of this size, and also seeds the
# result with an empty DataFrame.
rides_per_year = [
    load_raw_data(year)  # downloads (or reuses from local storage) one whole year
    for year in range(from_year, to_year + 1)
]

# pd.concat raises on an empty list, so keep the original "empty frame" result
# for an empty year range.
rides = pd.concat(rides_per_year) if rides_per_year else pd.DataFrame()

Downloading raw data from 2022 to 2025
File 2022-01 was already in local storage
File 2022-02 was already in local storage
File 2022-03 was already in local storage
File 2022-04 was already in local storage
File 2022-05 was already in local storage
File 2022-06 was already in local storage
File 2022-07 was already in local storage
File 2022-08 was already in local storage
File 2022-09 was already in local storage
File 2022-10 was already in local storage
File 2022-11 was already in local storage
File 2022-12 was already in local storage
File 2023-01 was already in local storage
File 2023-02 was already in local storage
File 2023-03 was already in local storage
File 2023-04 was already in local storage
File 2023-05 was already in local storage
File 2023-06 was already in local storage
File 2023-07 was already in local storage
File 2023-08 was already in local storage
File 2023-09 was already in local storage
File 2023-10 was already in local storage
File 2023-11 was already in local sto

In [4]:
# Sanity check: total number of rows in the concatenated raw rides frame,
# printed as `len(rides) = <n>`.
print(f"{len(rides) = }")

len(rides) = 163550837


In [5]:
from src.data import transform_raw_data_into_ts_data

# Convert the raw rides into the time-series frame (`ts_data`) that is
# validated and written to the feature store further down.
ts_data = transform_raw_data_into_ts_data(rides)

100%|██████████| 263/263 [00:21<00:00, 12.51it/s]


In [6]:
import hopsworks

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Log in to Hopsworks with the API key read from the environment and get a
# handle to the project named by HOPSWORKS_PROJECT_NAME.
project = hopsworks.login(
    project=HOPSWORKS_PROJECT_NAME,
    api_key_value=HOPSWORKS_API_KEY
)

2025-12-31 09:24:55,888 INFO: Initializing external client
2025-12-31 09:24:55,890 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-12-31 09:24:57,265 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1329302


In [8]:
# Handle to the project's feature store; used below to get or create the feature group.
feature_store = project.get_feature_store()

In [9]:
# Data is written to the feature store through the feature group API.
# A feature group is identified by its name and its version.
FEATURE_GROUP_NAME = 'time_series_hourly_feature_group'
FEATURE_GROUP_VERSION = 1

In [10]:
# Get the feature group identified by FEATURE_GROUP_NAME / FEATURE_GROUP_VERSION,
# or create it if it does not exist yet.
feature_group = feature_store.get_or_create_feature_group(
    name=FEATURE_GROUP_NAME,
    version=FEATURE_GROUP_VERSION,
    description="Time-series data at hourly frequency",
    primary_key=['pickup_location_id', 'pickup_hour'], # Together, location id and hour uniquely identify each row
    event_time='pickup_hour' # Column holding the timestamp of each event
)

In [11]:
# Validate data before inserting: print a summary of the frame and refuse to
# write anything if the feature group's primary key is not unique.
primary_key_columns = ['pickup_location_id', 'pickup_hour']

print("Data shape:", ts_data.shape)
print("\nData types:")
print(ts_data.dtypes)
print("\nFirst few rows:")
print(ts_data.head())
print("\nMissing values:")
print(ts_data.isnull().sum())
print("\nCheck for duplicates in primary key:")
# Number of rows per (location, hour) pair; any count above 1 is a key collision.
duplicates = ts_data.groupby(primary_key_columns).size()
num_duplicates = (duplicates > 1).sum()
print(f"Duplicate primary keys: {num_duplicates}")
if num_duplicates > 0:
    print("Duplicate rows:")
    print(duplicates[duplicates > 1].head(10))
    # Previously the cell only reported the collisions and inserted anyway.
    # Stop here so that rows with ambiguous keys never reach the feature store.
    raise ValueError(
        f"Found {num_duplicates} duplicated primary keys in ts_data; "
        "aborting insert into the feature store."
    )

# save data to the feature store
print("\nInserting data to feature store...")
feature_group.insert(
    ts_data,
    write_options={"wait_for_job": False},  # Don't wait for the materialization job to finish.
)
print("Data insert job submitted successfully")

Data shape: (9026160, 3)

Data types:
pickup_hour           datetime64[ns]
rides                          int64
pickup_location_id             int64
dtype: object

First few rows:
          pickup_hour  rides  pickup_location_id
0 2022-01-01 00:00:00     11                   4
1 2022-01-01 01:00:00     15                   4
2 2022-01-01 02:00:00     26                   4
3 2022-01-01 03:00:00      8                   4
4 2022-01-01 04:00:00      9                   4

Missing values:
pickup_hour           0
rides                 0
pickup_location_id    0
dtype: int64

Check for duplicates in primary key:
Duplicate primary keys: 0

Inserting data to feature store...
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1329302/fs/1317957/fg/1880534


Uploading Dataframe: 100.00% |██████████| Rows 9026160/9026160 | Elapsed Time: 09:00 | Remaining Time: 00:00


Launching job: time_series_hourly_feature_group_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1329302/jobs/named/time_series_hourly_feature_group_1_offline_fg_materialization/executions
Data insert job submitted successfully
