In [18]:
import os
from datetime import datetime

import pandas as pd
from feast import FeatureStore

In [19]:
# Feast feature repository directory, given as a relative path.
feature_store_path = os.path.join("feature_store", "feature_repo")
# Raw driver statistics parquet file inside the repository's data folder.
raw_data_path = os.path.join(feature_store_path, "data", "driver_stats.parquet")

### Check data

In [21]:
# Load the raw driver statistics parquet file into a DataFrame for inspection.
df = pd.read_parquet(raw_data_path)

In [22]:
# Display the raw rows: event_timestamp, driver_id, conv_rate, acc_rate, avg_daily_trips, created.
df

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2024-10-17 12:07:08.228578+00:00,1001,1.000000,1.000000,1000,2024-10-17 12:07:08.228581
1,2024-10-02 11:00:00+00:00,1005,0.429879,0.194598,582,2024-10-17 11:30:07.072000
2,2024-10-02 12:00:00+00:00,1005,0.230119,0.642878,551,2024-10-17 11:30:07.072000
3,2024-10-02 13:00:00+00:00,1005,0.128600,0.674187,38,2024-10-17 11:30:07.072000
4,2024-10-02 14:00:00+00:00,1005,0.400603,0.473636,583,2024-10-17 11:30:07.072000
...,...,...,...,...,...,...
1803,2024-10-17 09:00:00+00:00,1001,0.704237,0.954645,208,2024-10-17 11:30:07.072000
1804,2024-10-17 10:00:00+00:00,1001,0.295067,0.257097,456,2024-10-17 11:30:07.072000
1805,2021-04-12 07:00:00+00:00,1001,0.709758,0.692957,402,2024-10-17 11:30:07.072000
1806,2024-10-09 23:00:00+00:00,1003,0.708664,0.842933,751,2024-10-17 11:30:07.072000


### Historical feature retrieval

In [23]:
# Entity join-key values: the drivers to look up.
driver_ids = [1001, 1002, 1003]
# One lookup timestamp per driver; "event_timestamp" is the reserved column name.
lookup_times = [
    datetime(2021, 4, 12, 10, 59, 42),
    datetime(2021, 4, 12, 8, 12, 10),
    datetime(2021, 4, 12, 16, 40, 26),
]
# Optional label column; Feast does not process it.
satisfaction_labels = [1, 5, 3]

entity_df = pd.DataFrame(
    {
        "driver_id": driver_ids,
        "event_timestamp": lookup_times,
        "label_driver_reported_satisfaction": satisfaction_labels,
    }
)

In [24]:
# Inspect the entity rows before they are passed to get_historical_features.
entity_df

Unnamed: 0,driver_id,event_timestamp,label_driver_reported_satisfaction
0,1001,2021-04-12 10:59:42,1
1,1002,2021-04-12 08:12:10,5
2,1003,2021-04-12 16:40:26,3


In [25]:
# Create the FeatureStore handle for the repository directory at feature_store_path.
store = FeatureStore(repo_path=feature_store_path)



In [26]:
# Fetch the three driver_hourly_stats features for every (driver_id, event_timestamp)
# row of entity_df and materialise the result as a pandas DataFrame.
training_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
).to_df()

print("----- Feature schema -----\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
training_df.info()

----- Feature schema -----

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column                              Non-Null Count  Dtype              
---  ------                              --------------  -----              
 0   driver_id                           3 non-null      int64              
 1   event_timestamp                     3 non-null      datetime64[ns, UTC]
 2   label_driver_reported_satisfaction  3 non-null      int64              
 3   conv_rate                           3 non-null      float32            
 4   acc_rate                            3 non-null      float32            
 5   avg_daily_trips                     3 non-null      int32              
dtypes: datetime64[ns, UTC](1), float32(2), int32(1), int64(2)
memory usage: 240.0 bytes
None


In [27]:
# Preview the entity rows with the retrieved feature columns appended.
training_df.head()

Unnamed: 0,driver_id,event_timestamp,label_driver_reported_satisfaction,conv_rate,acc_rate,avg_daily_trips
0,1001,2021-04-12 10:59:42+00:00,1,0.709758,0.692957,402
1,1002,2021-04-12 08:12:10+00:00,5,0.718295,0.584081,370
2,1003,2021-04-12 16:40:26+00:00,3,0.697411,0.19768,25


### Feature view - on demand

In [28]:
# Entity join-key values: the drivers to look up.
driver_ids = [1001, 1002, 1003]
# One lookup timestamp per driver; "event_timestamp" is the reserved column name.
lookup_times = [
    datetime(2021, 4, 12, 10, 59, 42),
    datetime(2021, 4, 12, 8, 12, 10),
    datetime(2021, 4, 12, 16, 40, 26),
]
# Optional label column; Feast does not process it.
satisfaction_labels = [1, 5, 3]
# Request-time values used by the on-demand transformation.
first_addends = [1, 2, 3]
second_addends = [10, 20, 30]

entity_df = pd.DataFrame(
    {
        "driver_id": driver_ids,
        "event_timestamp": lookup_times,
        "label_driver_reported_satisfaction": satisfaction_labels,
        "val_to_add": first_addends,
        "val_to_add_2": second_addends,
    }
)

In [29]:
# Inspect the entity rows, including the val_to_add / val_to_add_2 input columns.
entity_df.head(5)

Unnamed: 0,driver_id,event_timestamp,label_driver_reported_satisfaction,val_to_add,val_to_add_2
0,1001,2021-04-12 10:59:42,1,1,10
1,1002,2021-04-12 08:12:10,5,2,20
2,1003,2021-04-12 16:40:26,3,3,30


In [30]:
# Feature references to fetch: the stored driver_hourly_stats columns plus the two
# outputs of the transformed_conv_rate on-demand view.
on_demand_feature_refs = [
    "driver_hourly_stats:conv_rate",
    "driver_hourly_stats:acc_rate",
    "driver_hourly_stats:avg_daily_trips",
    "transformed_conv_rate:conv_rate_plus_val1",
    "transformed_conv_rate:conv_rate_plus_val2",
]

# Run the historical retrieval for entity_df and materialise it as a DataFrame.
on_demand_job = store.get_historical_features(
    entity_df=entity_df,
    features=on_demand_feature_refs,
)
training_df = on_demand_job.to_df()

In [31]:
# Show the result: the on-demand columns sit alongside the stored features.
training_df

Unnamed: 0,driver_id,event_timestamp,label_driver_reported_satisfaction,val_to_add,val_to_add_2,conv_rate,acc_rate,avg_daily_trips,conv_rate_plus_val1,conv_rate_plus_val2
0,1001,2021-04-12 10:59:42+00:00,1,1,10,0.709758,0.692957,402,1.709758,10.709758
1,1002,2021-04-12 08:12:10+00:00,5,2,20,0.718295,0.584081,370,2.718295,20.718295
2,1003,2021-04-12 16:40:26+00:00,3,3,30,0.697411,0.19768,25,3.697411,30.697411


#### Мои примеры 

In [32]:

# Entity data for the feature request: one driver per row with its lookup timestamp
driver_ids = [1001, 1005]
lookup_times = [
    datetime(2024, 10, 17, 12, 7),
    datetime(2024, 10, 2, 11, 0),
]
entity_df = pd.DataFrame({"driver_id": driver_ids, "event_timestamp": lookup_times})

# Fetch historical features from the regular feature views and the on-demand view
driver_feature_refs = [
    "driver_performance:conv_rate",
    "driver_performance:acc_rate",
    "driver_activity:avg_daily_trips",
    "compute_efficiency_odfv:efficiency_index_odfv",
    "compute_efficiency_odfv:top_performer_flag_odfv",
]
driver_job = store.get_historical_features(
    entity_df=entity_df,
    features=driver_feature_refs,
)
training_df = driver_job.to_df()

print(training_df)

   driver_id           event_timestamp  conv_rate  acc_rate  avg_daily_trips  \
0       1005 2024-10-02 11:00:00+00:00   0.429879  0.194598              582   
1       1001 2024-10-17 12:07:00+00:00   0.295067  0.257097              456   

   efficiency_index_odfv  top_performer_flag_odfv  
0             136.933762                        1  
1              17.314418                        0  


### Мои примеры работы с новыми данными

In [15]:
# Point raw_data_path at the superstore sales parquet file and load it.
superstore_path_parts = ("feature_store", "feature_repo", "data", "superstore_sales_.parquet")
raw_data_path = os.path.join(*superstore_path_parts)
df = pd.read_parquet(raw_data_path)

In [16]:
# Entity rows: customer/order pairs, each looked up as of its order date.
order_dates = pd.to_datetime(["2017-11-08", "2017-06-12"])  # order date
entity_df = pd.DataFrame(
    {
        "customer_id": ["CG-12520", "DV-13045"],
        "order_id": ["CA-2017-152156", "CA-2017-138688"],
        "event_timestamp": order_dates,
    }
)

# Customer attributes, order attributes and the order_risk_features columns.
order_feature_refs = [
    "customer_details:customer_name",
    "customer_details:segment",
    "order_details:sales",
    "order_details:product_id",
    "order_risk_features:is_high_value_order",
    "order_risk_features:num_items_in_order",
]
order_job = store.get_historical_features(
    entity_df=entity_df,
    features=order_feature_refs,
)
features = order_job.to_df()

print(features)

  customer_id        order_id           event_timestamp    customer_name  \
0    DV-13045  CA-2017-138688 2017-06-12 00:00:00+00:00  Darrin Van Huff   
1    CG-12520  CA-2017-152156 2017-11-08 00:00:00+00:00      Claire Gute   

     segment   sales       product_id  is_high_value_order  num_items_in_order  
0  Corporate   14.62  OFF-LA-10000240                    0                   1  
1   Consumer  731.94  FUR-CH-10000454                    1                   1  


In [17]:
# Entity row: a single customer/order pair, looked up as of its order date.
entity_df = pd.DataFrame({
    "customer_id": ["BH-11710"],
    "order_id": ["CA-2015-115812"],
    "event_timestamp": pd.to_datetime(["2015-09-06"]),  # order date
})

# Offline (historical) retrieval of the customer and order attributes.
features_hist = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "customer_details:customer_name",
        "customer_details:segment",
        "order_details:product_id",
    ],
).to_df()

print("📜 Оффлайн фичи (исторические):")
print(features_hist)


📜 Оффлайн фичи (исторические):
  customer_id        order_id           event_timestamp    customer_name  \
0    BH-11710  CA-2015-115812 2015-09-06 00:00:00+00:00  Brosina Hoffman   

    segment       product_id  
0  Consumer  TEC-PH-10002033  
