In [1]:
import pandas as pd

# Data loading and simple exploration

In [2]:
data = pd.read_csv('../data/03_primary/preprocessed_data.csv')
data.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,12.687766,-0.404316,-0.342098,-3.588867,-1.53813,-0.246931,-0.956828,0.771315,0.656323,-0.094378,...,1.863204,0.683511,-0.116899,-1.111424,-1.276077,0.488605,-0.062315,-0.179958,0.133955,0
1,-0.213233,0.821285,0.667714,3.041502,-5.845112,5.967587,0.213863,-1.462923,-2.688761,0.677764,...,0.558425,0.32976,-0.941383,-0.006075,-0.958925,0.239298,-0.067356,0.821048,0.426175,1
2,-0.022078,-0.407195,1.096441,0.247292,1.449417,2.652978,-0.773799,0.046267,-0.483269,0.126451,...,-0.112556,-0.033605,-0.050347,0.033304,0.383234,0.2633,-0.051038,0.034167,0.039962,0
3,9.8639,0.637343,-2.356348,1.74636,-6.374624,1.772205,-3.439294,1.457811,-0.362577,1.443791,...,0.19481,0.857942,0.621203,0.964817,-0.619437,-1.732613,0.108361,1.130828,0.415703,1
4,-0.29344,-0.902278,-1.585505,3.261585,-4.137422,2.357096,-1.405043,-1.879437,-3.513687,1.515607,...,0.315957,0.501543,-0.546869,-0.076584,-0.42555,0.123644,0.321985,0.264028,0.132817,1


The data is already preprocessed and it looks like there is no room for further feature engineering.<br>
<br>
That way, the only thing left to do is to push all the columns to our feature store.

# Hopsworks 

In [7]:
from dotenv import load_dotenv
import os

In [8]:
load_dotenv()

True

In [11]:
import hopsworks
from great_expectations.core import ExpectationSuite

In [59]:
def to_feature_store(
    df: pd.DataFrame,
    group_name: str,
    feature_group_version: int,
    description: str,
    group_description: dict,
    validation_expectation_suite: ExpectationSuite = None
):
    '''
    This function takes in a pandas DataFrame and a validation expectation suite,
      performs validation on the data using the suite, and then saves the data to a
      feature store in the feature store.

    Args:
        - data (pd.DataFrame): Dataframe with the data to be stored
        - group_name (str): Name of the feature group.
        - feature_group_version (int): Version of the feature group.
        - description (str): Description for the feature group.
        - group_description (dict): Description of each feature of the feature group. 
        - validation_expectation_suite (ExpectationSuite): group of expectations to check data.
        
    Returns:
        - A dictionary with the feature view version, feature view name and training dataset feature version.
    '''
    ##### DELETE THIS BIT AFTER GX IS IMPLEMENTED #####
    if validation_expectation_suite:
        raise NotImplementedError
    ###################################################
    
    if not isinstance(df, pd.DataFrame) and not isinstance(df, pd.Series):
        raise TypeError(f'Expect pd.DataFrame, got {type(df)}')
    if isinstance(df, pd.Series):
        df = pd.DataFrame(df)
    
    if 'index' not in df.columns:
        df = df.reset_index()

    # Get credentials
    project_name = os.environ.get('FS_PROJECT_NAME')
    api_key = os.environ.get('FS_API_KEY')
    
    # Connect to feature store.
    project = hopsworks.login(
        api_key_value=api_key, project=project_name
    )
    feature_store = project.get_feature_store()

    # Create feature group.
    object_feature_group = feature_store.get_or_create_feature_group(
        name=group_name,
        version=feature_group_version,
        primary_key=['index'],
        description= description,
        online_enabled=False,
        expectation_suite=validation_expectation_suite
    )
    
    # Upload data.
    object_feature_group.insert(
        features=df,
        overwrite=False,
        write_options={
            "wait_for_job": True,
        },
    )

    # Add feature descriptions.
    for description in group_description:
        object_feature_group.update_feature_description(
            description["name"], description["description"]
        )

    # Update statistics.
    object_feature_group.statistics_config = {
        "enabled": True,
        "histograms": True,
        "correlations": True,
    }
    object_feature_group.update_statistics_config()
    object_feature_group.compute_statistics()

    return object_feature_group

## Test run

In [18]:
data.columns

Index(['scaled_amount', 'scaled_time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
       'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16',
       'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26',
       'V27', 'V28', 'Class'],
      dtype='object')

### Feature descriptions

In [49]:
feature_descriptions = [{'name': f'v{i}', 'description': 'Anonymized credit card data', 'validation_rules': 'TO DETERMINE'} for i in range(1, 29)] 
feature_descriptions += [
    {'name': 'scaled_amount', 'description': 'Scaled amount of transaction', 'validation_rules': 'TO DETERMINE'},
    {'name': 'scaled_time', 'description': 'Scaled amount of time, relative to first transaction observation', 'validation_rules': 'TO DETERMINE'},
    {'name': 'index', 'description': 'Index of the observations', 'validation_rules': 'Positive integer, unique'},
]

feature_descriptions

[{'name': 'v1',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v2',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v3',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v4',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v5',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v6',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v7',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v8',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v9',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v10',
  'description': 'Anonymized credit card data',
  'valid

In [53]:
class_description = [
    {'name': 'index', 'description': 'Index of the observations', 'validation_rules': 'Positive integer, unique'},
    {'name': 'class', 'description': 'Predicted class of the observation. 1 for fraud, 0 otherwise', 'validation_rules': '0 or 1'}
]

In [51]:
X = data.drop(columns=['Class'])
y = data['Class']

In [52]:
to_feature_store(
    df=X, group_name='features',
    feature_group_version=1, description='Test run of features',
    group_description=feature_descriptions
)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/824468
Connected. Call `.close()` to terminate connection gracefully.



Uploading Dataframe: 0.00% |          | Rows 0/946 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/824468/jobs/named/features_1_offline_fg_materialization/executions
Statistics Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/824468/jobs/named/features_1_compute_stats_22062024173025/executions


<hsfs.feature_group.FeatureGroup at 0x7b1268517c90>

In [58]:
to_feature_store(
    df=y, group_name='target',
    feature_group_version=1, description='Test run of target',
    group_description=class_description
)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/824468
Connected. Call `.close()` to terminate connection gracefully.

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/824468/fs/820291/fg/931170


Uploading Dataframe: 0.00% |          | Rows 0/946 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: target_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/824468/jobs/named/target_1_offline_fg_materialization/executions
Statistics Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/824468/jobs/named/target_1_compute_stats_22062024174523/executions


<hsfs.feature_group.FeatureGroup at 0x7b12685685d0>