In [1]:
import pandas as pd

# Data loading and simple exploration

In [2]:
# Read the preprocessed dataset from the project's primary data folder and preview the first rows.
preprocessed_path = '../data/03_primary/preprocessed_data.csv'
data = pd.read_csv(preprocessed_path)
data.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,12.687766,-0.404316,-0.342098,-3.588867,-1.53813,-0.246931,-0.956828,0.771315,0.656323,-0.094378,...,1.863204,0.683511,-0.116899,-1.111424,-1.276077,0.488605,-0.062315,-0.179958,0.133955,0
1,-0.213233,0.821285,0.667714,3.041502,-5.845112,5.967587,0.213863,-1.462923,-2.688761,0.677764,...,0.558425,0.32976,-0.941383,-0.006075,-0.958925,0.239298,-0.067356,0.821048,0.426175,1
2,-0.022078,-0.407195,1.096441,0.247292,1.449417,2.652978,-0.773799,0.046267,-0.483269,0.126451,...,-0.112556,-0.033605,-0.050347,0.033304,0.383234,0.2633,-0.051038,0.034167,0.039962,0
3,9.8639,0.637343,-2.356348,1.74636,-6.374624,1.772205,-3.439294,1.457811,-0.362577,1.443791,...,0.19481,0.857942,0.621203,0.964817,-0.619437,-1.732613,0.108361,1.130828,0.415703,1
4,-0.29344,-0.902278,-1.585505,3.261585,-4.137422,2.357096,-1.405043,-1.879437,-3.513687,1.515607,...,0.315957,0.501543,-0.546869,-0.076584,-0.42555,0.123644,0.321985,0.264028,0.132817,1


The data is already preprocessed and it looks like there is no room for further feature engineering.<br>
<br>
Therefore, the only thing left to do is to push all the columns to our feature store.

# Hopsworks 

## Upload

In [7]:
from dotenv import load_dotenv
import os

In [8]:
# Load environment variables before they are read through os.environ further down
# (presumably FS_PROJECT_NAME / FS_API_KEY from a local .env file — TODO confirm).
load_dotenv()

True

In [11]:
import hopsworks
from great_expectations.core import ExpectationSuite

In [107]:
from keyword import iskeyword

In [112]:
# Sanity check: 'class' is a reserved word, so the upload helper appends '_' to it;
# confirm that the suffixed name is no longer reported as a Python keyword.
iskeyword('class_')

False

In [113]:
def _sanitize_column_name(label) -> str:
    '''Return the lowercase form of a column label, suffixed with "_" if it is a Python keyword.'''
    # str() lets non-string labels (e.g. integer column names) go through the same path.
    name = str(label).lower()
    return name + '_' if iskeyword(name) else name


def _prepare_feature_frame(df) -> pd.DataFrame:
    '''Build a new DataFrame with an 'index' key column and sanitized column names.

    The caller's object is never modified: every step below returns a new frame.
    '''
    if isinstance(df, pd.Series):
        if df.name is None:
            raise ValueError('Expect a named pd.Series, the name is used as the feature column name')
        frame = df.to_frame()
    elif isinstance(df, pd.DataFrame):
        frame = df
    else:
        raise TypeError(f'Expect pd.DataFrame or pd.Series, got {type(df)}')

    # Create the primary key used for later joins.
    if 'index' not in frame.columns:
        # A named index would otherwise be materialized under its own name and the
        # 'index' primary key column would be missing from the upload.
        if frame.index.nlevels == 1:
            frame = frame.rename_axis('index')
        frame = frame.reset_index()

    # Hopsworks only accepts lowercase column names, better to sanitize beforehand.
    # Also, try to protect from Python's reserved words.
    # set_axis returns a new frame, so the caller's column labels stay untouched.
    return frame.set_axis(
        [_sanitize_column_name(label) for label in frame.columns],
        axis='columns',
    )


def to_feature_store(
    df: pd.DataFrame,
    group_name: str,
    feature_group_version: int,
    description: str,
    group_description: dict,
    validation_expectation_suite: ExpectationSuite = None
):
    '''
    Upload a pandas DataFrame (or named Series) to a Hopsworks feature group.

    The data gets an 'index' column to act as primary key (taken from the pandas
    index when the column is not already present), column names are lowercased and
    Python keywords get a trailing underscore (e.g. 'Class' -> 'class_'). The input
    object itself is left unmodified.

    Credentials are read from the FS_PROJECT_NAME and FS_API_KEY environment variables.

    Args:
        - df (pd.DataFrame | pd.Series): Data to be stored.
        - group_name (str): Name of the feature group.
        - feature_group_version (int): Version of the feature group.
        - description (str): Description for the feature group.
        - group_description (iterable of dict): One dict per feature, each with at least
          the keys 'name' and 'description'.
        - validation_expectation_suite (ExpectationSuite): group of expectations to check data.
          Not supported yet: passing one raises NotImplementedError.

    Returns:
        - The feature group object the data was inserted into.

    Raises:
        - NotImplementedError: if a validation expectation suite is passed.
        - TypeError: if df is neither a pd.DataFrame nor a pd.Series.
        - ValueError: if df is a pd.Series without a name.
    '''
    ##### DELETE THIS BIT AFTER GX IS IMPLEMENTED #####
    if validation_expectation_suite:
        raise NotImplementedError
    ###################################################

    features_frame = _prepare_feature_frame(df)

    # Get credentials
    project_name = os.environ.get('FS_PROJECT_NAME')
    api_key = os.environ.get('FS_API_KEY')

    # Connect to feature store.
    project = hopsworks.login(
        api_key_value=api_key, project=project_name
    )
    feature_store = project.get_feature_store()

    # Create feature group.
    object_feature_group = feature_store.get_or_create_feature_group(
        name=group_name,
        version=feature_group_version,
        primary_key=['index'],
        description=description,
        online_enabled=False,
        expectation_suite=validation_expectation_suite
    )

    # Upload data.
    object_feature_group.insert(
        features=features_frame,
        overwrite=False,
        write_options={
            "wait_for_job": True,
        },
    )

    # Add feature descriptions.
    # The loop variable is deliberately not called `description`, which would shadow the parameter.
    for feature in group_description:
        object_feature_group.update_feature_description(
            feature["name"], feature["description"]
        )

    # Update statistics.
    object_feature_group.statistics_config = {
        "enabled": True,
        "histograms": True,
        "correlations": True,
    }
    object_feature_group.update_statistics_config()
    object_feature_group.compute_statistics()

    return object_feature_group

### Test run

In [18]:
# Inspect the column labels of the loaded frame before writing the feature descriptions.
data.columns

Index(['scaled_amount', 'scaled_time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
       'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16',
       'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26',
       'V27', 'V28', 'Class'],
      dtype='object')

### Feature descriptions

In [82]:
# Metadata for every uploaded feature column: the 28 anonymized components first,
# then the two scaled columns and the primary key.
anonymized_features = [
    dict(
        name=f'v{i}',
        description='Anonymized credit card data',
        validation_rules='TO DETERMINE',
    )
    for i in range(1, 29)
]
named_features = [
    dict(
        name='scaled_amount',
        description='Scaled amount of transaction',
        validation_rules='TO DETERMINE',
    ),
    dict(
        name='scaled_time',
        description='Scaled amount of time, relative to first transaction observation',
        validation_rules='TO DETERMINE',
    ),
    dict(
        name='index',
        description='Index of the observations',
        validation_rules='Positive integer, unique',
    ),
]
feature_descriptions = anonymized_features + named_features

feature_descriptions

[{'name': 'v1',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v2',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v3',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v4',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v5',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v6',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v7',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v8',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v9',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v10',
  'description': 'Anonymized credit card data',
  'valid

In [115]:
# Metadata for the target feature group: the primary key plus the sanitized 'class_' column.
class_description = [
    dict(
        name='index',
        description='Index of the observations',
        validation_rules='Positive integer, unique',
    ),
    dict(
        name='class_',
        description='Predicted class of the observation. 1 for fraud, 0 otherwise',
        validation_rules='0 or 1',
    ),
]

In [84]:
# Separate the target column from the remaining feature columns.
target_column = 'Class'
y = data[target_column]
X = data.drop(columns=[target_column])

In [85]:
# Upload the feature columns as version 1 of the 'features' group.
to_feature_store(
    df=X,
    group_name='features',
    feature_group_version=1,
    description='Test run of features',
    group_description=feature_descriptions,
)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/824468
Connected. Call `.close()` to terminate connection gracefully.


Uploading Dataframe: 0.00% |          | Rows 0/946 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/824468/jobs/named/features_1_offline_fg_materialization/executions


<hsfs.feature_group.FeatureGroup at 0x7b1253e48890>

In [116]:
# Upload the target column as version 5 of the 'target' group.
to_feature_store(
    df=y,
    group_name='target',
    feature_group_version=5,
    description='Test run of target',
    group_description=class_description,
)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/824468
Connected. Call `.close()` to terminate connection gracefully.


Uploading Dataframe: 0.00% |          | Rows 0/946 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: target_5_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/824468/jobs/named/target_5_offline_fg_materialization/executions
Statistics Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/824468/jobs/named/target_5_compute_stats_22062024193250/executions


<hsfs.feature_group.FeatureGroup at 0x7b1253e067d0>

## Download

In [67]:
def get_features(
    group_name: str,
    version: int
):
    '''
    Read a feature group from the Hopsworks feature store into a pd.DataFrame.

    Credentials are read from the FS_PROJECT_NAME and FS_API_KEY environment variables.

    Args:
        - group_name (str): Name of the feature group.
        - version (int): Version number of the feature group.

    Returns:
        - A pd.DataFrame with the features of the requested group.
    '''
    # Log in with the credentials found in the environment.
    project = hopsworks.login(
        api_key_value=os.environ.get('FS_API_KEY'),
        project=os.environ.get('FS_PROJECT_NAME'),
    )

    # Look up the requested group/version and read its content.
    feature_group = project.get_feature_store().get_feature_group(
        name=group_name, version=version
    )
    return feature_group.read()

In [68]:
# Read back version 1 of the 'features' group to check the upload.
get_features(group_name='features', version=1)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/824468
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.15s) 


Unnamed: 0,index,scaled_amount,scaled_time,v1,v2,v3,v4,v5,v6,v7,...,v19,v20,v21,v22,v23,v24,v25,v26,v27,v28
0,866,0.258506,0.738225,2.152649,-0.779229,-1.648625,-0.752541,-0.126452,-0.373368,-0.345783,...,-0.352941,-0.492878,-0.131528,0.107782,-0.039794,-1.001198,0.025282,0.835312,-0.072340,-0.078686
1,848,-0.293440,-0.401191,1.107552,0.598386,-0.371871,1.133997,0.540603,0.029619,0.072154,...,-1.061886,-0.132723,0.043342,0.245496,-0.079157,-0.412484,0.523173,-0.286496,0.066487,0.041147
2,21,-0.290924,0.046758,-3.859881,2.632881,-5.264265,3.446113,-0.675231,-1.904959,-3.291041,...,2.993174,-0.178626,1.664119,0.785075,0.068412,0.778961,-0.863166,-0.006810,-1.065734,1.773326
3,762,4.752323,0.804180,1.665951,-2.351415,-1.623675,-1.707280,-0.894869,0.197310,-0.579855,...,0.612017,0.387020,-0.426844,-1.593740,0.072421,-1.503793,-0.620243,-0.656890,-0.048224,-0.000580
4,294,-0.293440,-0.890588,-2.535852,5.793644,-7.618463,6.395830,-0.065210,-3.136372,-3.104557,...,-1.385558,0.408704,0.716720,-0.448060,-0.402407,-0.288835,1.011752,0.425965,0.413140,0.308205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941,707,-0.279746,-0.541994,-0.832170,1.060013,1.153785,-0.139907,-0.324930,-0.205704,0.021504,...,-0.044492,-0.171464,-0.131792,-0.549120,0.071821,-0.059137,-0.343535,0.093132,0.139462,0.040788
942,239,-0.135401,0.945030,-7.503926,-0.360628,-3.830952,2.486103,2.497367,1.332437,-6.783964,...,2.548313,-0.475126,-6.389132,2.249964,1.670508,0.140450,0.162147,1.207731,1.268958,0.097538
943,171,1.201705,0.619685,-1.715387,-1.385403,1.944293,-1.823432,0.725480,-1.495315,0.022411,...,-0.446665,0.425611,0.264360,0.292934,-0.008440,-0.025988,0.546103,0.721269,-0.287939,-0.150978
944,874,-0.293440,-0.516336,-2.377533,0.520539,-8.094139,8.005351,2.640750,-3.381586,-1.934372,...,0.092973,-0.634747,0.148284,0.721100,2.661291,-0.508620,-0.401657,0.587611,0.500326,0.551760


In [123]:
# Read back version 5 of the 'target' group to check the upload.
get_features(group_name='target', version=5)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/824468
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.74s) 


Unnamed: 0,index,class_
0,866,0
1,848,0
2,21,1
3,762,0
4,294,1
...,...,...
941,707,0
942,239,1
943,171,0
944,874,1
