## Parse the dataset, add primary key, remove negative distances, add timestamp 

In [1]:
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
import hopsworks
from datetime import datetime

# Load environment variables from .env file
load_dotenv()

# Get the API key from environment variables
HOPSWORKS_API_KEY = os.getenv('HOPSWORKS_API_KEY')

# Read and clean the data
credit_cards_df = pd.read_csv("Deviation_from_Policy_dataset.csv")

# Create a copy for cleaning
cleaned_df = credit_cards_df.copy()

# Add a transaction_id as primary key
cleaned_df['transaction_id'] = np.arange(len(cleaned_df))

# Replace negative distances with absolute values
cleaned_df['distance_from_home'] = cleaned_df['distance_from_home'].abs()
cleaned_df['distance_from_last_transaction'] = cleaned_df['distance_from_last_transaction'].abs()

# Add timestamp column (since we're dealing with a static dataset, we'll use current time)
cleaned_df['timestamp'] = datetime.now()

## Set up expectations for data validation

In [2]:
import great_expectations as ge
from great_expectations.core import ExpectationSuite, ExpectationConfiguration

ge_df = ge.from_pandas(cleaned_df)
expectation_suite = ge_df.get_expectation_suite()
expectation_suite.expectation_suite_name = "deviation_from_policy_suite"

# Add expectations
expectation_suite.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "distance_from_home",
            "min_value": 0.0,
        }
    )
)

expectation_suite.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "distance_from_last_transaction",
            "min_value": 0.0,
        }
    )
)

# Add binary column expectations
for column in ['repeat_retailer', 'used_chip', 'used_pin_number', 'online_order', 'fraud']:
    expectation_suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_distinct_values_to_be_in_set",
            kwargs={
                "column": column,
                "value_set": [0, 1],
            }
        )
    )



2024-11-01 17:58:32,683 INFO: 	0 expectation(s) included in expectation_suite.


## Create the feature group:

In [15]:
# Connect to Hopsworks
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/25749
Connected. Call `.close()` to terminate connection gracefully.


In [5]:
# Connect to Hopsworks
project = hopsworks.login(api_key_value="HOPSWORKS_API_KEY")
fs = project.get_feature_store()

# Create the feature group
deviation_fg = fs.get_or_create_feature_group(
    name="deviation_from_policy_fg",
    version=1,
    primary_key=['transaction_id'],
    description="Credit card transaction deviation from policy data",
    event_time='timestamp',
    expectation_suite=expectation_suite
)

# Insert the data
deviation_fg.insert(cleaned_df, wait=True)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/25749
Connected. Call `.close()` to terminate connection gracefully.
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/25749/fs/25669/fg/1328475
2024-11-01 18:01:03,287 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/25749/fs/25669/fg/1328475


Uploading Dataframe: 0.00% |          | Rows 0/1599999 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: deviation_from_policy_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/25749/jobs/named/deviation_from_policy_fg_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x311d90c40>,
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_values_to_be_between",
         "kwargs": {
           "column": "distance_from_home",
           "min_value": 0.0
         },
         "meta": {
           "expectationId": 658437
         }
       },
       "result": {
         "element_count": 1599999,
         "missing_count": 0,
         "missing_percent": 0.0,
         "unexpected_count": 0,
         "unexpected_percent": 0.0,
         "unexpected_percent_total": 0.0,
         "unexpected_percent_nonmissing": 0.0,
         "partial_unexpected_list": []
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2024-11-01T04:01:03.000287Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "su

## Create a feature view

In [8]:
# Get the transformation function
standard_scaler = fs.get_transformation_function(name="standard_scaler")

# Select features and create transformations
query = deviation_fg.select_all()

# Create feature view with transformations
feature_view = fs.get_or_create_feature_view(
    name="deviation_from_policy_fv",
    version=1,
    description="Normalized features for credit card fraud detection",
    query=query,
    labels=["fraud"],
    transformation_functions = {
        "distance_from_home": standard_scaler,
        "distance_from_last_transaction": standard_scaler,
        "ratio_to_median_purchase_price": standard_scaler,
    }

)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/25749/fs/25669/fv/deviation_from_policy_fv/version/1


In [9]:
# Get the training data from the feature view with a simple split
X_train, X_test, y_train, y_test = feature_view.train_test_split(0.8)

# Display the first few rows of training data
print("=== Training Data Shape ===")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print("\n=== First few rows of X_train ===")
print(X_train.head())
print("\n=== First few rows of y_train ===")
print(y_train.head())

# Optional: Display basic statistics of training features
print("\n=== Training Data Statistics ===")
print(X_train.describe())

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (15.03s) 

=== Training Data Shape ===
X_train shape: (319999, 9)
y_train shape: (319999, 1)

=== First few rows of X_train ===
    distance_from_home  distance_from_last_transaction  \
9             -0.08736                       -0.225981   
16            -0.08736                       -0.225981   
24            -0.08736                       -0.225981   
25            -0.08736                       -0.225981   
31            -0.08736                       -0.225981   

    ratio_to_median_purchase_price  repeat_retailer  used_chip  \
9                        -0.156931                1          0   
16                        3.785138                1          0   
24                       -0.156931                1          0   
25                       -0.156931                1          1   
31                       -0.156931                1          0   

    used_pin_number  online_order  transaction_id