## <span style='color:#ff5f27'> 📝 Imports </span>

In [None]:
import pandas as pd
import numpy as np

# Mute warnings
import warnings
warnings.filterwarnings("ignore")

## <span style="color:#ff5f27;"> 💽 Data Loading</span>

In this case, you are predicting the waiting time for a deceased donor kidney transplant involves estimating the duration a patient might need to wait from the time they are registered on the transplant list until a suitable donor kidney becomes available for transplantation.

In [None]:
patient_demographics_data = pd.read_csv(
    'https://repo.hops.works/dev/davit/hospital_wait_time/patient_demographics.csv', 
    parse_dates=['date'],
)
patient_demographics_data.head(3)

In [None]:
medical_background_data = pd.read_csv(
    'https://repo.hops.works/dev/davit/hospital_wait_time/medical_background.csv', 
    parse_dates=['date'],
)
medical_background_data.head(3)

In [None]:
transplant_compatibility_data = pd.read_csv(
    'https://repo.hops.works/dev/davit/hospital_wait_time/transplant_compatibility.csv', 
    parse_dates=['date'],
)
transplant_compatibility_data.columns = transplant_compatibility_data.columns.str.lower()
transplant_compatibility_data.head(3)

## <span style="color:#ff5f27;"> 👨🏻‍🍳 Data Preparation</span>


In [None]:
patient_demographics_data.isna().sum()[patient_demographics_data.isna().sum() > 0] / len(patient_demographics_data)*100

In [None]:
medical_background_data.isna().sum()[medical_background_data.isna().sum() > 0] / len(medical_background_data)*100

In [None]:
transplant_compatibility_data.isna().sum()[transplant_compatibility_data.isna().sum() > 0] / len(transplant_compatibility_data)*100

In [None]:
medical_background_data['dialysis_duration'] = medical_background_data['dialysis_duration'].fillna(1).replace(0, 1)
medical_background_data['dialysis_duration'] = np.log(medical_background_data['dialysis_duration'] + 1)

In [None]:
def remove_outliers_iqr(dataframe, iqr_multiplier=1.5):
    # Select numerical columns for outlier removal
    numerical_columns = dataframe.select_dtypes(
        include=['int64', 'float64']).columns

    # Loop through numerical columns to identify and remove outliers using IQR
    for column in numerical_columns:
        Q1 = dataframe[column].quantile(0.25)
        Q3 = dataframe[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - iqr_multiplier * IQR
        upper_bound = Q3 + iqr_multiplier * IQR

        outliers = dataframe[(dataframe[column] < lower_bound) | (
            dataframe[column] > upper_bound)]

        # Remove outliers
        dataframe = dataframe[~dataframe.index.isin(outliers.index)]

    return dataframe

In [None]:
patient_demographics_data_filtered = remove_outliers_iqr(patient_demographics_data, iqr_multiplier=1.5)
print(f'⛳️ Original shape: {patient_demographics_data.shape}')
print(f'⛳️ Cleared shape: {patient_demographics_data_filtered.shape}')

In [None]:
medical_background_data_filtered = remove_outliers_iqr(medical_background_data, iqr_multiplier=1.5)
print(f'⛳️ Original shape: {medical_background_data.shape}')
print(f'⛳️ Cleared shape: {medical_background_data_filtered.shape}')

In [None]:
transplant_compatibility_data_filtered = remove_outliers_iqr(transplant_compatibility_data, iqr_multiplier=1.5)
print(f'⛳️ Original shape: {transplant_compatibility_data.shape}')
print(f'⛳️ Cleared shape: {transplant_compatibility_data_filtered.shape}')

## <span style="color:#ff5f27;">👮🏻‍♂️ Great Expectations </span>

In [None]:
import great_expectations as ge
from great_expectations.core import ExpectationSuite, ExpectationConfiguration

In [None]:
# Convert your DataFrame to a Great Expectations DataFrame
ge_df_patient_demographics = ge.from_pandas(patient_demographics_data_filtered)

# Retrieve the expectation suite associated with the ge DataFrame
expectation_suite_patient_demographics = ge_df_patient_demographics.get_expectation_suite()

# Set the expectation suite name
expectation_suite_patient_demographics.expectation_suite_name = "patient_registration_suite"

# Expectation: 'id' should always be unique and not null
expectation_suite_patient_demographics.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_unique",
        kwargs={"column": "id"},
    )
)
expectation_suite_patient_demographics.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "id"},
    )
)

# Expectation: 'date' should be a valid date and not null
expectation_suite_patient_demographics.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_of_type",
        kwargs={
            "column": "date",
            "type_": "datetime64[ns]",
        }
    )
)
expectation_suite_patient_demographics.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "date"},
    )
)

# Expectation: 'age_at_list_registration' to be non-negative
expectation_suite_patient_demographics.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "age_at_list_registration",
            "min_value": 0,
            "max_value": None,
        }
    )
)

# Expectation: 'gender' to be within expected values
expectation_suite_patient_demographics.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "gender",
            "value_set": ["M", "F"],
        }
    )
)

# Expectation: 'age_cat' to contain expected categories
expectation_suite_patient_demographics.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "age_cat",
            "value_set": ["Over60", "From18to60", "Below18"],
        }
    )
)

print("✅ Expectations defined and saved successfully.")

In [None]:
ge_df_medical_background = ge.from_pandas(medical_background_data_filtered)

# Retrieve and set the expectation suite
expectation_suite_medical_background = ge_df_medical_background.get_expectation_suite()
expectation_suite_medical_background.expectation_suite_name = "medical_background_suite"

# Expectations for 'id' and 'date'
expectation_suite_medical_background.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_unique",
        kwargs={"column": "id"},
    )
)
expectation_suite_medical_background.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "id"},
    )
)
expectation_suite_medical_background.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_of_type",
        kwargs={
            "column": "date",
            "type_": "datetime64[ns]",
        }
    )
)
expectation_suite_medical_background.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "date"},
    )
)

# Expectation for 'dialysis_duration'
expectation_suite_medical_background.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "dialysis_duration",
            "min_value": 0,
            "max_value": None,
        }
    )
)

# Expectation for 'blood_gp'
expectation_suite_medical_background.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "blood_gp",
            "value_set": ["A", "B", "AB", "O"],
        }
    )
)

# Gestation and Prior Transplant Expectations
for column in ["gestation", "prior_transplant"]:
    expectation_suite_medical_background.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": column,
                "value_set": ["YES", "NO"],
            }
        )
    )

# Expectation for 'number_prior_transplant' - check alignment with 'prior_transplant'
expectation_suite_medical_background.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "number_prior_transplant",
            "min_value": 0,
            "max_value": None,
        }
    )
)

print("✅ Expectations defined and saved successfully.")

In [None]:
ge_df_transplant_compatibility = ge.from_pandas(transplant_compatibility_data_filtered)

# Retrieve and set the expectation suite
expectation_suite_transplant_compatibility = ge_df_transplant_compatibility.get_expectation_suite()
expectation_suite_transplant_compatibility.expectation_suite_name = "transplant_compatibility_and_outcome_suite"

# Expectations for 'id' and 'date'
expectation_suite_transplant_compatibility.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_unique",
        kwargs={"column": "id"},
    )
)
expectation_suite_transplant_compatibility.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "id"},
    )
)
expectation_suite_transplant_compatibility.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_of_type",
        kwargs={
            "column": "date",
            "type_": "datetime64[ns]",
        }
    )
)

# Expectation for 'cPRA' to be between 0 and 100
expectation_suite_transplant_compatibility.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "cpra",
            "min_value": 0,
            "max_value": 100,
        }
    )
)

# HLA Marker Expectations (checking they are non-negative integers)
for hla_marker in ["hla_a1", "hla_a2", "hla_b1", "hla_b2", "hla_dr1", "hla_dr2"]:
    expectation_suite_transplant_compatibility.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_of_type",
            kwargs={
                "column": hla_marker,
                "type_": "int",
            }
        )
    )

# Expectation for 'if_transplanted'
expectation_suite_transplant_compatibility.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "if_transplanted",
            "value_set": ["YES", "NO"],
        }
    )
)

# Expectation for 'duration' to be non-negative
expectation_suite_transplant_compatibility.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "duration",
            "min_value": 0,
            "max_value": None,
        }
    )
)

print("✅ Expectations defined and saved successfully.")

## <span style="color:#ff5f27;"> 📡 Connecting to Hopsworks Feature Store </span>

In [None]:
import hopsworks

project = hopsworks.login()

fs = project.get_feature_store()

## <span style="color:#ff5f27;"> 🪄 Creating Feature Groups </span>


In [None]:
# Get or create the 'patient_info' feature group
patient_info_fg = fs.get_or_create_feature_group(
    name="patient_info",
    version=1,
    description="Demographic Features",
    primary_key=["id"],
    event_time="date",
    expectation_suite=expectation_suite_patient_demographics,
)

patient_info_fg.insert(patient_demographics_data_filtered)
print('✅ Done')

In [None]:
# Get or create the 'medical_info' feature group
medical_info_fg = fs.get_or_create_feature_group(
    name="medical_info",
    version=1,
    description="Medical background features",
    primary_key=["id"],
    event_time="date",
    expectation_suite=expectation_suite_medical_background,
)

medical_info_fg.insert(medical_background_data_filtered)
print('✅ Done')

In [None]:
# Get or create the 'transplant_compatibility' feature group
transplant_compatibility_fg = fs.get_or_create_feature_group(
    name="transplant_compatibility",
    version=1,
    description="Transplant compatibility features",
    primary_key=["id"],
    event_time="date",
    expectation_suite=expectation_suite_transplant_compatibility,
)

transplant_compatibility_fg.insert(transplant_compatibility_data_filtered)
print('✅ Done')

---