In [1]:
import pandas as pd

def load_perturbed_data(file_path):
    """Read the perturbed dataset stored as CSV and hand it back as a DataFrame.

    Args:
        file_path (str): Location of the CSV file holding the perturbed samples.

    Returns:
        pd.DataFrame: The parsed contents of the file.
    """
    perturbed_frame = pd.read_csv(file_path)
    return perturbed_frame

# Example: read the perturbed samples from the CSV file on disk
file_path = "../data/perturbed_data_point_42.csv"
perturbed_df = load_perturbed_data(file_path=file_path)

In [2]:
import numpy as np

def calculate_similarity(original_point, perturbed_data, kernel_width=1.0):
    """
    Calculate similarity weights for each perturbed data point based on the original data point.

    Each weight is a Gaussian kernel of the Euclidean distance ``d`` between a
    perturbed row and the original point: ``exp(-(d / kernel_width) ** 2)``.

    Args:
        original_point (pd.Series): The original data point.
        perturbed_data (pd.DataFrame): The perturbed dataset. Values are compared
            with ``original_point`` by position, not by label, so the columns
            must be in the same order as the entries of ``original_point``.
        kernel_width (float): Positive scale of the kernel. The default of 1.0
            reproduces the plain ``exp(-d ** 2)``. When features are not scaled
            the distances are large and a width of 1.0 pushes most weights
            toward zero; pass a larger width to keep them distinguishable.

    Returns:
        np.ndarray: Array of weights for each perturbed data point (one per row).

    Raises:
        ValueError: If ``kernel_width`` is not a positive number.
    """
    # `not (x > 0)` also rejects NaN, which would otherwise poison every weight.
    if not kernel_width > 0:
        raise ValueError(f"kernel_width must be positive, got {kernel_width!r}")

    # Row-wise Euclidean distance to the original point (positional broadcast).
    distances = np.linalg.norm(perturbed_data.values - original_point.values, axis=1)
    # Convert distances to weights (higher similarity = higher weight).
    weights = np.exp(-(distances / kernel_width) ** 2)
    return weights

In [3]:
def add_weights_to_perturbed_data(original_point, perturbed_data):
    """
    Add similarity weights to the perturbed dataset.

    The weights produced by ``calculate_similarity`` are normalized to sum to 1
    and attached as a 'Weight' column on a copy of ``perturbed_data``. The input
    frame is not modified, so the call can be repeated on the same frame without
    the 'Weight' column leaking into the next distance computation.

    Args:
        original_point (pd.Series): The original data point.
        perturbed_data (pd.DataFrame): The perturbed dataset.

    Returns:
        pd.DataFrame: Copy of the perturbed dataset with an added 'Weight' column.

    Raises:
        ValueError: If the weights cannot be normalized because their sum is
            zero or not finite (for example when every weight underflows to 0
            or the inputs contain NaN).
    """
    # Compute similarity weights
    weights = calculate_similarity(original_point, perturbed_data)

    # Normalize weights to sum to 1; an empty frame has nothing to normalize.
    if weights.size > 0:
        total = weights.sum()
        if not (np.isfinite(total) and total > 0):
            raise ValueError(
                "Cannot normalize similarity weights: their sum is "
                f"{total!r}. Check the inputs for NaN values or distances "
                "so large that every weight underflows to zero."
            )
        weights = weights / total

    # Attach the weights on a new frame instead of mutating the caller's data.
    return perturbed_data.assign(Weight=weights)

In [4]:
# Reload the perturbed samples from disk.
perturbed_df = load_perturbed_data("../data/perturbed_data_point_42.csv")

# Load the original dataset, drop the 'SEQN' and 'Unnamed: 0' columns, and
# replace the coded column names with descriptive ones. Built as a single chain
# so that re-running the cell always starts again from the file on disk.
data = (
    pd.read_csv("../data/data_class.csv")
    .drop(columns=['SEQN', 'Unnamed: 0'])
    .rename(columns={
        'RIAGENDR': 'Gender',
        'RIDAGEYR': 'Age',
        'DMDHHSIZ': 'Household Size',
        'INDFMPIR': 'Income Poverty Ratio',
        'BMXBMI': 'Body Mass Index',
        'DSD010': 'Diet Question One',
        'DSD010AN': 'Diet Question Alternate',
        'SMD415': 'Smoking Status',
        'PAD590': 'Physical Activity One',
        'PAD600': 'Physical Activity Two',
        'HUQ010': 'Health Status',
        'restaurant': 'Restaurant Visits',
        'protein': 'Protein Intake',
        'healthy': 'Healthy Food Intake',
        'unhealthy': 'Unhealthy Food Intake',
        'beverage': 'Beverage Consumption',
        'milk': 'Milk Consumption',
        'MCQ010': 'Medical Condition One',
        'MCQ053': 'Medical Condition Two',
        'MCQ092': 'Medical Condition Three',
        'MCQ140': 'Medical Condition Four',
        'active': 'Physical Activity Status'
    })
)

# Reference row that the perturbed samples are compared against.
# NOTE(review): the perturbed file is named "point_42" but the first row
# (position 0) is selected here - confirm this is the intended data point.
original_point = data.iloc[0]

In [5]:
# Coerce the perturbed frame (column by column) and the original point
# (value by value) to numeric types; anything unparseable becomes NaN
perturbed_df = perturbed_df.apply(lambda column: pd.to_numeric(column, errors='coerce'))
original_point = original_point.apply(lambda value: pd.to_numeric(value, errors='coerce'))

In [6]:
# Show the dtype of every column in the original dataset
data.dtypes

Gender                      float64
Age                         float64
Household Size              float64
Income Poverty Ratio        float64
Body Mass Index             float64
Diet Question One           float64
Diet Question Alternate     float64
Smoking Status              float64
Physical Activity One       float64
Physical Activity Two       float64
Health Status               float64
Restaurant Visits           float64
Protein Intake              float64
Healthy Food Intake         float64
Unhealthy Food Intake       float64
Beverage Consumption        float64
Milk Consumption            float64
Medical Condition One       float64
Medical Condition Two       float64
Medical Condition Three     float64
Medical Condition Four      float64
Physical Activity Status      int64
dtype: object

In [7]:
# Show the dtype of every column in the perturbed dataset for comparison
perturbed_df.dtypes

Gender                        int64
Age                           int64
Household Size                int64
Income Poverty Ratio          int64
Body Mass Index               int64
Diet Question One           float64
Diet Question Alternate     float64
Smoking Status                int64
Physical Activity One         int64
Physical Activity Two         int64
Health Status                 int64
Restaurant Visits             int64
Protein Intake                int64
Healthy Food Intake           int64
Unhealthy Food Intake         int64
Beverage Consumption          int64
Milk Consumption              int64
Medical Condition One       float64
Medical Condition Two         int64
Medical Condition Three       int64
Medical Condition Four      float64
Physical Activity Status      int64
dtype: object

In [9]:
# Select the perturbed columns in the same order as the original point's
# entries (the distance computation compares values by position). The explicit
# copy gives an independent frame, so adding a column to it later cannot
# trigger pandas' SettingWithCopyWarning when this cell is run again.
perturbed_df = perturbed_df.loc[:, original_point.index].copy()

# Add weights to the perturbed dataset
weighted_perturbed_df = add_weights_to_perturbed_data(original_point, perturbed_df)

# Save the weighted dataset (optional)
weighted_perturbed_df.to_csv("../data/weighted_perturbed_data_point_42.csv", index=False)

# Display the weighted DataFrame
weighted_perturbed_df.head()

Unnamed: 0,Gender,Age,Household Size,Income Poverty Ratio,Body Mass Index,Diet Question One,Diet Question Alternate,Smoking Status,Physical Activity One,Physical Activity Two,...,Healthy Food Intake,Unhealthy Food Intake,Beverage Consumption,Milk Consumption,Medical Condition One,Medical Condition Two,Medical Condition Three,Medical Condition Four,Physical Activity Status,Weight
0,0,19,2,1,24,0.0,0.0,1,4,3,...,73,108,3,10,0.0,0,0,0.0,1,4.04185e-28
1,0,19,2,1,24,0.0,0.0,1,4,3,...,73,108,3,10,0.0,0,0,0.0,0,1.486914e-28
2,0,19,2,1,24,0.0,0.0,1,4,3,...,73,108,3,10,0.0,0,0,0.0,5,7.931374e-35
3,0,19,2,1,24,0.0,0.0,1,4,3,...,73,108,3,10,0.0,0,0,1.0,1,1.486914e-28
4,0,19,2,1,24,0.0,0.0,1,4,3,...,73,108,3,10,0.0,0,0,1.0,0,5.470047e-29
