In [1]:
import pandas as pd
import itertools
import random
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

In [2]:
data = pd.read_csv('../data/data_class.csv')
data

Unnamed: 0.1,Unnamed: 0,SEQN,RIAGENDR,RIDAGEYR,DMDHHSIZ,INDFMPIR,BMXBMI,DSD010,DSD010AN,SMD415,...,protein,healthy,unhealthy,beverage,milk,MCQ010,MCQ053,MCQ092,MCQ140,active
0,0,21005.0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,...,82.0,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1
1,1,21006.0,2.0,16.0,6.0,2.47,20.78,0.0,0.0,1.0,...,100.0,94.0,137.0,3.0,10.0,0.0,0.0,0.0,0.0,0
2,2,21007.0,2.0,14.0,5.0,1.60,18.43,0.0,0.0,1.0,...,62.0,92.0,95.0,4.0,14.0,0.0,0.0,0.0,0.0,0
3,3,21008.0,1.0,17.0,7.0,2.75,20.65,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
4,4,21009.0,1.0,55.0,2.0,3.79,31.26,0.0,0.0,0.0,...,87.0,111.0,121.0,7.0,13.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12563,12563,41469.0,1.0,19.0,2.0,0.66,19.25,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
12564,12564,41471.0,1.0,12.0,5.0,0.35,18.41,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
12565,12565,41472.0,1.0,34.0,2.0,3.66,26.23,1.0,0.0,0.0,...,90.0,85.0,133.0,17.0,19.0,0.0,0.0,0.0,0.0,0
12566,12566,41473.0,1.0,21.0,6.0,1.16,26.16,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [3]:
# Load the fine-tuned model and tokenizer
model_path = "../fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path)

In [4]:
data = data.drop(columns=['SEQN', 'Unnamed: 0'])
data

Unnamed: 0,RIAGENDR,RIDAGEYR,DMDHHSIZ,INDFMPIR,BMXBMI,DSD010,DSD010AN,SMD415,PAD590,PAD600,...,protein,healthy,unhealthy,beverage,milk,MCQ010,MCQ053,MCQ092,MCQ140,active
0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,4.0,3.0,...,82.0,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1
1,2.0,16.0,6.0,2.47,20.78,0.0,0.0,1.0,5.0,1.0,...,100.0,94.0,137.0,3.0,10.0,0.0,0.0,0.0,0.0,0
2,2.0,14.0,5.0,1.60,18.43,0.0,0.0,1.0,4.0,2.0,...,62.0,92.0,95.0,4.0,14.0,0.0,0.0,0.0,0.0,0
3,1.0,17.0,7.0,2.75,20.65,0.0,0.0,1.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
4,1.0,55.0,2.0,3.79,31.26,0.0,0.0,0.0,2.0,3.0,...,87.0,111.0,121.0,7.0,13.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12563,1.0,19.0,2.0,0.66,19.25,0.0,0.0,0.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
12564,1.0,12.0,5.0,0.35,18.41,0.0,0.0,0.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
12565,1.0,34.0,2.0,3.66,26.23,1.0,0.0,0.0,5.0,2.0,...,90.0,85.0,133.0,17.0,19.0,0.0,0.0,0.0,0.0,0
12566,1.0,21.0,6.0,1.16,26.16,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [5]:
data = data.rename(columns={
    'RIAGENDR': 'Gender',
    'RIDAGEYR': 'Age',
    'DMDHHSIZ': 'Household Size',
    'INDFMPIR': 'Income Poverty Ratio',
    'BMXBMI': 'Body Mass Index',
    'DSD010': 'Diet Question One',
    'DSD010AN': 'Diet Question Alternate',
    'SMD415': 'Smoking Status',
    'PAD590': 'Physical Activity One',
    'PAD600': 'Physical Activity Two',
    'HUQ010': 'Health Status',
    'restaurant': 'Restaurant Visits',
    'protein': 'Protein Intake',
    'healthy': 'Healthy Food Intake',
    'unhealthy': 'Unhealthy Food Intake',
    'beverage': 'Beverage Consumption',
    'milk': 'Milk Consumption',
    'MCQ010': 'Medical Condition One',
    'MCQ053': 'Medical Condition Two',
    'MCQ092': 'Medical Condition Three',
    'MCQ140': 'Medical Condition Four',
    'active': 'Physical Activity Status'
})

data

Unnamed: 0,Gender,Age,Household Size,Income Poverty Ratio,Body Mass Index,Diet Question One,Diet Question Alternate,Smoking Status,Physical Activity One,Physical Activity Two,...,Protein Intake,Healthy Food Intake,Unhealthy Food Intake,Beverage Consumption,Milk Consumption,Medical Condition One,Medical Condition Two,Medical Condition Three,Medical Condition Four,Physical Activity Status
0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,4.0,3.0,...,82.0,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1
1,2.0,16.0,6.0,2.47,20.78,0.0,0.0,1.0,5.0,1.0,...,100.0,94.0,137.0,3.0,10.0,0.0,0.0,0.0,0.0,0
2,2.0,14.0,5.0,1.60,18.43,0.0,0.0,1.0,4.0,2.0,...,62.0,92.0,95.0,4.0,14.0,0.0,0.0,0.0,0.0,0
3,1.0,17.0,7.0,2.75,20.65,0.0,0.0,1.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
4,1.0,55.0,2.0,3.79,31.26,0.0,0.0,0.0,2.0,3.0,...,87.0,111.0,121.0,7.0,13.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12563,1.0,19.0,2.0,0.66,19.25,0.0,0.0,0.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
12564,1.0,12.0,5.0,0.35,18.41,0.0,0.0,0.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
12565,1.0,34.0,2.0,3.66,26.23,1.0,0.0,0.0,5.0,2.0,...,90.0,85.0,133.0,17.0,19.0,0.0,0.0,0.0,0.0,0
12566,1.0,21.0,6.0,1.16,26.16,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [6]:
# Define a function to generate text descriptions for each row
def generate_text_descriptions(data):
    """
    Generate detailed text descriptions for each row in the dataset.

    Args:
        data (pd.DataFrame): The dataset to describe.

    Returns:
        pd.Series: A series of text descriptions for each row.
    """
    descriptions = []

    for _, row in data.iterrows():
        description = (
        f"The individual is {'male' if row['Gender'] == 1 else 'female'} and is {row['Age']} years old. "
        f"They live in a household with {row['Household Size']} members. Their income-to-poverty ratio is {row['Income Poverty Ratio']:.2f}, "
        f"which is {'below average' if row['Income Poverty Ratio'] < 1 else 'moderate' if 1 <= row['Income Poverty Ratio'] <= 3 else 'above average'}. "
        f"Their body mass index (BMI) is {row['Body Mass Index']:.1f}, calculated from their weight and height. This indicates they are "
        f"{'underweight' if row['Body Mass Index'] < 18.5 else 'in the normal range' if 18.5 <= row['Body Mass Index'] < 25 else 'overweight' if 25 <= row['Body Mass Index'] < 30 else 'obese'}. "
        f"They answered '{row['Diet Question One']}' to a question about their dietary habits, and '{row['Diet Question Alternate']}' to an alternate dietary question. "
        f"They currently {'do not smoke' if row['Smoking Status'] == 'No' else 'are smokers'}, and their physical activity includes {row['Physical Activity One']} minutes "
        f"of moderate-intensity activity and {row['Physical Activity Two']} minutes of vigorous-intensity activity weekly. "
        f"Their self-reported health status is {row['Health Status']} out of 5. "
        f"On average, they visit restaurants {row['Restaurant Visits']} times per month and consume {row['Protein Intake']} grams of protein daily. "
        f"Their healthy food intake is {row['Healthy Food Intake']} servings per day, compared to an unhealthy food intake of {row['Unhealthy Food Intake']} servings per day. "
        f"They drink {row['Beverage Consumption']} beverages daily and consume {row['Milk Consumption']} cups of milk daily. "
        f"Their reported medical conditions include: Condition One={row['Medical Condition One']}, Condition Two={row['Medical Condition Two']}, "
        f"Condition Three={row['Medical Condition Three']}, and Condition Four={row['Medical Condition Four']}. "
        f"Their overall physical activity status is {row['Physical Activity Status']}, which reflects their general lifestyle and fitness."
    )
        descriptions.append(description)

    return pd.Series(descriptions)

# Generate text descriptions for the dataset
data['Text_Description'] = generate_text_descriptions(data)

In [47]:
data.columns

Index(['Gender', 'Age', 'Household Size', 'Income Poverty Ratio',
       'Body Mass Index', 'Diet Question One', 'Diet Question Alternate',
       'Smoking Status', 'Physical Activity One', 'Physical Activity Two',
       'Health Status', 'Restaurant Visits', 'Protein Intake',
       'Healthy Food Intake', 'Unhealthy Food Intake', 'Beverage Consumption',
       'Milk Consumption', 'Medical Condition One', 'Medical Condition Two',
       'Medical Condition Three', 'Medical Condition Four',
       'Physical Activity Status', 'Text_Description'],
      dtype='object')

In [33]:
from itertools import combinations
import random

def generate_many_masked_descriptions_for_row(row, maskable_columns, max_sentences=100, min_subset_size=2, max_subset_size=5):
    """
    Generate a large number of masked text descriptions for a single row by varying the subset size.

    Args:
        row (pd.Series): The row to describe.
        maskable_columns (list): List of columns that can be masked.
        max_sentences (int): Maximum number of sentences to generate.
        min_subset_size (int): Minimum number of columns to mask in each description.
        max_subset_size (int): Maximum number of columns to mask in each description.

    Returns:
        list: List of masked text descriptions for the row.
    """
    descriptions = []
    num_columns = len(maskable_columns)

    # Random sampling of subsets of columns to mask
    for _ in range(max_sentences):
        # Randomly choose a subset size
        subset_size = random.randint(min_subset_size, min(max_subset_size, num_columns))
        # Randomly sample the columns to mask
        mask_columns = random.sample(maskable_columns, subset_size)

        # Safely format numeric values
        def safe_format(value, fmt):
            try:
                return fmt.format(value)
            except (ValueError, TypeError):
                return str(value)

        # Generate a masked description
        description = (
            f"The individual is {'[MASK]' if 'Gender' in mask_columns else ('male' if row['Gender'] == 1 else 'female')} and is "
            f"{'[MASK]' if 'Age' in mask_columns else safe_format(row['Age'], '{:.0f}')} years old. "
            f"They live in a household with {'[MASK]' if 'Household Size' in mask_columns else safe_format(row['Household Size'], '{:.0f}')} members. "
            f"Their income-to-poverty ratio is {'[MASK]' if 'Income Poverty Ratio' in mask_columns else safe_format(row['Income Poverty Ratio'], '{:.2f}')}, "
            f"which is {'[MASK]' if 'Income Poverty Ratio' in mask_columns else 'below average' if row['Income Poverty Ratio'] < 1 else 'moderate' if 1 <= row['Income Poverty Ratio'] <= 3 else 'above average'}. "
            f"Their body mass index (BMI) is {'[MASK]' if 'Body Mass Index' in mask_columns else safe_format(row['Body Mass Index'], '{:.1f}')}, "
            f"calculated from their weight and height. This indicates they are "
            f"{'[MASK]' if 'Body Mass Index' in mask_columns else 'underweight' if row['Body Mass Index'] < 18.5 else 'in the normal range' if 18.5 <= row['Body Mass Index'] < 25 else 'overweight' if 25 <= row['Body Mass Index'] < 30 else 'obese'}. "
            f"They answered {'[MASK]' if 'Diet Question One' in mask_columns else row['Diet Question One']} to a question about their dietary habits, and "
            f"{'[MASK]' if 'Diet Question Alternate' in mask_columns else row['Diet Question Alternate']} to an alternate dietary question. "
            f"They currently {'[MASK]' if 'Smoking Status' in mask_columns else 'do not smoke' if row['Smoking Status'] == 'No' else 'are smokers'}, and their physical activity includes "
            f"{'[MASK]' if 'Physical Activity One' in mask_columns else safe_format(row['Physical Activity One'], '{:.0f}')} minutes of moderate-intensity activity and "
            f"{'[MASK]' if 'Physical Activity Two' in mask_columns else safe_format(row['Physical Activity Two'], '{:.0f}')} minutes of vigorous-intensity activity weekly. "
            f"Their self-reported health status is {'[MASK]' if 'Health Status' in mask_columns else safe_format(row['Health Status'], '{:.0f}')} out of 5. "
            f"On average, they visit restaurants {'[MASK]' if 'Restaurant Visits' in mask_columns else safe_format(row['Restaurant Visits'], '{:.0f}')} times per month and consume "
            f"{'[MASK]' if 'Protein Intake' in mask_columns else safe_format(row['Protein Intake'], '{:.0f}')} grams of protein daily. "
            f"Their healthy food intake is {'[MASK]' if 'Healthy Food Intake' in mask_columns else safe_format(row['Healthy Food Intake'], '{:.0f}')} servings per day, compared to an unhealthy food intake of "
            f"{'[MASK]' if 'Unhealthy Food Intake' in mask_columns else safe_format(row['Unhealthy Food Intake'], '{:.0f}')} servings per day. "
            f"They drink {'[MASK]' if 'Beverage Consumption' in mask_columns else safe_format(row['Beverage Consumption'], '{:.0f}')} beverages daily and consume "
            f"{'[MASK]' if 'Milk Consumption' in mask_columns else safe_format(row['Milk Consumption'], '{:.0f}')} cups of milk daily. "
            f"Their reported medical conditions include: Condition One={'[MASK]' if 'Medical Condition One' in mask_columns else row['Medical Condition One']}, "
            f"Condition Two={'[MASK]' if 'Medical Condition Two' in mask_columns else row['Medical Condition Two']}, "
            f"Condition Three={'[MASK]' if 'Medical Condition Three' in mask_columns else row['Medical Condition Three']}, and "
            f"Condition Four={'[MASK]' if 'Medical Condition Four' in mask_columns else row['Medical Condition Four']}. "
            f"Their overall physical activity status is {'[MASK]' if 'Physical Activity Status' in mask_columns else row['Physical Activity Status']}, "
            f"which reflects their general lifestyle and fitness."
        )

        # Add the description to the list
        descriptions.append(description)

    return descriptions

In [34]:
# Example: Generate many masked descriptions for a single row
sample_row = data.iloc[0]  # Replace with any row from your DataFrame
maskable_columns = data.columns.tolist()

# Generate a large number of masked descriptions
many_descriptions = generate_many_masked_descriptions_for_row(
    sample_row, maskable_columns, max_sentences=100, min_subset_size=5, max_subset_size=10
)

# View the generated descriptions
for desc in many_descriptions:  # Show the first 10 for brevity
    print(desc)
    print('-' * 50)

The individual is male and is 19 years old. They live in a household with 2 members. Their income-to-poverty ratio is [MASK], which is [MASK]. Their body mass index (BMI) is [MASK], calculated from their weight and height. This indicates they are [MASK]. They answered 0.0 to a question about their dietary habits, and 0.0 to an alternate dietary question. They currently are smokers, and their physical activity includes 4 minutes of moderate-intensity activity and 3 minutes of vigorous-intensity activity weekly. Their self-reported health status is [MASK] out of 5. On average, they visit restaurants 0 times per month and consume 82 grams of protein daily. Their healthy food intake is 73 servings per day, compared to an unhealthy food intake of 108 servings per day. They drink 3 beverages daily and consume 10 cups of milk daily. Their reported medical conditions include: Condition One=0.0, Condition Two=[MASK], Condition Three=[MASK], and Condition Four=[MASK]. Their overall physical acti

In [35]:
from itertools import product
import torch

def generate_predicted_sentences(masked_sentences, tokenizer, model, top_k=5):
    """
    Generate multiple predicted sentences for each masked sentence.

    Args:
        masked_sentences (list): List of masked text descriptions.
        tokenizer: Hugging Face tokenizer.
        model: Pre-trained masked language model.
        top_k (int): Number of top predictions to consider for each `[MASK]`.

    Returns:
        list: List of all predicted sentences.
    """
    all_predicted_sentences = []

    for masked_sentence in masked_sentences:
        # Tokenize the input
        inputs = tokenizer(masked_sentence, return_tensors="pt", truncation=True, padding=True)
        mask_token_index = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]

        # Predict the masked tokens
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits

        # Get the top-k predicted token IDs for each `[MASK]`
        all_predictions = []
        for token_idx in mask_token_index:
            mask_token_logits = logits[0, token_idx, :]
            top_token_ids = torch.topk(mask_token_logits, top_k, dim=0).indices
            top_tokens = tokenizer.convert_ids_to_tokens(top_token_ids)
            all_predictions.append(top_tokens)

        # Generate all combinations of sentences with the top predictions
        combinations = product(*all_predictions)  # Cartesian product of top predictions
        for combination in combinations:
            updated_sentence = masked_sentence
            for prediction in combination:
                updated_sentence = updated_sentence.replace("[MASK]", prediction, 1)
            all_predicted_sentences.append(updated_sentence)

    return all_predicted_sentences

In [36]:
# Select the first masked sentence (or any list of masked sentences)
masked_sentences = [many_descriptions[0]]  # Replace 0 with any index for other sentences

predicted_sentences = generate_predicted_sentences(masked_sentences, tokenizer, model, top_k=3)

# Output predicted sentences
predicted_sentences

['The individual is male and is 19 years old. They live in a household with 2 members. Their income-to-poverty ratio is 1, which is moderate. Their body mass index (BMI) is 24, calculated from their weight and height. This indicates they are ##weight. They answered 0.0 to a question about their dietary habits, and 0.0 to an alternate dietary question. They currently are smokers, and their physical activity includes 4 minutes of moderate-intensity activity and 3 minutes of vigorous-intensity activity weekly. Their self-reported health status is 1 out of 5. On average, they visit restaurants 0 times per month and consume 82 grams of protein daily. Their healthy food intake is 73 servings per day, compared to an unhealthy food intake of 108 servings per day. They drink 3 beverages daily and consume 10 cups of milk daily. Their reported medical conditions include: Condition One=0.0, Condition Two=0, Condition Three=0, and Condition Four=0. Their overall physical activity status is 1, which

In [37]:
def extract_original_values(predicted_sentences):
    """
    Extract values corresponding to the original columns in the dataset from the predicted sentences.

    Args:
        predicted_sentences (list): List of predicted sentences.

    Returns:
        pd.DataFrame: DataFrame with extracted values corresponding to the original columns.
    """
    import re
    import pandas as pd

    extracted_data = []

    # Regular expressions for each original column
    patterns = {
        "Gender": r"The individual is (\w+)",
        "Age": r"and is (\d+) years old",
        "Household Size": r"They live in a household with (\d+) members",
        "Income Poverty Ratio": r"ratio is ([\d.]+), which is",
        "Body Mass Index": r"\(BMI\) is ([\d.]+)",
        "Diet Question One": r"They answered ([\d.]+) to a question about their dietary habits",
        "Diet Question Alternate": r"and ([\d.]+) to an alternate dietary question",
        "Smoking Status": r"They currently (\w+(?: \w+)*),",
        "Physical Activity One": r"includes (\d+) minutes of moderate-intensity activity",
        "Physical Activity Two": r"and (\d+) minutes of vigorous-intensity activity",
        "Health Status": r"health status is (\d+) out of 5",
        "Restaurant Visits": r"they visit restaurants (\d+) times per month",
        "Protein Intake": r"consume (\d+) grams of protein daily",
        "Healthy Food Intake": r"Their healthy food intake is (\d+) servings",
        "Unhealthy Food Intake": r"unhealthy food intake of (\d+) servings",
        "Beverage Consumption": r"They drink (\d+) beverages daily",
        "Milk Consumption": r"consume (\d+) cups of milk daily",
        "Medical Condition One": r"Condition One=([\d.]+)",
        "Medical Condition Two": r"Condition Two=([\d.]+)",
        "Medical Condition Three": r"Condition Three=([\d.]+)",
        "Medical Condition Four": r"Condition Four=([\d.]+)",
        "Physical Activity Status": r"Their overall physical activity status is (\d+)"
    }

    # Process each predicted sentence
    for sentence in predicted_sentences:
        extracted_row = {}
        for key, pattern in patterns.items():
            match = re.search(pattern, sentence)
            extracted_row[key] = match.group(1) if match else None
        extracted_data.append(extracted_row)

    # Convert the list of dictionaries into a DataFrame
    return pd.DataFrame(extracted_data)

In [38]:
predicted_sentences

['The individual is male and is 19 years old. They live in a household with 2 members. Their income-to-poverty ratio is 1, which is moderate. Their body mass index (BMI) is 24, calculated from their weight and height. This indicates they are ##weight. They answered 0.0 to a question about their dietary habits, and 0.0 to an alternate dietary question. They currently are smokers, and their physical activity includes 4 minutes of moderate-intensity activity and 3 minutes of vigorous-intensity activity weekly. Their self-reported health status is 1 out of 5. On average, they visit restaurants 0 times per month and consume 82 grams of protein daily. Their healthy food intake is 73 servings per day, compared to an unhealthy food intake of 108 servings per day. They drink 3 beverages daily and consume 10 cups of milk daily. Their reported medical conditions include: Condition One=0.0, Condition Two=0, Condition Three=0, and Condition Four=0. Their overall physical activity status is 1, which

In [39]:
# Extract values and create DataFrame
df = extract_original_values(predicted_sentences)

# Display DataFrame
df

Unnamed: 0,Gender,Age,Household Size,Income Poverty Ratio,Body Mass Index,Diet Question One,Diet Question Alternate,Smoking Status,Physical Activity One,Physical Activity Two,...,Protein Intake,Healthy Food Intake,Unhealthy Food Intake,Beverage Consumption,Milk Consumption,Medical Condition One,Medical Condition Two,Medical Condition Three,Medical Condition Four,Physical Activity Status
0,male,19,2,1,24,0.0,0.0,are smokers,4,3,...,82,73,108,3,10,0.0,0,0,0.,1
1,male,19,2,1,24,0.0,0.0,are smokers,4,3,...,82,73,108,3,10,0.0,0,0,0.,0
2,male,19,2,1,24,0.0,0.0,are smokers,4,3,...,82,73,108,3,10,0.0,0,0,0.,5
3,male,19,2,1,24,0.0,0.0,are smokers,4,3,...,82,73,108,3,10,0.0,0,0,1.,1
4,male,19,2,1,24,0.0,0.0,are smokers,4,3,...,82,73,108,3,10,0.0,0,0,1.,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19678,male,19,2,5,23,0.0,0.0,are smokers,4,3,...,82,73,108,3,10,0.0,2,2,1.,0
19679,male,19,2,5,23,0.0,0.0,are smokers,4,3,...,82,73,108,3,10,0.0,2,2,1.,5
19680,male,19,2,5,23,0.0,0.0,are smokers,4,3,...,82,73,108,3,10,0.0,2,2,,1
19681,male,19,2,5,23,0.0,0.0,are smokers,4,3,...,82,73,108,3,10,0.0,2,2,,0


In [40]:
# Display unique values for each column in the DataFrame
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Column: {column}")
    print(f"Unique values ({len(unique_values)}): {unique_values}\n")

Column: Gender
Unique values (1): ['male']

Column: Age
Unique values (1): ['19']

Column: Household Size
Unique values (1): ['2']

Column: Income Poverty Ratio
Unique values (3): ['1' '2' '5']

Column: Body Mass Index
Unique values (3): ['24' '25' '23']

Column: Diet Question One
Unique values (1): ['0.0']

Column: Diet Question Alternate
Unique values (1): ['0.0']

Column: Smoking Status
Unique values (1): ['are smokers']

Column: Physical Activity One
Unique values (1): ['4']

Column: Physical Activity Two
Unique values (1): ['3']

Column: Health Status
Unique values (3): ['1' '3' '2']

Column: Restaurant Visits
Unique values (1): ['0']

Column: Protein Intake
Unique values (1): ['82']

Column: Healthy Food Intake
Unique values (1): ['73']

Column: Unhealthy Food Intake
Unique values (1): ['108']

Column: Beverage Consumption
Unique values (1): ['3']

Column: Milk Consumption
Unique values (1): ['10']

Column: Medical Condition One
Unique values (1): ['0.0']

Column: Medical Conditi

In [68]:
def clean_and_standardize_data_v4(perturbed_data):
    """
    Clean and standardize data types in the perturbed dataset, handling close variations in values.

    Args:
        perturbed_data (pd.DataFrame): The perturbed dataset.

    Returns:
        pd.DataFrame: Cleaned and standardized dataset.
    """
    import numpy as np

    # Map for categorical values
    binary_mappings = {
        "Gender": {
            "male": 0, "female": 1, "femal": 1, "females": 1,
            "mal": 0, "males": 0
        },
        "Smoking Status": {
            "0": 0, "1": 1, 0: 0, 1: 1, "do not smoke": 0, "are smokers": 1
        },
        "Medical Condition One": {0: 0, 1: 1, "0.0": 0, "1.0": 1},
        "Medical Condition Two": {0: 0, 1: 1, "0.0": 0, "1.0": 1},
        "Medical Condition Three": {0: 0, 1: 1, "0.0": 0, "1.0": 1},
        "Medical Condition Four": {0: 0, 1: 1, "0.0": 0, "1.0": 1},
        "Physical Activity Status": {0: 0, 1: 1}
    }

    # Apply mappings to relevant columns
    for column, mapping in binary_mappings.items():
        if column in perturbed_data.columns:
            perturbed_data[column] = perturbed_data[column].replace(mapping)

    # Handle non-mapped values and convert numeric columns
    medical_columns = [
        "Medical Condition One", "Medical Condition Two",
        "Medical Condition Three", "Medical Condition Four"
    ]
    for column in medical_columns:
        if column in perturbed_data.columns:
            perturbed_data[column] = pd.to_numeric(perturbed_data[column], errors="coerce").fillna(0)

    # Convert other numeric columns
    numeric_columns = [
        "Age", "Household Size", "Income Poverty Ratio", "Body Mass Index",
        "Diet Question One", "Diet Question Alternate", "Physical Activity One",
        "Physical Activity Two", "Health Status", "Restaurant Visits",
        "Protein Intake", "Healthy Food Intake", "Unhealthy Food Intake",
        "Beverage Consumption", "Milk Consumption"
    ]
    for column in numeric_columns:
        if column in perturbed_data.columns:
            perturbed_data[column] = pd.to_numeric(perturbed_data[column], errors="coerce").fillna(0)

    # Ensure `Text_Description` remains text
    if "Text_Description" in perturbed_data.columns:
        perturbed_data["Text_Description"] = perturbed_data["Text_Description"].astype(str)

    # Return the cleaned dataset
    return perturbed_data

# Apply the updated cleaning function
cleaned_perturbed_df_v4 = clean_and_standardize_data_v4(df)

# Save the cleaned dataset
cleaned_perturbed_df_v4.to_csv("../data/cleaned_perturbed_data_point_final_v4.csv", index=False)

# Display sample rows
cleaned_perturbed_df.head()

Unnamed: 0,Gender,Age,Household Size,Income Poverty Ratio,Body Mass Index,Diet Question One,Diet Question Alternate,Smoking Status,Physical Activity One,Physical Activity Two,...,Protein Intake,Healthy Food Intake,Unhealthy Food Intake,Beverage Consumption,Milk Consumption,Medical Condition One,Medical Condition Two,Medical Condition Three,Medical Condition Four,Physical Activity Status
0,0,19,2,1,24,0.0,0.0,1,4,3,...,82,73,108,3,10,0.0,0,0,0.0,1
1,0,19,2,1,24,0.0,0.0,1,4,3,...,82,73,108,3,10,0.0,0,0,0.0,0
2,0,19,2,1,24,0.0,0.0,1,4,3,...,82,73,108,3,10,0.0,0,0,0.0,5
3,0,19,2,1,24,0.0,0.0,1,4,3,...,82,73,108,3,10,0.0,0,0,1.0,1
4,0,19,2,1,24,0.0,0.0,1,4,3,...,82,73,108,3,10,0.0,0,0,1.0,0


In [69]:
cleaned_perturbed_df.head()


Unnamed: 0,Gender,Age,Household Size,Income Poverty Ratio,Body Mass Index,Diet Question One,Diet Question Alternate,Smoking Status,Physical Activity One,Physical Activity Two,...,Protein Intake,Healthy Food Intake,Unhealthy Food Intake,Beverage Consumption,Milk Consumption,Medical Condition One,Medical Condition Two,Medical Condition Three,Medical Condition Four,Physical Activity Status
0,0,19,2,1,24,0.0,0.0,1,4,3,...,82,73,108,3,10,0.0,0,0,0.0,1
1,0,19,2,1,24,0.0,0.0,1,4,3,...,82,73,108,3,10,0.0,0,0,0.0,0
2,0,19,2,1,24,0.0,0.0,1,4,3,...,82,73,108,3,10,0.0,0,0,0.0,5
3,0,19,2,1,24,0.0,0.0,1,4,3,...,82,73,108,3,10,0.0,0,0,1.0,1
4,0,19,2,1,24,0.0,0.0,1,4,3,...,82,73,108,3,10,0.0,0,0,1.0,0


In [70]:
import os

def save_perturbed_data(df, original_index, save_dir="../data/", file_prefix="perturbed_data"):
    """
    Save the perturbed dataset to a specified directory with a unique name.

    Args:
        df (pd.DataFrame): The perturbed dataset.
        original_index (int): The index of the original data point being perturbed.
        save_dir (str): The directory where the file will be saved.
        file_prefix (str): Prefix for the file name.

    Returns:
        str: Path to the saved file.
    """
    # Ensure the save directory exists
    os.makedirs(save_dir, exist_ok=True)

    # Construct the file name using the prefix and original index
    file_name = f"{file_prefix}_point_{original_index}.csv"
    file_path = os.path.join(save_dir, file_name)

    # Save the DataFrame as a CSV file
    df.to_csv(file_path, index=False)

    print(f"Perturbed dataset saved to: {file_path}")
    return file_path

In [71]:
# Save the perturbed dataset
file_path = save_perturbed_data(cleaned_perturbed_df, original_index=42)

Perturbed dataset saved to: ../data/perturbed_data_point_42.csv


In [72]:
data.dtypes

Gender                      float64
Age                         float64
Household Size              float64
Income Poverty Ratio        float64
Body Mass Index             float64
Diet Question One           float64
Diet Question Alternate     float64
Smoking Status              float64
Physical Activity One       float64
Physical Activity Two       float64
Health Status               float64
Restaurant Visits           float64
Protein Intake              float64
Healthy Food Intake         float64
Unhealthy Food Intake       float64
Beverage Consumption        float64
Milk Consumption            float64
Medical Condition One       float64
Medical Condition Two       float64
Medical Condition Three     float64
Medical Condition Four      float64
Physical Activity Status      int64
Text_Description             object
dtype: object

In [73]:
df.dtypes

Gender                        int64
Age                           int64
Household Size                int64
Income Poverty Ratio          int64
Body Mass Index               int64
Diet Question One           float64
Diet Question Alternate     float64
Smoking Status                int64
Physical Activity One         int64
Physical Activity Two         int64
Health Status                 int64
Restaurant Visits             int64
Protein Intake                int64
Healthy Food Intake           int64
Unhealthy Food Intake         int64
Beverage Consumption          int64
Milk Consumption              int64
Medical Condition One       float64
Medical Condition Two         int64
Medical Condition Three       int64
Medical Condition Four      float64
Physical Activity Status     object
dtype: object