In [2]:
pip install pandas


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install torch transformers dataset

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import random
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the fine-tuned model and tokenizer
model_path = "/Users/923673423/lime/fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [6]:
data = pd.read_csv('/Users/923673423/lime/data/data_class.csv')

data = data.rename(columns={
    'Unnamed: 0': 'Index',
    'SEQN': 'Sequence Number',
    'RIAGENDR': 'Gender',
    'RIDAGEYR': 'Age',
    'DMDHHSIZ': 'Household Size',
    'INDFMPIR': 'Income Poverty Ratio',
    'BMXBMI': 'Body Mass Index',
    'DSD010': 'Diet Question One',
    'DSD010AN': 'Diet Question Alternate',
    'SMD415': 'Smoking Status',
    'PAD590': 'Physical Activity One',
    'PAD600': 'Physical Activity Two',
    'HUQ010': 'Health Status',
    'restaurant': 'Restaurant Visits',
    'protein': 'Protein Intake',
    'healthy': 'Healthy Food Intake',
    'unhealthy': 'Unhealthy Food Intake',
    'beverage': 'Beverage Consumption',
    'milk': 'Milk Consumption',
    'MCQ010': 'Medical Condition One',
    'MCQ053': 'Medical Condition Two',
    'MCQ092': 'Medical Condition Three',
    'MCQ140': 'Medical Condition Four',
    'active': 'Physical Activity Status'
})


# Define a function to generate text descriptions for each row
def generate_text_descriptions(data):
    """
    Generate detailed text descriptions for each row in the dataset.

    Args:
        data (pd.DataFrame): The dataset to describe.

    Returns:
        pd.Series: A series of text descriptions for each row.
    """
    descriptions = []

    for _, row in data.iterrows():
        description = (
        f"The individual is {'male' if row['Gender'] == 1 else 'female'} and is {row['Age']} years old. "
        f"They live in a household with {row['Household Size']} members. Their income-to-poverty ratio is {row['Income Poverty Ratio']:.2f}, "
        f"which is {'below average' if row['Income Poverty Ratio'] < 1 else 'moderate' if 1 <= row['Income Poverty Ratio'] <= 3 else 'above average'}. "
        f"Their body mass index (BMI) is {row['Body Mass Index']:.1f}, calculated from their weight and height. This indicates they are "
        f"{'underweight' if row['Body Mass Index'] < 18.5 else 'in the normal range' if 18.5 <= row['Body Mass Index'] < 25 else 'overweight' if 25 <= row['Body Mass Index'] < 30 else 'obese'}. "
        f"They answered '{row['Diet Question One']}' to a question about their dietary habits, and '{row['Diet Question Alternate']}' to an alternate dietary question. "
        f"They currently {'do not smoke' if row['Smoking Status'] == 'No' else 'are smokers'}, and their physical activity includes {row['Physical Activity One']} minutes "
        f"of moderate-intensity activity and {row['Physical Activity Two']} minutes of vigorous-intensity activity weekly. "
        f"Their self-reported health status is {row['Health Status']} out of 5. "
        f"On average, they visit restaurants {row['Restaurant Visits']} times per month and consume {row['Protein Intake']} grams of protein daily. "
        f"Their healthy food intake is {row['Healthy Food Intake']} servings per day, compared to an unhealthy food intake of {row['Unhealthy Food Intake']} servings per day. "
        f"They drink {row['Beverage Consumption']} beverages daily and consume {row['Milk Consumption']} cups of milk daily. "
        f"Their reported medical conditions include: Condition One={row['Medical Condition One']}, Condition Two={row['Medical Condition Two']}, "
        f"Condition Three={row['Medical Condition Three']}, and Condition Four={row['Medical Condition Four']}. "
        f"Their overall physical activity status is {row['Physical Activity Status']}, which reflects their general lifestyle and fitness."
    )
        descriptions.append(description)

    return pd.Series(descriptions)

# Generate text descriptions for the dataset
data['Text_Description'] = generate_text_descriptions(data)

In [7]:
def generate_description_with_mask(row, mask_token="[MASK]", columns_to_mask=None):
    """
    Generate a description for a row with specified columns masked.

    Args:
        row (pd.Series): Row of data.
        mask_token (str): The token to use for masking.
        columns_to_mask (list): List of column names to mask.

    Returns:
        str: Description with specified values replaced by the mask token.
    """
    def format_value(value, fmt="{:.2f}"):
        if value == mask_token or isinstance(value, str):
            return value
        try:
            return fmt.format(float(value))
        except (ValueError, TypeError):
            return str(value)

    description = (
        f"The individual is {'male' if row['Gender'] == 1 else 'female'} and is "
        f"{mask_token if 'Age' in columns_to_mask else format_value(row['Age'], '{:.1f}')} years old. "
        f"They live in a household with "
        f"{mask_token if 'Household Size' in columns_to_mask else format_value(row['Household Size'], '{:.1f}')} members. "
        f"Their income-to-poverty ratio is {format_value(row['Income Poverty Ratio'], '{:.2f}')}, "
        f"which is {'below average' if row['Income Poverty Ratio'] < 1 else 'moderate' if 1 <= row['Income Poverty Ratio'] <= 3 else 'above average'}. "
        f"Their body mass index (BMI) is "
        f"{mask_token if 'Body Mass Index' in columns_to_mask else format_value(row['Body Mass Index'], '{:.1f}')}, "
        f"calculated from their weight and height. This indicates they are "
        f"{'underweight' if row['Body Mass Index'] < 18.5 else 'in the normal range' if 18.5 <= row['Body Mass Index'] < 25 else 'overweight' if 25 <= row['Body Mass Index'] < 30 else 'obese'}. "
        f"They answered '{mask_token if 'Diet Question One' in columns_to_mask else row['Diet Question One']}' "
        f"to a question about their dietary habits, and '{mask_token if 'Diet Question Alternate' in columns_to_mask else row['Diet Question Alternate']}' to an alternate dietary question. "
        f"They currently {'do not smoke' if row['Smoking Status'] == 'No' else mask_token if 'Smoking Status' in columns_to_mask else 'are smokers'}, "
        f"and their physical activity includes {mask_token if 'Physical Activity One' in columns_to_mask else format_value(row['Physical Activity One'], '{:.1f}')} minutes "
        f"of moderate-intensity activity and {format_value(row['Physical Activity Two'], '{:.1f}')} minutes of vigorous-intensity activity weekly. "
        f"Their self-reported health status is {format_value(row['Health Status'], '{:.1f}')} out of 5. "
        f"On average, they visit restaurants {mask_token if 'Restaurant Visits' in columns_to_mask else format_value(row['Restaurant Visits'], '{:.1f}')} times per month "
        f"and consume {mask_token if 'Protein Intake' in columns_to_mask else format_value(row['Protein Intake'], '{:.1f}')} grams of protein daily. "
        f"Their healthy food intake is {format_value(row['Healthy Food Intake'], '{:.1f}')} servings per day, compared to an unhealthy food intake of "
        f"{format_value(row['Unhealthy Food Intake'], '{:.1f}')} servings per day. "
        f"They drink {format_value(row['Beverage Consumption'], '{:.1f}')} beverages daily and consume {format_value(row['Milk Consumption'], '{:.1f}')} cups of milk daily. "
        f"Their reported medical conditions include: Condition One={row['Medical Condition One']}, Condition Two={row['Medical Condition Two']}, "
        f"Condition Three={row['Medical Condition Three']}, and Condition Four={row['Medical Condition Four']}. "
        f"Their overall physical activity status is {format_value(row['Physical Activity Status'], '{:.1f}')}, which reflects their general lifestyle and fitness."
    )
    return description

In [8]:
def extract_predictions(description, top_n=5):
    """
    Predict masked values in the description using the model.

    Args:
        description (str): The input description with masked values.
        top_n (int): Number of top predictions to consider.

    Returns:
        list: Predicted values for each mask.
    """
    inputs = tokenizer(description, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits

    mask_token_indices = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]
    all_predicted_values = []

    for mask_idx in mask_token_indices:
        predicted_token_ids = predictions[0, mask_idx].topk(top_n).indices.squeeze().tolist()
        predicted_tokens = [tokenizer.decode(token_id).strip() for token_id in predicted_token_ids]
        all_predicted_values.append(predicted_tokens)

    return all_predicted_values

In [9]:
def generate_perturbed_dataset(row, numeric_columns, num_samples=50, top_n_predictions=5):
    """
    Generate a perturbed dataset by masking and predicting specific columns.

    Args:
        row (pd.Series): Row of data.
        numeric_columns (list): List of numerical columns to perturb.
        num_samples (int): Number of perturbations to generate.
        top_n_predictions (int): Number of top predictions to consider for each mask.

    Returns:
        pd.DataFrame: Perturbed dataset with predictions.
    """
    perturbed_rows = []

    for _ in range(num_samples):
        # Randomly select columns to mask
        num_columns_to_mask = random.randint(1, len(numeric_columns))
        columns_to_mask = random.sample(numeric_columns, num_columns_to_mask)

        # Generate masked description
        description = generate_description_with_mask(row, columns_to_mask=columns_to_mask)
        all_predictions = extract_predictions(description, top_n=top_n_predictions)

        for predicted_combination in zip(*all_predictions):
            perturbed_row = row.copy()

            for col, value in zip(columns_to_mask, predicted_combination):
                try:
                    if col == 'Age':
                        perturbed_row[col] = max(0, round(float(value) + random.uniform(-5, 5)))
                    elif col == 'Household Size':
                        perturbed_row[col] = max(1, round(float(value) + random.uniform(-2, 2)))
                    elif col == 'Body Mass Index':
                        perturbed_row[col] = max(15, min(40, float(value) + random.uniform(-5, 5)))
                    elif col in ['Physical Activity One', 'Physical Activity Two']:
                        perturbed_row[col] = max(0, float(value) + random.uniform(-30, 30))
                    elif col in ['Protein Intake', 'Healthy Food Intake', 'Unhealthy Food Intake']:
                        perturbed_row[col] = max(0, float(value) + random.uniform(-20, 20))
                    else:
                        # Directly use predictions for categorical or other columns
                        perturbed_row[col] = value
                except (ValueError, TypeError):
                    # If invalid prediction, retain original value
                    perturbed_row[col] = row[col]

            # Add variability to unmasked numeric columns
            for col in numeric_columns:
                if col not in columns_to_mask:
                    try:
                        if col == 'Age':
                            perturbed_row[col] = max(0, round(float(perturbed_row[col]) + random.uniform(-2, 2)))
                        elif col == 'Household Size':
                            perturbed_row[col] = max(1, round(float(perturbed_row[col]) + random.uniform(-1, 1)))
                        elif col == 'Body Mass Index':
                            perturbed_row[col] = max(15, min(40, float(perturbed_row[col]) + random.uniform(-2, 2)))
                        elif col in ['Physical Activity One', 'Physical Activity Two']:
                            perturbed_row[col] = max(0, float(perturbed_row[col]) + random.uniform(-10, 10))
                        elif col in ['Protein Intake', 'Healthy Food Intake', 'Unhealthy Food Intake']:
                            perturbed_row[col] = max(0, float(perturbed_row[col]) + random.uniform(-10, 10))
                    except (ValueError, TypeError):
                        # Retain original value if perturbation fails
                        perturbed_row[col] = row[col]

            perturbed_rows.append(perturbed_row)

    return pd.DataFrame(perturbed_rows)

In [10]:
# Define numeric columns to perturb
numeric_columns = [
    'Age', 'Household Size', 'Body Mass Index', 'Physical Activity One',
    'Physical Activity Two', 'Protein Intake', 'Healthy Food Intake', 'Unhealthy Food Intake'
]

# Select a single data row
single_row = data.iloc[0]

# Generate the perturbed dataset
perturbed_dataset = generate_perturbed_dataset(single_row, numeric_columns, num_samples=50, top_n_predictions=5)


perturbed_dataset

Unnamed: 0,Index,Sequence Number,Gender,Age,Household Size,Income Poverty Ratio,Body Mass Index,Diet Question One,Diet Question Alternate,Smoking Status,...,Healthy Food Intake,Unhealthy Food Intake,Beverage Consumption,Milk Consumption,Medical Condition One,Medical Condition Two,Medical Condition Three,Medical Condition Four,Physical Activity Status,Text_Description
0,0,21005.0,1.0,19.0,5.0,2.44,50.85,0.0,0.0,0.0,...,37.957094,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,4.0,2.44,50.85,0.0,0.0,0.0,...,27.172379,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,6.0,2.44,50.85,0.0,0.0,0.0,...,51.611046,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,...,44.399742,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,3.0,2.44,50.85,0.0,0.0,0.0,...,19.365240,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0,21005.0,1.0,19.0,3.0,2.44,40.00,0.0,0.0,0.0,...,69.630510,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,18.0,2.0,2.44,40.00,0.0,0.0,0.0,...,77.000990,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,20.0,2.0,2.44,40.00,0.0,0.0,0.0,...,75.835551,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,18.0,1.0,2.44,40.00,0.0,0.0,0.0,...,78.046418,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...


In [11]:
perturbed_dataset['Gender'].nunique

<bound method IndexOpsMixin.nunique of 0    1.0
0    1.0
0    1.0
0    1.0
0    1.0
    ... 
0    1.0
0    1.0
0    1.0
0    1.0
0    1.0
Name: Gender, Length: 245, dtype: float64>

In [12]:
def extract_predictions2(description, top_n=5):
    """
    Predict masked values in the description using the model.

    Args:
        description (str): The input description with masked values.
        top_n (int): Number of top predictions to consider.

    Returns:
        list: Predicted values for each mask.
    """
    inputs = tokenizer(description, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits

    mask_token_indices = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]
    all_predicted_values = []

    for mask_idx in mask_token_indices:
        # Get top predictions for the current [MASK]
        predicted_token_ids = predictions[0, mask_idx].topk(top_n).indices.squeeze()

        # Ensure predicted_token_ids is a list
        if predicted_token_ids.dim() == 0:  # Single value
            predicted_token_ids = [predicted_token_ids.item()]
        else:  # Multiple values
            predicted_token_ids = predicted_token_ids.tolist()

        # Decode tokens
        predicted_tokens = [tokenizer.decode(token_id).strip() for token_id in predicted_token_ids]
        all_predicted_values.append(predicted_tokens)

    return all_predicted_values

In [13]:
def generate_relational_perturbed_dataset(row, num_samples=50, top_n_predictions=5):
    """
    Generate a perturbed dataset with relational variations and predictions.

    Args:
        row (pd.Series): Row of data.
        num_samples (int): Number of perturbations to generate.
        top_n_predictions (int): Number of top predictions to consider for each mask.

    Returns:
        pd.DataFrame: Relationally perturbed dataset with predictions.
    """
    # Columns to vary and predict based on relationships
    relational_groups = [
        {'vary': ['Age'], 'predict': ['Body Mass Index', 'Physical Activity One', 'Physical Activity Two']},
        {'vary': ['Body Mass Index'], 'predict': ['Diet Question One', 'Protein Intake', 'Unhealthy Food Intake']},
        {'vary': ['Physical Activity One', 'Physical Activity Two'], 'predict': ['Health Status', 'Healthy Food Intake']},
    ]
    fixed_columns = ['Gender', 'Medical Condition One', 'Medical Condition Two', 'Medical Condition Three', 'Medical Condition Four']

    perturbed_rows = []

    for _ in range(num_samples):
        # Randomly select a relational group
        group = random.choice(relational_groups)
        vary_columns = group['vary']
        predict_columns = group['predict']

        # Generate masked description
        description = generate_description_with_mask(row, columns_to_mask=vary_columns)
        all_predictions = extract_predictions2(description, top_n=top_n_predictions)

        for predicted_combination in zip(*all_predictions):
            perturbed_row = row.copy()

            # Apply variations to "vary" columns
            for col, value in zip(vary_columns, predicted_combination):
                try:
                    if col == 'Age':
                        perturbed_row[col] = max(0, round(float(value) + random.uniform(-5, 5)))
                    elif col == 'Body Mass Index':
                        perturbed_row[col] = max(15, min(40, float(value) + random.uniform(-5, 5)))
                    elif col in ['Physical Activity One', 'Physical Activity Two']:
                        perturbed_row[col] = max(0, float(value) + random.uniform(-30, 30))
                except (ValueError, TypeError):
                    perturbed_row[col] = row[col]  # Retain original if invalid

            # Predict "predict" columns based on "vary" changes
            predict_description = generate_description_with_mask(perturbed_row, columns_to_mask=predict_columns)
            predict_predictions = extract_predictions2(predict_description, top_n=1)

            for col, predicted_value in zip(predict_columns, predict_predictions):
                try:
                    if col in ['Health Status', 'Healthy Food Intake', 'Unhealthy Food Intake']:
                        perturbed_row[col] = max(0, float(predicted_value[0]))  # Use first prediction
                    else:
                        perturbed_row[col] = predicted_value[0]  # Use first prediction
                except (ValueError, TypeError):
                    perturbed_row[col] = row[col]  # Retain original if invalid

            # Ensure fixed columns remain unchanged
            for col in fixed_columns:
                perturbed_row[col] = row[col]

            perturbed_rows.append(perturbed_row)

    return pd.DataFrame(perturbed_rows)

# Example Usage
# Define a single data row
single_row = data.iloc[0]

# Generate the relationally perturbed dataset
perturbed_dataset = generate_relational_perturbed_dataset(single_row, num_samples=50, top_n_predictions=5)

# Display the perturbed dataset
perturbed_dataset

Unnamed: 0,Index,Sequence Number,Gender,Age,Household Size,Income Poverty Ratio,Body Mass Index,Diet Question One,Diet Question Alternate,Smoking Status,...,Healthy Food Intake,Unhealthy Food Intake,Beverage Consumption,Milk Consumption,Medical Condition One,Medical Condition Two,Medical Condition Three,Medical Condition Four,Physical Activity Status,Text_Description
0,0,21005.0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0,21005.0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
