In [1]:
pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install torch transformers dataset

Defaulting to user installation because normal site-packages is not writeable
Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl (18 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2
  Downloading SQLAlchemy-1.4.54-cp39-cp39-manylinux1_x86_64.manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_5_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 27.2 MB/s eta 0:00:01
[?25hCollecting alembic>=0.6.2
  Downloading alembic-1.14.0-py3-none-any.whl (233 kB)
[K     |████████████████████████████████| 233 kB 99.5 MB/s eta 0:00:01
[?25hCollecting banal>=1.0.1
  Downloading banal-1.0.6-py2.py3-none-any.whl (6.1 kB)
Collecting greenlet!=0.4.17; python_version >= "3" and (platform_machine == "aarch64" or (platform_machine == "ppc64le" or (platform_machine == "x86_64" or (platform_machine == "amd64" or (platform_machine == "AMD64" or (platform_machine == "win32" or platform_machine == "WIN32"))))))
  Downloading greenlet-3.1.

In [2]:
import pandas as pd
import random
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the fine-tuned model and tokenizer
model_path = "/Users/923673423/lime/fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [4]:
data = pd.read_csv('/Users/923673423/lime/data/data_class.csv')

data = data.rename(columns={
    'Unnamed: 0': 'Index',
    'SEQN': 'Sequence Number',
    'RIAGENDR': 'Gender',
    'RIDAGEYR': 'Age',
    'DMDHHSIZ': 'Household Size',
    'INDFMPIR': 'Income Poverty Ratio',
    'BMXBMI': 'Body Mass Index',
    'DSD010': 'Diet Question One',
    'DSD010AN': 'Diet Question Alternate',
    'SMD415': 'Smoking Status',
    'PAD590': 'Physical Activity One',
    'PAD600': 'Physical Activity Two',
    'HUQ010': 'Health Status',
    'restaurant': 'Restaurant Visits',
    'protein': 'Protein Intake',
    'healthy': 'Healthy Food Intake',
    'unhealthy': 'Unhealthy Food Intake',
    'beverage': 'Beverage Consumption',
    'milk': 'Milk Consumption',
    'MCQ010': 'Medical Condition One',
    'MCQ053': 'Medical Condition Two',
    'MCQ092': 'Medical Condition Three',
    'MCQ140': 'Medical Condition Four',
    'active': 'Physical Activity Status'
})


# Define a function to generate text descriptions for each row
def generate_text_descriptions(data):
    """
    Generate detailed text descriptions for each row in the dataset.

    Args:
        data (pd.DataFrame): The dataset to describe.

    Returns:
        pd.Series: A series of text descriptions for each row.
    """
    descriptions = []

    for _, row in data.iterrows():
        description = (
        f"The individual is {'male' if row['Gender'] == 1 else 'female'} and is {row['Age']} years old. "
        f"They live in a household with {row['Household Size']} members. Their income-to-poverty ratio is {row['Income Poverty Ratio']:.2f}, "
        f"which is {'below average' if row['Income Poverty Ratio'] < 1 else 'moderate' if 1 <= row['Income Poverty Ratio'] <= 3 else 'above average'}. "
        f"Their body mass index (BMI) is {row['Body Mass Index']:.1f}, calculated from their weight and height. This indicates they are "
        f"{'underweight' if row['Body Mass Index'] < 18.5 else 'in the normal range' if 18.5 <= row['Body Mass Index'] < 25 else 'overweight' if 25 <= row['Body Mass Index'] < 30 else 'obese'}. "
        f"They answered '{row['Diet Question One']}' to a question about their dietary habits, and '{row['Diet Question Alternate']}' to an alternate dietary question. "
        f"They currently {'do not smoke' if row['Smoking Status'] == 'No' else 'are smokers'}, and their physical activity includes {row['Physical Activity One']} minutes "
        f"of moderate-intensity activity and {row['Physical Activity Two']} minutes of vigorous-intensity activity weekly. "
        f"Their self-reported health status is {row['Health Status']} out of 5. "
        f"On average, they visit restaurants {row['Restaurant Visits']} times per month and consume {row['Protein Intake']} grams of protein daily. "
        f"Their healthy food intake is {row['Healthy Food Intake']} servings per day, compared to an unhealthy food intake of {row['Unhealthy Food Intake']} servings per day. "
        f"They drink {row['Beverage Consumption']} beverages daily and consume {row['Milk Consumption']} cups of milk daily. "
        f"Their reported medical conditions include: Condition One={row['Medical Condition One']}, Condition Two={row['Medical Condition Two']}, "
        f"Condition Three={row['Medical Condition Three']}, and Condition Four={row['Medical Condition Four']}. "
        f"Their overall physical activity status is {row['Physical Activity Status']}, which reflects their general lifestyle and fitness."
    )
        descriptions.append(description)

    return pd.Series(descriptions)

# Generate text descriptions for the dataset
data['Text_Description'] = generate_text_descriptions(data)

In [9]:
def generate_description_with_mask(row, mask_token="[MASK]", columns_to_mask=None):
    """
    Generate a description for a row with specified columns masked.

    Args:
        row (pd.Series): Row of data.
        mask_token (str): The token to use for masking.
        columns_to_mask (list): List of column names to mask.

    Returns:
        str: Description with specified values replaced by the mask token.
    """
    def format_value(value, fmt="{:.2f}"):
        """
        Safely format a value as a float or return as is if masking or not numeric.
        """
        if value == mask_token or isinstance(value, str):
            return value
        try:
            return fmt.format(float(value))
        except (ValueError, TypeError):
            return str(value)

    description = (
        f"The individual is {'male' if row['Gender'] == 1 else 'female'} and is "
        f"{mask_token if 'Age' in columns_to_mask else format_value(row['Age'], '{:.1f}')} years old. "
        f"They live in a household with "
        f"{mask_token if 'Household Size' in columns_to_mask else format_value(row['Household Size'], '{:.1f}')} members. "
        f"Their income-to-poverty ratio is "
        f"{format_value(row['Income Poverty Ratio'], '{:.2f}')}, "
        f"which is {'below average' if row['Income Poverty Ratio'] < 1 else 'moderate' if 1 <= row['Income Poverty Ratio'] <= 3 else 'above average'}. "
        f"Their body mass index (BMI) is "
        f"{mask_token if 'Body Mass Index' in columns_to_mask else format_value(row['Body Mass Index'], '{:.1f}')}, "
        f"calculated from their weight and height. This indicates they are "
        f"{'underweight' if row['Body Mass Index'] < 18.5 else 'in the normal range' if 18.5 <= row['Body Mass Index'] < 25 else 'overweight' if 25 <= row['Body Mass Index'] < 30 else 'obese'}. "
        f"They answered '{row['Diet Question One']}' to a question about their dietary habits, and '{row['Diet Question Alternate']}' to an alternate dietary question. "
        f"They currently {'do not smoke' if row['Smoking Status'] == 'No' else 'are smokers'}, and their physical activity includes "
        f"{mask_token if 'Physical Activity One' in columns_to_mask else format_value(row['Physical Activity One'], '{:.1f}')} minutes "
        f"of moderate-intensity activity and {format_value(row['Physical Activity Two'], '{:.1f}')} minutes of vigorous-intensity activity weekly. "
        f"Their self-reported health status is {format_value(row['Health Status'], '{:.1f}')} out of 5. "
        f"On average, they visit restaurants {format_value(row['Restaurant Visits'], '{:.1f}')} times per month and consume "
        f"{mask_token if 'Protein Intake' in columns_to_mask else format_value(row['Protein Intake'], '{:.1f}')} grams of protein daily. "
        f"Their healthy food intake is {format_value(row['Healthy Food Intake'], '{:.1f}')} servings per day, compared to an unhealthy food intake of "
        f"{format_value(row['Unhealthy Food Intake'], '{:.1f}')} servings per day. "
        f"They drink {format_value(row['Beverage Consumption'], '{:.1f}')} beverages daily and consume {format_value(row['Milk Consumption'], '{:.1f}')} cups of milk daily. "
        f"Their reported medical conditions include: Condition One={row['Medical Condition One']}, Condition Two={row['Medical Condition Two']}, "
        f"Condition Three={row['Medical Condition Three']}, and Condition Four={row['Medical Condition Four']}. "
        f"Their overall physical activity status is {format_value(row['Physical Activity Status'], '{:.1f}')}, which reflects their general lifestyle and fitness."
    )
    return description

In [13]:
def extract_predictions(description):
    """
    Predict masked values in the description using the model.

    Args:
        description (str): The input description with masked values.

    Returns:
        list: Predicted values for each mask.
    """
    # Tokenize the description
    inputs = tokenizer(description, return_tensors="pt")
    
    # Generate predictions
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits

    # Extract predictions for [MASK] tokens
    mask_token_index = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]
    predicted_token_ids = predictions[0, mask_token_index].topk(1).indices.squeeze()

    # Ensure predicted_token_ids is always a list
    if mask_token_index.shape[0] == 1:  # Single MASK token
        predicted_token_ids = [predicted_token_ids.item()]
    else:  # Multiple MASK tokens
        predicted_token_ids = predicted_token_ids.tolist()

    # Decode tokens to get predicted values
    predicted_tokens = [tokenizer.decode(token_id).strip() for token_id in predicted_token_ids]
    
    return predicted_tokens

In [14]:
def generate_perturbed_dataset(row, numeric_columns, num_samples=10):
    """
    Generate a perturbed dataset by masking and predicting specific columns.

    Args:
        row (pd.Series): Row of data.
        numeric_columns (list): List of numerical columns to perturb.
        num_samples (int): Number of perturbations to generate.

    Returns:
        pd.DataFrame: Perturbed dataset with predictions.
    """
    perturbed_rows = []

    for _ in range(num_samples):
        # Randomly select columns to mask
        num_columns_to_mask = random.randint(1, len(numeric_columns))
        columns_to_mask = random.sample(numeric_columns, num_columns_to_mask)

        # Generate a masked description
        description = generate_description_with_mask(row, columns_to_mask=columns_to_mask)

        # Predict the masked values
        predicted_values = extract_predictions(description)

        # Create a new perturbed row
        perturbed_row = row.copy()
        for col, value in zip(columns_to_mask, predicted_values):
            perturbed_row[col] = value

        # Append the perturbed row to the dataset
        perturbed_rows.append(perturbed_row)

    return pd.DataFrame(perturbed_rows)

In [15]:
# Define the list of numerical columns to perturb
numeric_columns = ['Age', 'Household Size', 'Body Mass Index', 'Physical Activity One', 'Protein Intake']

# Select a single data row
single_row = data.iloc[0]

# Generate the perturbed dataset
perturbed_dataset = generate_perturbed_dataset(single_row, numeric_columns, num_samples=10)

# Display the perturbed dataset
perturbed_dataset

Unnamed: 0,Index,Sequence Number,Gender,Age,Household Size,Income Poverty Ratio,Body Mass Index,Diet Question One,Diet Question Alternate,Smoking Status,...,Healthy Food Intake,Unhealthy Food Intake,Beverage Consumption,Milk Consumption,Medical Condition One,Medical Condition Two,Medical Condition Three,Medical Condition Four,Physical Activity Status,Text_Description
0,0,21005.0,1.0,30.0,6.0,2.44,2.0,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,2.0,2.44,30.0,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,6.0,2.0,2.44,0.0,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,80.0,2.0,2.44,50.85,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,2.0,0.0,2.44,30.0,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,2.0,2.44,30.0,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,45.0,6.0,2.44,50.85,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,2.0,2.44,50.85,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,19.0,30.0,2.44,2.0,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
0,0,21005.0,1.0,0.0,2.0,2.44,6.0,0.0,0.0,0.0,...,73.0,108.0,3.0,10.0,0.0,0.0,0.0,0.0,1,The individual is male and is 19.0 years old. ...
