In [1]:
pip install pandas torch transformers datasets


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import random
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the fine-tuned model and tokenizer
model_path = "/Users/923673423/lime/fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [4]:
data = pd.read_csv('/Users/923673423/lime/data/data_class.csv')

data = data.rename(columns={
    'Unnamed: 0': 'Index',
    'SEQN': 'Sequence Number',
    'RIAGENDR': 'Gender',
    'RIDAGEYR': 'Age',
    'DMDHHSIZ': 'Household Size',
    'INDFMPIR': 'Income Poverty Ratio',
    'BMXBMI': 'Body Mass Index',
    'DSD010': 'Diet Question One',
    'DSD010AN': 'Diet Question Alternate',
    'SMD415': 'Smoking Status',
    'PAD590': 'Physical Activity One',
    'PAD600': 'Physical Activity Two',
    'HUQ010': 'Health Status',
    'restaurant': 'Restaurant Visits',
    'protein': 'Protein Intake',
    'healthy': 'Healthy Food Intake',
    'unhealthy': 'Unhealthy Food Intake',
    'beverage': 'Beverage Consumption',
    'milk': 'Milk Consumption',
    'MCQ010': 'Medical Condition One',
    'MCQ053': 'Medical Condition Two',
    'MCQ092': 'Medical Condition Three',
    'MCQ140': 'Medical Condition Four',
    'active': 'Physical Activity Status'
})

In [5]:
def generate_text_descriptions(data):
    """
    Generate detailed text descriptions for each row in the dataset.
    """
    descriptions = []

    for _, row in data.iterrows():
        try:
            description = (
                f"The individual is {'male' if row['Gender'] == 1 else 'female'} and is {row['Age']} years old. "
                f"They live in a household with {row['Household Size']} members. Their income-to-poverty ratio is {row['Income Poverty Ratio']:.2f}, "
                f"which is {'below average' if row['Income Poverty Ratio'] < 1 else 'moderate' if 1 <= row['Income Poverty Ratio'] <= 3 else 'above average'}. "
                f"Their body mass index (BMI) is {row['Body Mass Index']:.1f}, calculated from their weight and height. "
                f"This indicates they are {'underweight' if row['Body Mass Index'] < 18.5 else 'in the normal range' if 18.5 <= row['Body Mass Index'] < 25 else 'overweight' if 25 <= row['Body Mass Index'] < 30 else 'obese'}. "
                f"They answered '{row['Diet Question One']}' to a dietary question and '{row['Diet Question Alternate']}' as an alternate response. "
                f"They currently {'do not smoke' if row['Smoking Status'] == 'No' else 'are smokers'}. "
                f"Their physical activity includes {row['Physical Activity One']} minutes of moderate activity and {row['Physical Activity Two']} minutes of vigorous activity weekly. "
                f"Their self-reported health status is {row['Health Status']} out of 5. "
                f"They visit restaurants {row['Restaurant Visits']} times monthly and consume {row['Protein Intake']} grams of protein daily. "
                f"Medical conditions include: One={row['Medical Condition One']}, Two={row['Medical Condition Two']}, "
                f"Three={row['Medical Condition Three']}, Four={row['Medical Condition Four']}. "
                f"Their overall physical activity status is {row['Physical Activity Status']}."
            )
        except KeyError as e:
            print(f"Missing column in data: {e}")
            description = "Incomplete data."
        descriptions.append(description)

    return pd.Series(descriptions)


data['Text_Description'] = generate_text_descriptions(data)


In [31]:
def extract_predictions(description, top_n=5):
    """
    Predict masked values in the description using the model.

    Args:
        description (str): The input description with masked values.
        top_n (int): Number of top predictions to consider.

    Returns:
        list: Predicted values for each mask.
    """
    inputs = tokenizer(description, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits

    mask_token_indices = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]
    all_predicted_values = []

    for mask_idx in mask_token_indices:
        # Handle top_k properly to avoid issues with single predictions
        try:
            predicted_token_ids = predictions[0, mask_idx].topk(top_n).indices.squeeze().tolist()
        except ValueError:
            predicted_token_ids = [predictions[0, mask_idx].topk(1).indices.item()]  # Default to single prediction

        predicted_tokens = [tokenizer.decode(token_id).strip() for token_id in predicted_token_ids]
        all_predicted_values.append(predicted_tokens)

    return all_predicted_values

In [7]:
def generate_description_with_mask(row, mask_token="[MASK]", columns_to_mask=None):
    """
    Generate a description for a row with specified columns masked.
    """
    description = generate_text_descriptions(pd.DataFrame([row])).iloc[0]
    for col in columns_to_mask:
        if col in row:
            description = description.replace(str(row[col]), mask_token)
    return description

In [32]:
def generate_highly_dynamic_perturbations(row, numeric_columns, categorical_columns, num_samples=50, max_mask_columns=3):
    """
    Generate a highly dynamic and diverse set of perturbations for a single row.

    Args:
        row (pd.Series): A single row of the dataset.
        numeric_columns (list): List of numeric columns to perturb.
        categorical_columns (list): List of categorical columns to perturb.
        num_samples (int): Number of perturbations to generate.
        max_mask_columns (int): Maximum number of columns to mask in a single perturbation.

    Returns:
        pd.DataFrame: Dataset with highly dynamic and diverse perturbed rows.
    """
    perturbed_data = []

    for _ in range(num_samples):
        # Randomly select 1 to `max_mask_columns` columns to mask
        num_columns_to_mask = random.randint(1, max_mask_columns)
        columns_to_mask = random.sample(list(row.index), num_columns_to_mask)

        # Mask the selected columns
        masked_description = generate_description_with_mask(row, columns_to_mask=columns_to_mask)
        top_n_predictions = random.randint(1, 5)  # Randomize top-n predictions for more variety
        predictions = extract_predictions(masked_description, top_n=top_n_predictions)

        for predicted_combination in zip(*predictions):
            perturbed_row = row.copy()

            # Apply predictions directly to masked columns
            for col, value in zip(columns_to_mask, predicted_combination):
                if col in numeric_columns:
                    perturbed_row[col] = try_convert_to_numeric(value, default=row[col]) + random.uniform(-5, 5)
                elif col in categorical_columns:
                    perturbed_row[col] = value

            # Randomly tweak unmasked numeric columns to add slight variability
            for col in numeric_columns:
                if col not in columns_to_mask:
                    perturbed_row[col] += random.uniform(-2, 2)

            perturbed_data.append(perturbed_row)

    return pd.DataFrame(perturbed_data)


def try_convert_to_numeric(value, default):
    """
    Try to convert a value to numeric. If conversion fails, return the default value.

    Args:
        value (str): The value to convert.
        default: The default value to return if conversion fails.

    Returns:
        float or int: The converted value or the default.
    """
    try:
        return float(value)
    except ValueError:
        return default


# Example Usage
numeric_columns = [
    'Age', 'Household Size', 'Body Mass Index', 'Physical Activity One',
    'Physical Activity Two', 'Protein Intake', 'Healthy Food Intake', 'Unhealthy Food Intake'
]
categorical_columns = [
    'Gender', 'Smoking Status', 'Health Status', 'Diet Question One', 'Diet Question Alternate',
    'Medical Condition One', 'Medical Condition Two', 'Medical Condition Three', 'Medical Condition Four',
    'Physical Activity Status'
]

# Select a single row
single_row = data.iloc[0]

# Generate perturbed dataset for the single row
perturbed_dataset_dynamic = generate_highly_dynamic_perturbations(
    row=single_row,
    numeric_columns=numeric_columns,
    categorical_columns=categorical_columns,
    num_samples=50,
    max_mask_columns=3
)

# Display or save the perturbed dataset
perturbed_dataset_dynamic.head()

TypeError: 'int' object is not iterable