In [7]:
pip install pandas


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install datasets transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[K     |████████████████████████████████| 480 kB 20.7 MB/s eta 0:00:01
[?25hCollecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 39.6 MB/s eta 0:00:01
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (737 kB)
[K     |████████████████████████████████| 737 kB 9.3 MB/s eta 0:00:01
[?25hCollecting fsspec[http]<=2024.9.0,>=2023.1.0
  Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[K     |████████████████████████████████| 179 kB 68.7 MB/s eta 0:00:01
[?25hCollecting pyarrow>=15.0.0
  Downloading pyarrow-18.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (40.1 MB)
[K     |████████████████████████████████| 40.1 MB 17.6 MB/s eta 0:00:01
Collecting aiohttp
  Downloading aioh

In [10]:
pip install torch torchvision 

Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-2.5.1-cp39-cp39-manylinux1_x86_64.whl (906.5 MB)
[K     |████████████████████████████████| 906.5 MB 15 kB/s /s eta 0:00:01
[?25hCollecting torchvision
  Downloading torchvision-0.20.1-cp39-cp39-manylinux1_x86_64.whl (7.2 MB)
[K     |████████████████████████████████| 7.2 MB 70.2 MB/s eta 0:00:01
[?25hCollecting nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64"
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)
[K     |████████████████████████████████| 13.8 MB 192.9 MB/s eta 0:00:01
[?25hCollecting nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64"
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)
[K     |████████████████████████████████| 363.4 MB 7.6 kB/s s eta 0:00:01
Collecting nvidia-nvjitlink-cu12==12.4.127

In [34]:
import os

import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the fine-tuned model and tokenizer.
# The directory can be overridden with the LIME_MODEL_PATH environment variable so the
# notebook is runnable on other machines; when the variable is unset the original
# location is used, so existing behaviour is unchanged.
model_path = os.environ.get("LIME_MODEL_PATH", "/Users/923673423/lime/fine_tuned_model")
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path)

In [35]:
# Short probe sentence: the household size is hidden behind a [MASK] token
# so the masked-language model has to fill it in.
test_text = " ".join([
    "The individual is 20 years old and lives in a household with [MASK] members.",
    "Their body mass index (BMI) is 24.5, which indicates they are in the normal range.",
])

In [36]:
import torch

# Encode the probe sentence as PyTorch tensors ("pt") ready to feed to the model
inputs = tokenizer(test_text, return_tensors="pt")

# Inference only: run the forward pass without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)

# Raw logits of the forward pass; the next cell indexes them as [batch, position]
predictions = outputs.logits

In [37]:
# Find the position(s) of the mask token in the (single) input sequence
mask_token_index = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]
if mask_token_index.numel() == 0:
    # Without this guard the cell would silently print an empty prediction list.
    raise ValueError("No mask token found in the tokenized input; nothing to predict.")

# Top 5 candidate token ids per mask position: shape (num_masks, 5) as nested lists
top_k_ids = predictions[0, mask_token_index].topk(5).indices.tolist()
# Decode every candidate id on its own so that several masks are never decoded
# together as if one row of ids were a single sequence.
tokens_per_mask = [
    [tokenizer.decode(token_id).strip() for token_id in candidate_ids]
    for candidate_ids in top_k_ids
]

# With exactly one mask (the case in this notebook) keep the flat lists that the
# rest of the notebook expects; with several masks expose one list per mask.
if len(top_k_ids) == 1:
    predicted_token_ids = top_k_ids[0]
    predicted_tokens = tokens_per_mask[0]
else:
    predicted_token_ids = top_k_ids
    predicted_tokens = tokens_per_mask

# Display results
print("Input text:", test_text)
print("Predicted tokens for [MASK]:", predicted_tokens)

Input text: The individual is 20 years old and lives in a household with [MASK] members. Their body mass index (BMI) is 24.5, which indicates they are in the normal range.
Predicted tokens for [MASK]: ['7', '2', '5', '4', '1']


In [38]:
import os

# Location of the source CSV. It can be overridden with the LIME_DATA_PATH environment
# variable so the notebook is runnable on other machines; when the variable is unset
# the original location is used, so existing behaviour is unchanged.
data_path = os.environ.get("LIME_DATA_PATH", '/Users/923673423/lime/data/data_class.csv')

# Raw column codes -> human-readable labels used by the description templates below.
# `rename` ignores keys that are not present in the file.
column_labels = {
    'Unnamed: 0': 'Index',
    'SEQN': 'Sequence Number',
    'RIAGENDR': 'Gender',
    'RIDAGEYR': 'Age',
    'DMDHHSIZ': 'Household Size',
    'INDFMPIR': 'Income Poverty Ratio',
    'BMXBMI': 'Body Mass Index',
    'DSD010': 'Diet Question One',
    'DSD010AN': 'Diet Question Alternate',
    'SMD415': 'Smoking Status',
    'PAD590': 'Physical Activity One',
    'PAD600': 'Physical Activity Two',
    'HUQ010': 'Health Status',
    'restaurant': 'Restaurant Visits',
    'protein': 'Protein Intake',
    'healthy': 'Healthy Food Intake',
    'unhealthy': 'Unhealthy Food Intake',
    'beverage': 'Beverage Consumption',
    'milk': 'Milk Consumption',
    'MCQ010': 'Medical Condition One',
    'MCQ053': 'Medical Condition Two',
    'MCQ092': 'Medical Condition Three',
    'MCQ140': 'Medical Condition Four',
    'active': 'Physical Activity Status'
}

data = pd.read_csv(data_path).rename(columns=column_labels)


# Define a function to generate text descriptions for each row
def _describe_row(row):
    """Render a single record as the fixed English paragraph used by this notebook."""
    poverty_ratio = row['Income Poverty Ratio']
    bmi = row['Body Mass Index']

    gender = 'male' if row['Gender'] == 1 else 'female'

    # NOTE: a missing (NaN) value fails every comparison and therefore falls through
    # to the last category ('above average' / 'obese').
    if poverty_ratio < 1:
        income_band = 'below average'
    elif 1 <= poverty_ratio <= 3:
        income_band = 'moderate'
    else:
        income_band = 'above average'

    if bmi < 18.5:
        bmi_band = 'underweight'
    elif 18.5 <= bmi < 25:
        bmi_band = 'in the normal range'
    elif 25 <= bmi < 30:
        bmi_band = 'overweight'
    else:
        bmi_band = 'obese'

    # NOTE(review): any value other than the string 'No' is reported as a smoker; the
    # sample output shows numeric values in the other columns, so confirm how
    # 'Smoking Status' is encoded.
    smoking = 'do not smoke' if row['Smoking Status'] == 'No' else 'are smokers'

    return (
        f"The individual is {gender} and is {row['Age']} years old. "
        f"They live in a household with {row['Household Size']} members. Their income-to-poverty ratio is {poverty_ratio:.2f}, "
        f"which is {income_band}. "
        f"Their body mass index (BMI) is {bmi:.1f}, calculated from their weight and height. This indicates they are "
        f"{bmi_band}. "
        f"They answered '{row['Diet Question One']}' to a question about their dietary habits, and '{row['Diet Question Alternate']}' to an alternate dietary question. "
        f"They currently {smoking}, and their physical activity includes {row['Physical Activity One']} minutes "
        f"of moderate-intensity activity and {row['Physical Activity Two']} minutes of vigorous-intensity activity weekly. "
        f"Their self-reported health status is {row['Health Status']} out of 5. "
        f"On average, they visit restaurants {row['Restaurant Visits']} times per month and consume {row['Protein Intake']} grams of protein daily. "
        f"Their healthy food intake is {row['Healthy Food Intake']} servings per day, compared to an unhealthy food intake of {row['Unhealthy Food Intake']} servings per day. "
        f"They drink {row['Beverage Consumption']} beverages daily and consume {row['Milk Consumption']} cups of milk daily. "
        f"Their reported medical conditions include: Condition One={row['Medical Condition One']}, Condition Two={row['Medical Condition Two']}, "
        f"Condition Three={row['Medical Condition Three']}, and Condition Four={row['Medical Condition Four']}. "
        f"Their overall physical activity status is {row['Physical Activity Status']}, which reflects their general lifestyle and fitness."
    )


def generate_text_descriptions(data):
    """
    Generate detailed text descriptions for each row in the dataset.

    Args:
        data (pd.DataFrame): The dataset to describe. It must contain the renamed
            columns referenced by the template ('Gender', 'Age', 'Household Size',
            'Income Poverty Ratio', 'Body Mass Index', ...).

    Returns:
        pd.Series: One description per row, indexed like ``data`` so that assigning
        the result back as a column lines up row-for-row even when ``data`` does not
        have a default 0..n-1 index.

    Raises:
        KeyError: If a column used by the template is missing from ``data``.
    """
    descriptions = [_describe_row(row) for _, row in data.iterrows()]

    # Reuse the frame's index: a fresh RangeIndex would be aligned by label on
    # assignment and leave NaN for every row whose label is not in 0..n-1.
    return pd.Series(descriptions, index=data.index)

# Generate text descriptions for the dataset and store them in a new
# 'Text_Description' column (adds the column to `data` in place).
data['Text_Description'] = generate_text_descriptions(data)

In [39]:
# Show the description stored under index label 0 (single .loc lookup instead of chained indexing)
data.loc[0, 'Text_Description']

"The individual is male and is 19.0 years old. They live in a household with 2.0 members. Their income-to-poverty ratio is 2.44, which is moderate. Their body mass index (BMI) is 50.9, calculated from their weight and height. This indicates they are obese. They answered '0.0' to a question about their dietary habits, and '0.0' to an alternate dietary question. They currently are smokers, and their physical activity includes 4.0 minutes of moderate-intensity activity and 3.0 minutes of vigorous-intensity activity weekly. Their self-reported health status is 3.0 out of 5. On average, they visit restaurants 0.0 times per month and consume 82.0 grams of protein daily. Their healthy food intake is 73.0 servings per day, compared to an unhealthy food intake of 108.0 servings per day. They drink 3.0 beverages daily and consume 10.0 cups of milk daily. Their reported medical conditions include: Condition One=0.0, Condition Two=0.0, Condition Three=0.0, and Condition Four=0.0. Their overall phy

In [40]:
# Description of the first record with the BMI value replaced by [MASK].
# Adjacent string literals are concatenated into one string, one sentence per line.
test_row = (
    "The individual is male and is 19.0 years old. "
    "They live in a household with 2.0 members. "
    "Their income-to-poverty ratio is 2.44, which is moderate. "
    "Their body mass index (BMI) is [MASK], calculated from their weight and height. "
    "This indicates they are obese. "
    "They answered '0.0' to a question about their dietary habits, and '0.0' to an alternate dietary question. "
    "They currently are smokers, and their physical activity includes 4.0 minutes of moderate-intensity activity "
    "and 3.0 minutes of vigorous-intensity activity weekly. "
    "Their self-reported health status is 3.0 out of 5. "
    "On average, they visit restaurants 0.0 times per month and consume 82.0 grams of protein daily. "
    "Their healthy food intake is 73.0 servings per day, compared to an unhealthy food intake of 108.0 servings per day. "
    "They drink 3.0 beverages daily and consume 10.0 cups of milk daily. "
    "Their reported medical conditions include: Condition One=0.0, Condition Two=0.0, Condition Three=0.0, and Condition Four=0.0. "
    "Their overall physical activity status is 1.0, which reflects their general lifestyle and fitness."
)

In [41]:
import torch

# Encode the masked record description as PyTorch tensors ("pt")
inputs = tokenizer(test_row, return_tensors="pt")

# Inference only: run the forward pass without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)

# Raw logits of the forward pass; the next cell indexes them as [batch, position]
predictions = outputs.logits

In [42]:
# Find the position(s) of the mask token in the (single) input sequence
mask_token_index = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]
if mask_token_index.numel() == 0:
    # Without this guard the cell would silently print an empty prediction list.
    raise ValueError("No mask token found in the tokenized input; nothing to predict.")

# Top 5 candidate token ids per mask position: shape (num_masks, 5) as nested lists
top_k_ids = predictions[0, mask_token_index].topk(5).indices.tolist()
# Decode every candidate id on its own so that several masks are never decoded
# together as if one row of ids were a single sequence.
tokens_per_mask = [
    [tokenizer.decode(token_id).strip() for token_id in candidate_ids]
    for candidate_ids in top_k_ids
]

# With exactly one mask (the case in this notebook) keep the flat lists that the
# rest of the notebook expects; with several masks expose one list per mask.
if len(top_k_ids) == 1:
    predicted_token_ids = top_k_ids[0]
    predicted_tokens = tokens_per_mask[0]
else:
    predicted_token_ids = top_k_ids
    predicted_tokens = tokens_per_mask

# Display results
print("Input text:", test_row)
print("Predicted tokens for [MASK]:", predicted_tokens)

Input text: The individual is male and is 19.0 years old. They live in a household with 2.0 members. Their income-to-poverty ratio is 2.44, which is moderate. Their body mass index (BMI) is [MASK], calculated from their weight and height. This indicates they are obese. They answered '0.0' to a question about their dietary habits, and '0.0' to an alternate dietary question. They currently are smokers, and their physical activity includes 4.0 minutes of moderate-intensity activity and 3.0 minutes of vigorous-intensity activity weekly. Their self-reported health status is 3.0 out of 5. On average, they visit restaurants 0.0 times per month and consume 82.0 grams of protein daily. Their healthy food intake is 73.0 servings per day, compared to an unhealthy food intake of 108.0 servings per day. They drink 3.0 beverages daily and consume 10.0 cups of milk daily. Their reported medical conditions include: Condition One=0.0, Condition Two=0.0, Condition Three=0.0, and Condition Four=0.0. Thei

In [None]:
def generate_description_with_mask(row, mask_token="[MASK]", columns_to_mask=None):
    """
    Generate a description for a row with specified columns masked.

    Only the columns that appear as maskable slots in the template are affected:
    'Age', 'Household Size', 'Body Mass Index', 'Physical Activity One' and
    'Protein Intake'. Other names in ``columns_to_mask`` are ignored.

    Args:
        row (pd.Series or Mapping): Row of data, keyed by the renamed column labels.
        mask_token (str): The token to use for masking.
        columns_to_mask (list | str | None): Column name(s) to mask. ``None`` (the
            default) masks nothing; a single string is treated as one column name.

    Returns:
        str: Description with specified values replaced by the mask token.

    Raises:
        KeyError: If a column used by the template is missing from ``row``.
    """
    # Normalise the argument so membership tests work for None, a single name,
    # or any iterable of names.
    if columns_to_mask is None:
        masked = frozenset()
    elif isinstance(columns_to_mask, str):
        masked = frozenset([columns_to_mask])
    else:
        masked = frozenset(columns_to_mask)

    def shown(column, spec=""):
        # Apply the numeric format spec only to real values; formatting the mask
        # string with e.g. '.1f' would raise ValueError.
        return mask_token if column in masked else format(row[column], spec)

    poverty_ratio = row['Income Poverty Ratio']
    bmi = row['Body Mass Index']

    # NOTE(review): the BMI category sentence is still derived from the true value
    # even when 'Body Mass Index' is masked - confirm whether that hint is intended.
    description = (
        f"The individual is {'male' if row['Gender'] == 1 else 'female'} and is "
        f"{shown('Age')} years old. "
        f"They live in a household with "
        f"{shown('Household Size')} members. "
        f"Their income-to-poverty ratio is {poverty_ratio:.2f}, "
        f"which is {'below average' if poverty_ratio < 1 else 'moderate' if 1 <= poverty_ratio <= 3 else 'above average'}. "
        f"Their body mass index (BMI) is "
        f"{shown('Body Mass Index', '.1f')}, "
        f"calculated from their weight and height. This indicates they are "
        f"{'underweight' if bmi < 18.5 else 'in the normal range' if 18.5 <= bmi < 25 else 'overweight' if 25 <= bmi < 30 else 'obese'}. "
        f"They answered '{row['Diet Question One']}' to a question about their dietary habits, and '{row['Diet Question Alternate']}' to an alternate dietary question. "
        f"They currently {'do not smoke' if row['Smoking Status'] == 'No' else 'are smokers'}, and their physical activity includes "
        f"{shown('Physical Activity One')} minutes "
        f"of moderate-intensity activity and {row['Physical Activity Two']} minutes of vigorous-intensity activity weekly. "
        f"Their self-reported health status is {row['Health Status']} out of 5. "
        f"On average, they visit restaurants {row['Restaurant Visits']} times per month and consume "
        f"{shown('Protein Intake')} grams of protein daily. "
        f"Their healthy food intake is {row['Healthy Food Intake']} servings per day, compared to an unhealthy food intake of "
        f"{row['Unhealthy Food Intake']} servings per day. "
        f"They drink {row['Beverage Consumption']} beverages daily and consume {row['Milk Consumption']} cups of milk daily. "
        f"Their reported medical conditions include: Condition One={row['Medical Condition One']}, Condition Two={row['Medical Condition Two']}, "
        f"Condition Three={row['Medical Condition Three']}, and Condition Four={row['Medical Condition Four']}. "
        f"Their overall physical activity status is {row['Physical Activity Status']}, which reflects their general lifestyle and fitness."
    )
    return description