## NLP PROJECT #2

### Student: Jefferson Roesler

### Function to Calculate Calories and Protein

In [None]:
def calculate_calories_and_protein(weight, height, age, gender, activity_level, goal):

    # Mifflin-St Jeor BMR calculation
    if gender.lower() == "male":
        bmr = 10 * weight + 6.25 * height - 5 * age + 5
    elif gender.lower() == "female":
        bmr = 10 * weight + 6.25 * height - 5 * age - 161
    else:
        raise ValueError("Gender must be 'male' or 'female'")

    # Activity factor based on steps and workouts
    activity_factors = {
        "sedentary": 1.2,  # Less than 5,000 steps a day, no workouts
        "lightly active": 1.375,  # ~5,000 steps/day, 1-3 workouts/week
        "moderately active": 1.55,  # ~7,000-10,000 steps/day, 2-5 workouts/week
        "very active": 1.725  # More than 10,000 steps/day, 3-6 workouts/week
    }
    if activity_level not in activity_factors:
        raise ValueError(f"Invalid activity level. Choose from: {list(activity_factors.keys())}")

    # Adjust BMR by activity level
    calories = bmr * activity_factors[activity_level]

    # Adjust for goal
    if goal == "weight loss":
        calories -= 500  # Subtract 500 calories for a deficit
    elif goal == "muscle gain":
        calories += 500  # Add 500 calories for a surplus

    # Protein intake: 2 grams per kilogram of body weight
    protein = weight * 2  # Protein in grams

    return {
        "calories_per_day": round(calories, 2),
        "protein_per_day": round(protein, 2)
    }

# Example usage
result = calculate_calories_and_protein(
    weight=70,
    height=175,
    age=25,
    gender="male",
    activity_level="moderately active",
    goal="muscle gain"
)

print(result)




{'calories_per_day': 3094.31, 'protein_per_day': 140}


In [None]:
# Test inputs
test_input = {
    "weight": 84,              # kg
    "height": 180,             # cm
    "age": 40,                 # years
    "gender": "male",          # "male" or "female"
    "activity_level": "moderately active",  # Activity level
    "goal": "muscle gain"      # Goal: "weight loss", "maintenance", or "weight gain"
}

# Calculate calories
results = calculate_calories_and_protein(
    weight=test_input["weight"],
    height=test_input["height"],
    age=test_input["age"],
    gender=test_input["gender"],
    activity_level=test_input["activity_level"],
    goal=test_input["goal"]
)

print(f"Daily recommended calories and proteins: {results}")


Daily recommended calories and proteins: {'calories_per_day': 3243.5, 'protein_per_day': 168}


## Dataset

In [None]:
import json
import pandas as pd

# Load the JSON file
with open('foundationDownload.json', 'r') as file:
    data = json.load(file)

In [None]:
# Extract the list of foods
foods = data["FoundationFoods"]

# Flatten the data and include serving size
def process_food_data_with_serving_size(foods):
    processed_data = []
    for food in foods:
        # Extract food description
        description = food.get("description", "Unknown")

        # Extract serving size (from 'foodPortions')
        food_portions = food.get("foodPortions", [])
        if food_portions:
            # Assume the first portion is the standard serving size
            serving_size = food_portions[0].get("gramWeight", 0)  # Weight in grams
        else:
            serving_size = 0  # Default if no portion info available

        # Extract nutrients
        nutrients = food.get("foodNutrients", [])
        nutrient_dict = {n["nutrient"]["name"]: n["amount"] for n in nutrients if "amount" in n}

        # Keep only key nutrients and serving size
        important_nutrients = {
            "Description": description,
            "Serving Size (g)": serving_size,
            "Calories": nutrient_dict.get("Energy", 0),
            "Protein": nutrient_dict.get("Protein", 0),
            "Carbohydrates": nutrient_dict.get("Carbohydrate, by difference", 0),
            "Fat": nutrient_dict.get("Total lipid (fat)", 0)
        }
        processed_data.append(important_nutrients)

    return processed_data

# Process the food data with serving size
processed_foods_with_serving_size = process_food_data_with_serving_size(foods)

# Convert to a Pandas DataFrame for easier handling
food_df_with_serving_size = pd.DataFrame(processed_foods_with_serving_size)

# Display the first few rows
print(food_df_with_serving_size.head())


                                         Description  Serving Size (g)  \
0                                 Hummus, commercial              33.9   
1                               Tomatoes, grape, raw              49.7   
2  Beans, snap, green, canned, regular pack, drai...             129.0   
3                        Frankfurter, beef, unheated              48.6   
4        Nuts, almonds, dry roasted, with salt added             135.0   

   Calories  Protein  Carbohydrates    Fat  
0     229.0     7.35          14.90  17.10  
1     113.0     0.83           5.51   0.63  
2      86.0     1.04           4.11   0.39  
3    1310.0    11.70           2.89  28.00  
4    2590.0    20.40          16.20  57.80  


## Add columns for calories/protein per gram.


In [None]:
# Add new columns for calories and protein per gram
food_df_with_serving_size["Calories per Gram"] = food_df_with_serving_size["Calories"] / food_df_with_serving_size["Serving Size (g)"]
food_df_with_serving_size["Protein per Gram"] = food_df_with_serving_size["Protein"] / food_df_with_serving_size["Serving Size (g)"]

# Replace infinite or NaN values (e.g., where serving size is 0)
food_df_with_serving_size.replace([float('inf'), float('-inf')], 0, inplace=True)
food_df_with_serving_size.fillna(0, inplace=True)

# Display the updated DataFrame
print(food_df_with_serving_size.head())


                                         Description  Serving Size (g)  \
0                                 Hummus, commercial              33.9   
1                               Tomatoes, grape, raw              49.7   
2  Beans, snap, green, canned, regular pack, drai...             129.0   
3                        Frankfurter, beef, unheated              48.6   
4        Nuts, almonds, dry roasted, with salt added             135.0   

   Calories  Protein  Carbohydrates    Fat  Calories per Gram  \
0     229.0     7.35          14.90  17.10           6.755162   
1     113.0     0.83           5.51   0.63           2.273642   
2      86.0     1.04           4.11   0.39           0.666667   
3    1310.0    11.70           2.89  28.00          26.954733   
4    2590.0    20.40          16.20  57.80          19.185185   

   Protein per Gram  
0          0.216814  
1          0.016700  
2          0.008062  
3          0.240741  
4          0.151111  


## User Profile

In [None]:
# Define a global variable for the user profile
user_profile = {}

# Function to set up the user profile
def set_user_profile(weight, height, age, gender, activity_level, goal):
    global user_profile
    # Use the existing calculator to determine calories and protein
    user_data = calculate_calories_and_protein(
        weight=weight,
        height=height,
        age=age,
        gender=gender,
        activity_level=activity_level,
        goal=goal
    )
    # Store the data in the user profile
    user_profile = {
        "weight": weight,
        "height": height,
        "age": age,
        "gender": gender,
        "activity_level": activity_level,
        "goal": goal,
        "calories_per_day": user_data["calories_per_day"],
        "protein_per_day": user_data["protein_per_day"]
    }
    print("User profile created!")
    return user_profile

# Example usage
user_profile = set_user_profile(
    weight=70, height=175, age=25, gender="male",
    activity_level="moderately active", goal="muscle gain"
)

print(user_profile)


User profile created!
{'weight': 70, 'height': 175, 'age': 25, 'gender': 'male', 'activity_level': 'moderately active', 'goal': 'muscle gain', 'calories_per_day': 3094.31, 'protein_per_day': 140}


In [None]:
# Function to get user-specific needs
def get_user_needs():
    if not user_profile:
        return "No user profile found. Please set up your profile first."
    return f"Your daily calorie goal is {user_profile['calories_per_day']} kcal, and your protein goal is {user_profile['protein_per_day']} g."

# Example usage
print(get_user_needs())


Your daily calorie goal is 3094.31 kcal, and your protein goal is 140 g.


## Food Evaluation

In [None]:
# Evaluate food based on user profile
def evaluate_food_personalized(food_name, df):
    if not user_profile:
        return "Please set up your user profile first."

    # Fetch food details
    food_details = get_food_details(food_name, df)
    if isinstance(food_details, str):  # If food not found
        return food_details

    # Extract user-specific goals
    goal = user_profile["goal"]
    daily_calories = user_profile["calories_per_day"]
    daily_protein = user_profile["protein_per_day"]

    # Extract food details
    calories_per_gram = food_details["Calories per Gram"]
    protein_per_gram = food_details["Protein per Gram"]

    # Evaluate based on goal
    if goal == "weight loss":
        if calories_per_gram > 2:
            return f"{food_name} is calorie-dense ({calories_per_gram:.2f} kcal/g), so it may not align with your weight loss goal."
        else:
            return f"{food_name} is low in calories ({calories_per_gram:.2f} kcal/g) and fits your weight loss goal!"
    elif goal == "muscle gain":
        if protein_per_gram > 0.2:
            return f"{food_name} is rich in protein ({protein_per_gram:.2f} g/g), making it great for muscle gain."
        else:
            return f"{food_name} has low protein content ({protein_per_gram:.2f} g/g), so it might not be ideal for muscle gain."
    else:
        return "Invalid goal in profile. Please set a valid goal."


In [None]:
# Search for food details by name
def get_food_details(food_name, df):
    # Find the food item in the dataset
    match = df[df["Description"].str.contains(food_name, case=False, na=False)]
    if match.empty:
        return f"Sorry, I couldn't find any information on '{food_name}'."
    else:
        return match.iloc[0].to_dict()  # Return the first match as a dictionary


In [None]:
# Evaluate food based on user profile
def evaluate_food_personalized(food_name, df):
    if not user_profile:
        return "Please set up your user profile first."

    # Fetch food details
    food_details = get_food_details(food_name, df)
    if isinstance(food_details, str):  # If food not found
        return food_details

    # Extract user-specific goals
    goal = user_profile["goal"]
    daily_calories = user_profile["calories_per_day"]
    daily_protein = user_profile["protein_per_day"]

    # Extract food details
    calories_per_gram = food_details["Calories per Gram"]
    protein_per_gram = food_details["Protein per Gram"]

    # Evaluate based on goal
    if goal == "weight loss":
        if calories_per_gram > 2:
            return f"{food_name} is calorie-dense ({calories_per_gram:.2f} kcal/g), so it may not align with your weight loss goal."
        else:
            return f"{food_name} is low in calories ({calories_per_gram:.2f} kcal/g) and fits your weight loss goal!"
    elif goal == "muscle gain":
        if protein_per_gram > 0.2:
            return f"{food_name} is rich in protein ({protein_per_gram:.2f} g/g), making it great for muscle gain."
        else:
            return f"{food_name} has low protein content ({protein_per_gram:.2f} g/g), so it might not be ideal for muscle gain."
    else:
        return "Invalid goal in profile. Please set a valid goal."


## Testing

In [None]:
# Interactive script
def interactive_test(df):
    print("Welcome! Let's set up your profile.")
    weight = float(input("Enter your weight (kg): "))
    height = float(input("Enter your height (cm): "))
    age = int(input("Enter your age: "))
    gender = input("Enter your gender (male/female): ")
    activity_level = input("Enter your activity level (sedentary/lightly active/moderately active/very active): ")
    goal = input("What is your goal? (weight loss/muscle gain): ").lower()

    # Set up the profile
    set_user_profile(weight, height, age, gender, activity_level, goal)

    print("\nYour profile has been created.")
    print(get_user_needs())

    while True:
        # Ask about food
        food_name = input("\nEnter the name of a food to evaluate (or type 'exit' to quit): ")
        if food_name.lower() == "exit":
            print("Goodbye!")
            break
        evaluation = evaluate_food_personalized(food_name, df)
        print(evaluation)

# Run the interactive test
interactive_test(food_df_with_serving_size)


Welcome! Let's set up your profile.
Enter your weight (kg): 84
Enter your height (cm): 180
Enter your age: 39
Enter your gender (male/female): male
Enter your activity level (sedentary/lightly active/moderately active/very active): moderately active
What is your goal? (weight loss/muscle gain): muscle gain
User profile created!

Your profile has been created.
Your daily calorie goal is 3251.25 kcal, and your protein goal is 168.0 g.

Enter the name of a food to evaluate (or type 'exit' to quit): garlic
garlic has low protein content (0.00 g/g), so it might not be ideal for muscle gain.

Enter the name of a food to evaluate (or type 'exit' to quit): exit
Goodbye!


## Enhance Chatbot Responses

In [None]:
# Enhanced conversational responses
def evaluate_food_personalized(food_name, df):
    if not user_profile:
        return "Please set up your user profile first."

    # Fetch food details
    food_details = get_food_details(food_name, df)
    if isinstance(food_details, str):  # If food not found
        return food_details

    # Extract user-specific goals
    goal = user_profile["goal"]
    calories_per_gram = food_details["Calories per Gram"]
    protein_per_gram = food_details["Protein per Gram"]

    # Friendly responses
    if goal == "weight loss":
        if calories_per_gram > 2:
            return (
                f"{food_name.capitalize()} has {calories_per_gram:.2f} kcal per gram, "
                "which is a bit calorie-dense. It might not be the best option if you're trying to lose weight, "
                "but you can still enjoy it in moderation!"
            )
        else:
            return (
                f"{food_name.capitalize()} is a great choice for weight loss! "
                f"It has just {calories_per_gram:.2f} kcal per gram, making it a low-calorie option."
            )
    elif goal == "muscle gain":
        if protein_per_gram > 0.2:
            return (
                f"{food_name.capitalize()} is rich in protein with {protein_per_gram:.2f} g per gram. "
                "It's a fantastic option to support your muscle gain goals!"
            )
        else:
            return (
                f"{food_name.capitalize()} has {protein_per_gram:.2f} g of protein per gram, "
                "which is relatively low. You might want to choose higher-protein foods."
            )
    else:
        return "Invalid goal in profile. Please set a valid goal (weight loss or muscle gain)."


## Generate Fine-Tuning Data

In [None]:
import json

# Generate fine-tuning data
def generate_fine_tuning_data(df):
    fine_tuning_data = []
    for _, row in df.iterrows():
        food_name = row["Description"]
        calories_per_gram = row["Calories per Gram"]
        protein_per_gram = row["Protein per Gram"]

        # Add Q&A examples
        fine_tuning_data.append({
            "prompt": f"I want to lose weight. Is {food_name} good for me?",
            "completion": (
                f"{food_name} {'is' if calories_per_gram <= 2 else 'is not'} good for weight loss. "
                f"It has {calories_per_gram:.2f} kcal per gram."
            )
        })
        fine_tuning_data.append({
            "prompt": f"I want to gain muscle. Is {food_name} a good choice?",
            "completion": (
                f"{food_name} {'is' if protein_per_gram > 0.2 else 'is not'} a good choice for muscle gain. "
                f"It contains {protein_per_gram:.2f} g of protein per gram."
            )
        })

    return fine_tuning_data

# Generate data
fine_tuning_data = generate_fine_tuning_data(food_df_with_serving_size)

# Save to JSONL file
with open("fine_tuning_data.jsonl", "w") as f:
    for item in fine_tuning_data:
        json.dump(item, f)
        f.write("\n")

print("Fine-tuning data saved as fine_tuning_data.jsonl!")


Fine-tuning data saved as fine_tuning_data.jsonl!


## GPT 2 Part

In [None]:
# pip install transformers datasets accelerate

In [None]:
# Convert your JSONL file to a dataset with input_text and target_text

import pandas as pd

# Load your JSONL fine-tuning data
data = []
with open("fine_tuning_data.jsonl", "r") as f:
    for line in f:
        data.append(json.loads(line))

# Convert to DataFrame for easier handling
df = pd.DataFrame(data)

# Save in Hugging Face-friendly format
df.to_csv("fine_tuning_data.csv", index=False)
print("Dataset saved as fine_tuning_data.csv!")


Dataset saved as fine_tuning_data.csv!


In [None]:
#pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset("csv", data_files="fine_tuning_data.csv")
train_dataset = dataset["train"]

print(train_dataset[0])


Generating train split: 0 examples [00:00, ? examples/s]

{'prompt': 'I want to lose weight. Is Hummus, commercial good for me?', 'completion': 'Hummus, commercial is not good for weight loss. It has 6.76 kcal per gram.'}


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("/content/fine_tuned_distilgpt2")
tokenizer = AutoTokenizer.from_pretrained("/content/fine_tuned_distilgpt2")

# Function to generate responses
def generate_response(prompt):
    # Tokenize input with padding and truncation
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256)

    # Generate response with improved parameters
    outputs = model.generate(
        **inputs,
        max_length=150,
        temperature=0.7,
        do_sample=True,  # Enable sampling
        top_k=50,  # Limit to top 50 words
        top_p=0.9,  # Nucleus sampling
        repetition_penalty=2.0,  # Penalize repetition
        pad_token_id=tokenizer.pad_token_id  # Proper handling of padding
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the model
prompt = "I want to lose weight. Is hummus good for me?"
response = generate_response(prompt)
print(response)



I want to lose weight. Is hummus good for me? No, but I'm going back on that diet because it's a great way to get the most out of my body and not have any issues."
"It works," said Pauline in response: "But don't try trying hard at all! You know what you're doing is just getting rid (of) fat cells; they are making up your muscle tissue!"


### Results not so good.

### Let's add structure to my prompts for better context.

## Prompt 2

### Added better context + clean response

In [None]:
def generate_response2(food, goal, question):

    # Create structured prompt
    prompt = f"Food: {food}\nGoal: {goal}\nQuestion: {question}"

    # Tokenize input with padding and truncation
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256)

    # Generate response with improved parameters
    outputs = model.generate(
        **inputs,
        max_length=150,
        temperature=0.7,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=2.0,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode the output
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


In [None]:
# Test the function
response = generate_response2(
    food="Hummus",
    goal="Weight Loss",
    question="Is hummus good for me?"
)
print(response)


Food: Hummus
Goal: Weight Loss
Question: Is hummus good for me?

.I've had it and I'm happy with the results, but still think that if you're not eating a lot of food then your body won't be able to use up all its energy because there's just too much starch in this meal (which is why most people are using beans). If we eat enough grains or pasta every day instead so that our bodies can get used after having eaten whole foods before getting tired from them my weight will go down as well! It'll also help us lose fat faster than any other type diet . Also on top...that being said , don´t expect some kind mass gain during exercise - especially when compared


In [None]:
def clean_response(response):
    # Split response into sentences
    sentences = response.split(".")

    # Return the first two sentences for brevity
    cleaned_response = ". ".join(sentences[:2]).strip()
    return cleaned_response

# Test the clean-up
response = generate_response2(
    food="Hummus",
    goal="Weight Loss",
    question="Is hummus good for me?"
)
cleaned_response = clean_response(response)
print(cleaned_response)


Food: Hummus
Goal: Weight Loss
Question: Is hummus good for me? Answer: Well, I'm a big fan of the stuff it provides.  But when you're not trying to lose weight or make changes in your diet that would be great news! If so then this is an easy recipe with no need on my part (except maybe one) and can help keep people from getting sick over time if they don't want their health restored before long!!


### Testing Prompt 2

In [None]:
# Example 1
print(generate_response2(
    food="Bacon",
    goal="Weight Loss",
    question="Is bacon good for my diet?"
))

# Example 2
print(generate_response2(
    food="Eggs",
    goal="Muscle Gain",
    question="How much protein do eggs have?"
))

# Example 3
print(generate_response2(
    food="Chicken",
    goal="Weight Loss",
    question="Is chicken low in calories?"
))


Food: Bacon
Goal: Weight Loss
Question: Is bacon good for my diet? Answer to question 1 of 3. (Click here to download)


Food: Eggs
Goal: Muscle Gain
Question: How much protein do eggs have? Can they help you lose weight and maintain your healthy body mass, or is it something that we should avoid doing at all costs. (Answer) We can't give up on our natural eating habits if there's not enough evidence to support those claims! In fact this has been shown in some studies where women were given a diet consisting of 8% carbohydrate instead thereof which caused them considerable pain for several weeks after the experiment was started due simply because their bodies didn´t metabolize carbs properly by themselves - although I think even more research needs being done regarding how fat intake affects muscle growth... Also please consider what kind "caffeine" foods are available from reputable
Food: Chicken
Goal: Weight Loss
Question: Is chicken low in calories? Answer : Yes. However, the high a

Analysis of Issues

Verbose and Unstructured Output: the model generates overly long and irrelevant sentences because it lacks focus. The prompt structure is not effectively guiding the model's behavior.

Lack of Domain-Specific Knowledge:the model relies too heavily on pre-trained general knowledge rather than fine-tuned task-specific examples.

Unclear Training Signal: the fine-tuning data may not have enough diversity or explicit examples to enforce concise, factual, and relevant outputs.

# Prompt 3

### Added explicit instructions

In [None]:
def generate_response3(food, goal, question):
    # Create structured and constrained prompt
    prompt = (
        "Here is an example:\n"
        "Food: Bacon\n"
        "Goal: Weight Loss\n"
        "Question: Is bacon good for weight loss?\n"
        "Answer: No, bacon is high in calories and fat, making it a poor choice for weight loss.\n\n"
        "Now, answer the following:\n"
        f"Food: {food}\n"
        f"Goal: {goal}\n"
        f"Question: Is {food} good for {goal}?\n"
        "Answer:"
    )

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256)

    # Generate response with constraints
    outputs = model.generate(
        **inputs,
        max_length=150,
        temperature=0.7,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=2.5,
        pad_token_id=tokenizer.pad_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
def clean_response2(response):
    # Extract sentences and look for a clear answer
    sentences = response.split(". ")

    # Return only the first sentence containing the key context
    for sentence in sentences:
        if "calorie" in sentence.lower() or "protein" in sentence.lower() or "fat" in sentence.lower():
            return sentence.strip()

    # Fallback: Return the first sentence if no key context is found
    return sentences[0].strip()


In [None]:
# Test the function with the new structure
response = generate_response3(
    food="Hummus",
    goal="Weight Loss",
    question="Is hummus good for me?"
)
cleaned_response = clean_response2(response)
print("Cleaned Response:", cleaned_response)


Cleaned Response: Here is an example:
Food: Bacon
Goal: Weight Loss
Question: Is bacon good for weight loss?
Answer: No, bacon is high in calories and fat, making it a poor choice for weight loss.

Now, answer the following:
Food: Hummus
Goal: Weight Loss
Question: Is Hummus good for Weight Loss?
Answer: Yes – I don't think so! It's not even close to that level of calorie restriction you're talking about here…it really isn`t like eating too much food at once or anything!! All we have left are two options (no more than 2 meals per day) with one being very low-carb while another could be either lean meatless veggies


## Prompt 4

### Added facts

In [None]:
def generate_response_with_facts(food, facts, goal):
    # Add structured prompt with facts
    prompt = (
        f"Food: {food}\n"
        f"Facts: {facts}\n"
        f"Goal: {goal}\n"
        f"Question: Is {food} good for {goal}?\n"
        "Answer:"
    )


    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256)

    # Generate response
    outputs = model.generate(
        **inputs,
        max_length=100,
        temperature=0.7,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=2.5,
        pad_token_id=tokenizer.pad_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with factual data
facts = "Hummus contains 166 calories and 7.9g of protein per serving."
response = generate_response_with_facts(food="Hummus", facts=facts, goal="Weight Loss")
print("Response:", response)


Response: Food: Hummus
Facts: Hummus contains 166 calories and 7.9g of protein per serving.
Goal: Weight Loss
Question: Is Hummus good for Weight Loss?
Answer: It does not cause any weight loss, but it may lead to a temporary decrease in your body's ability "to digest fats" or produce energy that is lost when you lose fat (in other words gain muscle mass). In addition there are various types available which will help reduce the amount spent


In [None]:
def clean_response_contextual(response):
    # Split response into sentences
    sentences = response.split(". ")

    # Look for key context in sentences
    for sentence in sentences:
        if any(keyword in sentence.lower() for keyword in ["yes", "no", "calorie", "protein", "fat"]):
            return sentence.strip()

    # Default: Return the first sentence
    return sentences[0].strip()


In [None]:
facts = "Hummus contains 166 calories and 7.9g of protein per serving."
response = generate_response_with_facts(food="Hummus", facts=facts, goal="Weight Loss")
cleaned_response = clean_response_contextual(response)
print("Cleaned Response:", cleaned_response)


Cleaned Response: Food: Hummus
Facts: Hummus contains 166 calories and 7.9g of protein per serving.
Goal: Weight Loss
Question: Is Hummus good for Weight Loss?
Answer: It is an excellent source, particularly if you're looking to lose weight in a healthy way or when working out hard (no one knows how it works)


### Facts really helped the answers.

In [None]:
import json

# Load dataset
with open("foundationDownload.json", "r") as f:
    data = json.load(f)

# Extract the list of foods
foods = data["FoundationFoods"]

# Function to extract key nutrients
def process_food_data(foods):
    processed_data = []
    for food in foods:
        description = food.get("description", "Unknown")
        nutrients = food.get("foodNutrients", [])

        # Extract nutrient amounts
        nutrient_dict = {n["nutrient"]["name"]: n["amount"] for n in nutrients if "amount" in n}

        # Extract relevant nutrients
        important_nutrients = {
            "Description": description,
            "Calories": nutrient_dict.get("Energy", 0),
            "Protein": nutrient_dict.get("Protein", 0),
            "Fat": nutrient_dict.get("Total lipid (fat)", 0),
        }
        processed_data.append(important_nutrients)

    return processed_data

# Process food data
processed_foods = process_food_data(foods)




In [None]:
processed_foods[0]

{'Description': 'Hummus, commercial',
 'Calories': 229,
 'Protein': 7.35,
 'Fat': 17.1}

In [None]:
def generate_prompts(processed_foods):
    prompts = []
    for food in processed_foods:
        description = food["Description"]
        calories = food["Calories"]
        protein = food["Protein"]
        fat = food["Fat"]

        # Generate prompt for weight loss
        weight_loss_prompt = {
            "prompt": f"Food: {description}\nFacts: {calories} calories, {protein}g protein, {fat}g fat per serving.\nGoal: Weight Loss\nQuestion: Is {description} good for weight loss?\nAnswer:",
            "completion": f" Yes, {description} is low in calories and high in protein, making it a good choice for weight loss when eaten in moderation."
        }

        # Generate prompt for muscle gain
        muscle_gain_prompt = {
            "prompt": f"Food: {description}\nFacts: {calories} calories, {protein}g protein, {fat}g fat per serving.\nGoal: Muscle Gain\nQuestion: Is {description} good for muscle gain?\nAnswer:",
            "completion": f" Yes, {description} is high in protein, making it an excellent choice for muscle gain."
        }

        prompts.extend([weight_loss_prompt, muscle_gain_prompt])

    return prompts

# Generate prompts
prompts = generate_prompts(processed_foods)

# Save to JSONL file
with open("fine_tuning_prompts.jsonl", "w") as f:
    for item in prompts:
        json.dump(item, f)
        f.write("\n")

print("Prompts saved to fine_tuning_prompts.jsonl")


Prompts saved to fine_tuning_prompts.jsonl


In [None]:
from datasets import load_dataset

# Load the JSONL dataset
dataset = load_dataset("json", data_files={"train": "fine_tuning_prompts.jsonl"})

# Split into train and validation sets (90% train, 10% validation)
dataset = dataset["train"].train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

print(train_dataset[0])  # Check the structure


FileNotFoundError: Unable to find '/content/fine_tuning_prompts.jsonl'

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load pre-trained model and tokenizer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add padding token if not present
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))


Embedding(50258, 768)

In [None]:
def tokenize_function(examples):
    # Combine prompt and completion with a separator
    full_text = [f"{prompt} [SEP] {completion}" for prompt, completion in zip(examples["prompt"], examples["completion"])]
    return tokenizer(
        full_text,
        truncation=True,  # Truncate sequences longer than max_length
        max_length=512,  # Ensure consistent token lengths
        padding="max_length"  # Add padding for consistent lengths
    )



In [None]:
# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["prompt", "completion"])
eval_dataset = eval_dataset.remove_columns(["prompt", "completion"])

# Set format for PyTorch
train_dataset.set_format("torch")
eval_dataset.set_format("torch")

NameError: name 'train_dataset' is not defined