## NLP PROJECT #2

### Student: Jefferson Roesler

# Dataset

In [1]:
import json
import pandas as pd

# Load the JSON file
with open('foundationDownload.json', 'r') as file:
    data = json.load(file)

In [2]:
# Extract the list of foods
foods = data["FoundationFoods"]

# Flatten the data and include serving size
def process_food_data_with_serving_size(foods):
    processed_data = []
    for food in foods:
        # Extract food description
        description = food.get("description", "Unknown")

        # Extract serving size (from 'foodPortions')
        food_portions = food.get("foodPortions", [])
        if food_portions:
            # Assume the first portion is the standard serving size
            serving_size = food_portions[0].get("gramWeight", 0)  # Weight in grams
        else:
            serving_size = 0  # Default if no portion info available

        # Extract nutrients
        nutrients = food.get("foodNutrients", [])
        nutrient_dict = {n["nutrient"]["name"]: n["amount"] for n in nutrients if "amount" in n}

        # Keep only key nutrients and serving size
        important_nutrients = {
            "Description": description,
            "Serving Size (g)": serving_size,
            "Calories": nutrient_dict.get("Energy", 0),
            "Protein": nutrient_dict.get("Protein", 0),
            "Carbohydrates": nutrient_dict.get("Carbohydrate, by difference", 0),
            "Fat": nutrient_dict.get("Total lipid (fat)", 0)
        }
        processed_data.append(important_nutrients)

    return processed_data

# Process the food data with serving size
processed_foods_with_serving_size = process_food_data_with_serving_size(foods)

# Convert to a Pandas DataFrame for easier handling
food_df_with_serving_size = pd.DataFrame(processed_foods_with_serving_size)

# Display the first few rows
print(food_df_with_serving_size.head())


                                         Description  Serving Size (g)  \
0                                 Hummus, commercial              33.9   
1                               Tomatoes, grape, raw              49.7   
2  Beans, snap, green, canned, regular pack, drai...             129.0   
3                        Frankfurter, beef, unheated              48.6   
4        Nuts, almonds, dry roasted, with salt added             135.0   

   Calories  Protein  Carbohydrates    Fat  
0     229.0     7.35          14.90  17.10  
1     113.0     0.83           5.51   0.63  
2      86.0     1.04           4.11   0.39  
3    1310.0    11.70           2.89  28.00  
4    2590.0    20.40          16.20  57.80  


## Add columns for calories/protein per gram.


In [3]:
# # Add new columns for calories and protein per gram
# food_df_with_serving_size["Calories per Gram"] = food_df_with_serving_size["Calories"] / food_df_with_serving_size["Serving Size (g)"]
# food_df_with_serving_size["Protein per Gram"] = food_df_with_serving_size["Protein"] / food_df_with_serving_size["Serving Size (g)"]

# # Replace infinite or NaN values (e.g., where serving size is 0)
# food_df_with_serving_size.replace([float('inf'), float('-inf')], 0, inplace=True)
# food_df_with_serving_size.fillna(0, inplace=True)

# # Display the updated DataFrame
# print(food_df_with_serving_size.head())


                                         Description  Serving Size (g)  \
0                                 Hummus, commercial              33.9   
1                               Tomatoes, grape, raw              49.7   
2  Beans, snap, green, canned, regular pack, drai...             129.0   
3                        Frankfurter, beef, unheated              48.6   
4        Nuts, almonds, dry roasted, with salt added             135.0   

   Calories  Protein  Carbohydrates    Fat  Calories per Gram  \
0     229.0     7.35          14.90  17.10           6.755162   
1     113.0     0.83           5.51   0.63           2.273642   
2      86.0     1.04           4.11   0.39           0.666667   
3    1310.0    11.70           2.89  28.00          26.954733   
4    2590.0    20.40          16.20  57.80          19.185185   

   Protein per Gram  
0          0.216814  
1          0.016700  
2          0.008062  
3          0.240741  
4          0.151111  


## New Function to add Protein and Calories columns

In [3]:
# Replace zero Serving Size (g) with NaN for proper handling
food_df_with_serving_size["Serving Size (g)"] = food_df_with_serving_size["Serving Size (g)"].replace(0, pd.NA)

# Replace NaN in Serving Size (g) with median or group-based imputed value if not already done
# Example: Global median
median_serving_size = food_df_with_serving_size["Serving Size (g)"].median()
food_df_with_serving_size["Serving Size (g)"].fillna(median_serving_size, inplace=True)

# Recalculate Calories per Gram
food_df_with_serving_size["Calories per Gram"] = food_df_with_serving_size["Calories"] / food_df_with_serving_size["Serving Size (g)"]

# Recalculate Protein per Gram
food_df_with_serving_size["Protein per Gram"] = food_df_with_serving_size["Protein"] / food_df_with_serving_size["Serving Size (g)"]

# Replace infinite or NaN values in the derived columns
food_df_with_serving_size.replace([float('inf'), float('-inf')], 0, inplace=True)
food_df_with_serving_size.fillna(0, inplace=True)

# Display the updated DataFrame
print("Updated DataFrame with recalculated values:")
print(food_df_with_serving_size[["Description", "Serving Size (g)", "Calories per Gram", "Protein per Gram"]].head())


Updated DataFrame with recalculated values:
                                         Description  Serving Size (g)  \
0                                 Hummus, commercial              33.9   
1                               Tomatoes, grape, raw              49.7   
2  Beans, snap, green, canned, regular pack, drai...             129.0   
3                        Frankfurter, beef, unheated              48.6   
4        Nuts, almonds, dry roasted, with salt added             135.0   

   Calories per Gram  Protein per Gram  
0           6.755162          0.216814  
1           2.273642          0.016700  
2           0.666667          0.008062  
3          26.954733          0.240741  
4          19.185185          0.151111  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  food_df_with_serving_size["Serving Size (g)"].fillna(median_serving_size, inplace=True)
  food_df_with_serving_size["Serving Size (g)"].fillna(median_serving_size, inplace=True)


# Preprocessing

## Handling Food Description

Description is too long and has useless info. I want to help the chatbot find later which food the user is talking about.

In [4]:
# Function to simplify and remove commas in food descriptions
def simplify_description(description):
    # Split by commas and join the first two parts without commas
    parts = description.split(",")
    return " ".join(parts[:2]).strip() if len(parts) > 1 else description.strip()

# Apply the simplification function to the Description column
food_df_with_serving_size["Description"] = food_df_with_serving_size["Description"].apply(simplify_description)

# Display the updated DataFrame
print(food_df_with_serving_size.head())




          Description  Serving Size (g)  Calories  Protein  Carbohydrates  \
0  Hummus  commercial              33.9     229.0     7.35          14.90   
1     Tomatoes  grape              49.7     113.0     0.83           5.51   
2         Beans  snap             129.0      86.0     1.04           4.11   
3   Frankfurter  beef              48.6    1310.0    11.70           2.89   
4       Nuts  almonds             135.0    2590.0    20.40          16.20   

     Fat  Calories per Gram  Protein per Gram  
0  17.10           6.755162          0.216814  
1   0.63           2.273642          0.016700  
2   0.39           0.666667          0.008062  
3  28.00          26.954733          0.240741  
4  57.80          19.185185          0.151111  


In [5]:
# Display all columns in the DataFrame
print("Columns in the dataset:")
print(food_df_with_serving_size.columns)

# Optionally, display the first few rows to inspect the data
print("\nSample data:")
print(food_df_with_serving_size.head())


Columns in the dataset:
Index(['Description', 'Serving Size (g)', 'Calories', 'Protein',
       'Carbohydrates', 'Fat', 'Calories per Gram', 'Protein per Gram'],
      dtype='object')

Sample data:
          Description  Serving Size (g)  Calories  Protein  Carbohydrates  \
0  Hummus  commercial              33.9     229.0     7.35          14.90   
1     Tomatoes  grape              49.7     113.0     0.83           5.51   
2         Beans  snap             129.0      86.0     1.04           4.11   
3   Frankfurter  beef              48.6    1310.0    11.70           2.89   
4       Nuts  almonds             135.0    2590.0    20.40          16.20   

     Fat  Calories per Gram  Protein per Gram  
0  17.10           6.755162          0.216814  
1   0.63           2.273642          0.016700  
2   0.39           0.666667          0.008062  
3  28.00          26.954733          0.240741  
4  57.80          19.185185          0.151111  


## Missing Values

In [7]:
# Check for missing values
missing_values = food_df_with_serving_size.isnull().sum()

# Calculate the percentage of missing values
missing_percentage = (missing_values / len(food_df_with_serving_size)) * 100

# Combine into a DataFrame for better readability
missing_summary = pd.DataFrame({
    "Column": food_df_with_serving_size.columns,
    "Missing Values": missing_values,
    "Percentage (%)": missing_percentage
}).sort_values(by="Percentage (%)", ascending=False)

# Display the missing value summary
print(missing_summary)


                              Column  Missing Values  Percentage (%)
Description              Description               0             0.0
Serving Size (g)    Serving Size (g)               0             0.0
Calories                    Calories               0             0.0
Protein                      Protein               0             0.0
Carbohydrates          Carbohydrates               0             0.0
Fat                              Fat               0             0.0
Calories per Gram  Calories per Gram               0             0.0
Protein per Gram    Protein per Gram               0             0.0


## Zero Values and Outliers

In [10]:
# Count rows with Serving Size (g) equal to zero
zero_serving_size_count = (food_df_with_serving_size["Serving Size (g)"] == 0).sum()
print(f"Number of foods with zero serving size: {zero_serving_size_count}")


Number of foods with zero serving size: 0


In [8]:
# Check for zero values in numeric columns
zero_values_summary = (food_df_with_serving_size == 0).sum()

# Calculate the percentage of zero values
zero_values_percentage = (zero_values_summary / len(food_df_with_serving_size)) * 100

# Combine into a DataFrame for readability
zero_summary = pd.DataFrame({
    "Column": food_df_with_serving_size.columns,
    "Zero Values": zero_values_summary,
    "Percentage (%)": zero_values_percentage
}).sort_values(by="Percentage (%)", ascending=False)

# Display the zero value summary
print("Zero Value Summary:")
print(zero_summary)



Zero Value Summary:
                              Column  Zero Values  Percentage (%)
Calories                    Calories          219       69.303797
Calories per Gram  Calories per Gram          219       69.303797
Carbohydrates          Carbohydrates           59       18.670886
Protein                      Protein           14        4.430380
Protein per Gram    Protein per Gram           14        4.430380
Fat                              Fat           10        3.164557
Description              Description            0        0.000000
Serving Size (g)    Serving Size (g)            0        0.000000


## Handling Calories Column = zero

In [11]:
# Filter rows where Calories is equal to zero
zero_calories_rows = food_df_with_serving_size[food_df_with_serving_size["Calories"] == 0]

# Display the rows with zero Calories
print("Rows with zero Calories:")
print(zero_calories_rows)


Rows with zero Calories:
              Description  Serving Size (g)  Calories  Protein  Carbohydrates  \
61            Salt  table               6.1       0.0     0.00            0.0   
74             Beans  Dry              97.3       0.0    25.50            0.0   
75             Beans  Dry              97.3       0.0    21.30            0.0   
76             Beans  Dry              97.3       0.0    23.30            0.0   
77             Beans  Dry              97.3       0.0    25.60            0.0   
..                    ...               ...       ...      ...            ...   
311   Sorghum bran  white              97.3       0.0    11.20           68.7   
312  Sorghum flour  white              97.3       0.0    10.20           73.5   
313  Sorghum grain  white              97.3       0.0    10.20           74.9   
314  Sorghum  whole grain              97.3       0.0    10.10           73.6   
315   Plantains  overripe              97.3       0.0     1.17           29.2   

  

In [12]:
# Display the first 10 rows with zero Calories
print(zero_calories_rows.head(10))

# Optionally, export the filtered rows to a CSV for analysis
zero_calories_rows.to_csv("zero_calories_rows.csv", index=False)


    Description  Serving Size (g)  Calories  Protein  Carbohydrates   Fat  \
61  Salt  table               6.1       0.0      0.0            0.0  0.00   
74   Beans  Dry              97.3       0.0     25.5            0.0  1.04   
75   Beans  Dry              97.3       0.0     21.3            0.0  1.16   
76   Beans  Dry              97.3       0.0     23.3            0.0  0.86   
77   Beans  Dry              97.3       0.0     25.6            0.0  1.12   
78   Beans  Dry              97.3       0.0     26.8            0.0  1.14   
79   Beans  Dry              97.3       0.0     24.6            0.0  1.28   
80   Beans  Dry              97.3       0.0     25.2            0.0  1.44   
81   Beans  Dry              97.3       0.0     24.4            0.0  1.23   
82   Beans  Dry              97.3       0.0     25.0            0.0  1.03   

    Calories per Gram  Protein per Gram  
61                0.0          0.000000  
74                0.0          0.262076  
75                0.0     

1. Validate Zero Calories:
Foods like salt are valid with zero calories, so these can be excluded from further processing.
For the remaining rows, we can estimate calories based on macronutrient values using the standard formula:

Calories
=
4
×

Protein (g)
+
4
×

Carbohydrates (g)
+
9
×
Fat (g)

Calories=4×Protein (g)+4×Carbohydrates (g)+9×Fat (g)

2. Fill Missing Calories:
Replace zero calorie values with the calculated estimates.


In [13]:
# Identify rows where Calories is zero but Protein, Carbohydrates, or Fat are non-zero
non_salt_rows = food_df_with_serving_size[
    (food_df_with_serving_size["Calories"] == 0) &
    ((food_df_with_serving_size["Protein"] > 0) |
     (food_df_with_serving_size["Carbohydrates"] > 0) |
     (food_df_with_serving_size["Fat"] > 0))
]

# Calculate calories using the macronutrient formula
food_df_with_serving_size.loc[non_salt_rows.index, "Calories"] = (
    4 * food_df_with_serving_size.loc[non_salt_rows.index, "Protein"] +
    4 * food_df_with_serving_size.loc[non_salt_rows.index, "Carbohydrates"] +
    9 * food_df_with_serving_size.loc[non_salt_rows.index, "Fat"]
)

# Recalculate Calories per Gram
food_df_with_serving_size["Calories per Gram"] = (
    food_df_with_serving_size["Calories"] / food_df_with_serving_size["Serving Size (g)"]
)

# Replace infinite or NaN values
food_df_with_serving_size.replace([float('inf'), float('-inf')], 0, inplace=True)
food_df_with_serving_size.fillna(0, inplace=True)

# Display the updated DataFrame
print("Updated rows with previously zero Calories:")
print(food_df_with_serving_size.loc[non_salt_rows.index])


Updated rows with previously zero Calories:
              Description  Serving Size (g)  Calories  Protein  Carbohydrates  \
74             Beans  Dry              97.3    111.36    25.50            0.0   
75             Beans  Dry              97.3     95.64    21.30            0.0   
76             Beans  Dry              97.3    100.94    23.30            0.0   
77             Beans  Dry              97.3    112.48    25.60            0.0   
78             Beans  Dry              97.3    117.46    26.80            0.0   
..                    ...               ...       ...      ...            ...   
311   Sorghum bran  white              97.3    402.94    11.20           68.7   
312  Sorghum flour  white              97.3    363.96    10.20           73.5   
313  Sorghum grain  white              97.3    369.74    10.20           74.9   
314  Sorghum  whole grain              97.3    372.78    10.10           73.6   
315   Plantains  overripe              97.3    130.39     1.17   

In [14]:
# Filter rows where Protein and Fat are both zero
zero_protein_fat = food_df_with_serving_size[
    (food_df_with_serving_size["Protein"] == 0) &
    (food_df_with_serving_size["Fat"] == 0)
]

# Display these rows for review
print("Rows with zero Protein and zero Fat:")
print(zero_protein_fat)


Rows with zero Protein and zero Fat:
        Description  Serving Size (g)  Calories  Protein  Carbohydrates  Fat  \
61      Salt  table               6.1       0.0      0.0            0.0  0.0   
95      Oil  canola              90.9       0.0      0.0            0.0  0.0   
96        Oil  corn              91.3       0.0      0.0            0.0  0.0   
97     Oil  soybean              91.3       0.0      0.0            0.0  0.0   
98       Oil  olive              90.7       0.0      0.0            0.0  0.0   
126     Oil  peanut              97.3       0.0      0.0            0.0  0.0   
127  Oil  sunflower              97.3       0.0      0.0            0.0  0.0   
128  Oil  safflower              97.3       0.0      0.0            0.0  0.0   
129      Oil  olive              97.3       0.0      0.0            0.0  0.0   

     Calories per Gram  Protein per Gram  
61                 0.0               0.0  
95                 0.0               0.0  
96                 0.0           

## Drop Rows with All Zero Values

In [15]:
# Drop rows where Protein, Fat, and Calories are all zero
cleaned_df = food_df_with_serving_size[
    ~((food_df_with_serving_size["Protein"] == 0) &
      (food_df_with_serving_size["Fat"] == 0) &
      (food_df_with_serving_size["Calories"] == 0))
]

# Display the number of rows after cleaning
print(f"Number of rows after dropping invalid rows: {len(cleaned_df)}")
print(cleaned_df.head())


Number of rows after dropping invalid rows: 307
          Description  Serving Size (g)  Calories  Protein  Carbohydrates  \
0  Hummus  commercial              33.9     229.0     7.35          14.90   
1     Tomatoes  grape              49.7     113.0     0.83           5.51   
2         Beans  snap             129.0      86.0     1.04           4.11   
3   Frankfurter  beef              48.6    1310.0    11.70           2.89   
4       Nuts  almonds             135.0    2590.0    20.40          16.20   

     Fat  Calories per Gram  Protein per Gram  
0  17.10           6.755162          0.216814  
1   0.63           2.273642          0.016700  
2   0.39           0.666667          0.008062  
3  28.00          26.954733          0.240741  
4  57.80          19.185185          0.151111  


In [16]:
print(cleaned_df[
    (cleaned_df["Protein"] == 0) &
    (cleaned_df["Fat"] == 0) &
    (cleaned_df["Calories"] == 0)
])


Empty DataFrame
Columns: [Description, Serving Size (g), Calories, Protein, Carbohydrates, Fat, Calories per Gram, Protein per Gram]
Index: []


In [17]:
print(f"Rows before cleaning: {len(food_df_with_serving_size)}")
print(f"Rows after cleaning: {len(cleaned_df)}")


Rows before cleaning: 316
Rows after cleaning: 307


# Dividing foods in High-Protein, Low-Carb, High-Fat and Balanced.

In [18]:
# Define thresholds for macronutrient-based categorization
def macronutrient_category(row):
    if row["Protein"] > 15:
        return "High-Protein"
    elif row["Carbohydrates"] < 5:
        return "Low-Carb"
    elif row["Fat"] > 10:
        return "High-Fat"
    else:
        return "Balanced"

# Apply the function to assign macronutrient profiles
food_df_with_serving_size["Macronutrient Profile"] = food_df_with_serving_size.apply(macronutrient_category, axis=1)

# Display a sample of foods with macronutrient profiles
print(food_df_with_serving_size[["Description", "Macronutrient Profile"]].head())


          Description Macronutrient Profile
0  Hummus  commercial              High-Fat
1     Tomatoes  grape              Balanced
2         Beans  snap              Low-Carb
3   Frankfurter  beef              Low-Carb
4       Nuts  almonds          High-Protein


# Simple ChatBot to test

In [19]:
# Function to simulate chatbot responses
def chatbot_response(query):
    query = query.lower()

    if "high-protein" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Macronutrient Profile"] == "High-Protein"
        ]["Description"].tolist()
        return f"Here are some high-protein foods: {', '.join(result[:10])}..."

    elif "low-carb" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Macronutrient Profile"] == "Low-Carb"
        ]["Description"].tolist()
        return f"Here are some low-carb foods: {', '.join(result[:10])}..."

    elif "balanced" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Macronutrient Profile"] == "Balanced"
        ]["Description"].tolist()
        return f"Here are some foods with balanced macronutrients: {', '.join(result[:10])}..."

    elif "high-protein and low-carb" in query:
        result = food_df_with_serving_size[
            (food_df_with_serving_size["Macronutrient Profile"] == "High-Protein") &
            (food_df_with_serving_size["Carbohydrates"] < 5)
        ]["Description"].tolist()
        return f"Here are some high-protein and low-carb foods: {', '.join(result[:10])}..."

    else:
        return "I'm sorry, I didn't understand that. Can you ask about high-protein, low-carb, or balanced foods?"




In [20]:
# Test example queries
print(chatbot_response("What are some high-protein foods?"))
print(chatbot_response("List low-carb options."))
print(chatbot_response("What foods are balanced in macronutrients?"))
print(chatbot_response("Can you suggest high-protein and low-carb foods?"))

Here are some high-protein foods: Nuts  almonds, Egg  white, Cheese  parmesan, Cheese  pasteurized process, Seeds  sunflower seed kernels, Cheese  cheddar, Cheese  mozzarella, Egg  whole, Egg  yolk, Egg  yolk...
Here are some low-carb foods: Beans  snap, Frankfurter  beef, Kale  raw, Egg  whole, Egg  white, Pickles  cucumber, Cheese  cottage, Yogurt  Greek, Oil  coconut, Olives  green...
Here are some foods with balanced macronutrients: Tomatoes  grape, Grapefruit juice  white, Peaches  yellow, Bread  white, Kale  frozen, Mustard  prepared, Kiwifruit  green, Nectarines  raw, Yogurt  Greek, Sauce  pasta...
Here are some high-protein foods: Nuts  almonds, Egg  white, Cheese  parmesan, Cheese  pasteurized process, Seeds  sunflower seed kernels, Cheese  cheddar, Cheese  mozzarella, Egg  whole, Egg  yolk, Egg  yolk...


In [23]:
# More examples

print(chatbot_response("Tell me which high-protein food I can eat?"))
print(chatbot_response("List low-carb vegetables."))

Here are some high-protein foods: Nuts  almonds, Egg  white, Cheese  parmesan, Cheese  pasteurized process, Seeds  sunflower seed kernels, Cheese  cheddar, Cheese  mozzarella, Egg  whole, Egg  yolk, Egg  yolk...
Here are some low-carb foods: Beans  snap, Frankfurter  beef, Kale  raw, Egg  whole, Egg  white, Pickles  cucumber, Cheese  cottage, Yogurt  Greek, Oil  coconut, Olives  green...


# Increasing Base Knownledge


In [28]:
# Function to categorize foods based on calorie content
def caloric_density(row):
    if row["Calories per Gram"] > 2:
        return "High-Calorie"
    elif row["Calories per Gram"] > 1:
        return "Moderate-Calorie"
    else:
        return "Low-Calorie"

food_df_with_serving_size["Caloric Density"] = food_df_with_serving_size.apply(caloric_density, axis=1)

# Display a sample of the updated dataset
print("Sample of foods with Calorie Category:")
print(food_df_with_serving_size[["Description", "Calories", "Caloric Density"]].head())


Sample of foods with Calorie Category:
          Description  Calories Caloric Density
0  Hummus  commercial     229.0    High-Calorie
1     Tomatoes  grape     113.0    High-Calorie
2         Beans  snap      86.0     Low-Calorie
3   Frankfurter  beef    1310.0    High-Calorie
4       Nuts  almonds    2590.0    High-Calorie


In [29]:
# Function to categorize foods based on protein content
def protein_category(row):
    if row["Protein"] > 25:
        return "Very High-Protein"
    elif row["Protein"] > 15:
        return "High-Protein"
    elif row["Protein"] > 5:
        return "Moderate-Protein"
    else:
        return "Low-Protein"

# Apply the categorization function to the dataset
food_df_with_serving_size["Protein Category"] = food_df_with_serving_size.apply(protein_category, axis=1)

# Display a sample of the updated dataset
print("Sample of foods with Protein Category:")
print(food_df_with_serving_size[["Description", "Protein", "Protein Category"]].head())


Sample of foods with Protein Category:
          Description  Protein  Protein Category
0  Hummus  commercial     7.35  Moderate-Protein
1     Tomatoes  grape     0.83       Low-Protein
2         Beans  snap     1.04       Low-Protein
3   Frankfurter  beef    11.70  Moderate-Protein
4       Nuts  almonds    20.40      High-Protein


# Updated Chatbot

In [32]:
# Function to handle chatbot responses for macronutrient, protein, and caloric density queries
def chatbot_response(query):
    query = query.lower()

    # Macronutrient Profile Queries
    if "high-protein and low-carb" in query:
        result = food_df_with_serving_size[
            (food_df_with_serving_size["Macronutrient Profile"] == "High-Protein") &
            (food_df_with_serving_size["Carbohydrates"] < 5)
        ]["Description"].tolist()
        return f"Here are some high-protein and low-carb foods: {', '.join(result[:10])}..."

    elif "high-protein" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Macronutrient Profile"] == "High-Protein"
        ]["Description"].tolist()
        return f"Here are some high-protein foods: {', '.join(result[:10])}..."

    elif "low-carb" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Macronutrient Profile"] == "Low-Carb"
        ]["Description"].tolist()
        return f"Here are some low-carb foods: {', '.join(result[:10])}..."

    elif "balanced" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Macronutrient Profile"] == "Balanced"
        ]["Description"].tolist()
        return f"Here are some foods with balanced macronutrients: {', '.join(result[:10])}..."

    # Protein Category Queries
    elif "very high-protein" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Protein Category"] == "Very High-Protein"
        ]["Description"].tolist()
        return f"Here are some very high-protein foods: {', '.join(result[:10])}..."

    elif "high-protein" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Protein Category"] == "High-Protein"
        ]["Description"].tolist()
        return f"Here are some high-protein foods: {', '.join(result[:10])}..."

    elif "moderate-protein" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Protein Category"] == "Moderate-Protein"
        ]["Description"].tolist()
        return f"Here are some moderate-protein foods: {', '.join(result[:10])}..."

    elif "low-protein" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Protein Category"] == "Low-Protein"
        ]["Description"].tolist()
        return f"Here are some low-protein foods: {', '.join(result[:10])}..."

    # Caloric Density Queries
    elif "high-calorie" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Caloric Density"] == "High-Calorie"
        ]["Description"].tolist()
        return f"Here are some high-calorie foods: {', '.join(result[:10])}..."

    elif "moderate-calorie" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Caloric Density"] == "Moderate-Calorie"
        ]["Description"].tolist()
        return f"Here are some moderate-calorie foods: {', '.join(result[:10])}..."

    elif "low-calorie" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Caloric Density"] == "Low-Calorie"
        ]["Description"].tolist()
        return f"Here are some low-calorie foods: {', '.join(result[:10])}..."

    # Default Response
    else:
        return "I'm sorry, I didn't understand that. Can you ask about specific protein, macronutrient, or caloric density levels?"


In [33]:
# Test macronutrient profile queries
print(chatbot_response("What are some high-protein foods?"))
print(chatbot_response("List low-carb options."))
print(chatbot_response("What foods are balanced in macronutrients?"))
print(chatbot_response("Can you suggest high-protein and low-carb foods?"))

# Test protein category queries
print(chatbot_response("What are very high-protein foods?"))
print(chatbot_response("What foods are low in protein?"))

# Test calorie category queries
print(chatbot_response("What are some high-calorie foods?"))
print(chatbot_response("What are moderate-calorie foods?"))
print(chatbot_response("Can you suggest low-calorie foods?"))


Here are some high-protein foods: Nuts  almonds, Egg  white, Cheese  parmesan, Cheese  pasteurized process, Seeds  sunflower seed kernels, Cheese  cheddar, Cheese  mozzarella, Egg  whole, Egg  yolk, Egg  yolk...
Here are some low-carb foods: Beans  snap, Frankfurter  beef, Kale  raw, Egg  whole, Egg  white, Pickles  cucumber, Cheese  cottage, Yogurt  Greek, Oil  coconut, Olives  green...
Here are some foods with balanced macronutrients: Tomatoes  grape, Grapefruit juice  white, Peaches  yellow, Bread  white, Kale  frozen, Mustard  prepared, Kiwifruit  green, Nectarines  raw, Yogurt  Greek, Sauce  pasta...
Here are some high-protein and low-carb foods: Cheese  cheddar, Cheese  mozzarella, Egg  whole, Egg  yolk, Egg  yolk, Chicken  broilers or fryers, Chicken  broiler or fryers, Ham  sliced, Fish  haddock, Fish  tuna...
Here are some high-protein foods: Nuts  almonds, Egg  white, Cheese  parmesan, Cheese  pasteurized process, Seeds  sunflower seed kernels, Cheese  cheddar, Cheese  mozzar

# Fine-Tune GPT-2

Generate Conversational Training Data

In [34]:
# Function to generate conversational training data
def generate_conversational_data(df):
    training_data = []

    for _, row in df.iterrows():
        # Generate questions based on Macronutrient Profile
        if "Macronutrient Profile" in df.columns:
            if row["Macronutrient Profile"] == "High-Protein":
                question = "User: What are some high-protein foods?"
                answer = f"Bot: {row['Description']}."
                training_data.append(f"{question}\n{answer}<|endoftext|>")

            elif row["Macronutrient Profile"] == "Low-Carb":
                question = "User: What are some low-carb foods?"
                answer = f"Bot: {row['Description']}."
                training_data.append(f"{question}\n{answer}<|endoftext|>")

            elif row["Macronutrient Profile"] == "Balanced":
                question = "User: What foods are balanced in macronutrients?"
                answer = f"Bot: {row['Description']}."
                training_data.append(f"{question}\n{answer}<|endoftext|>")

        # Generate questions based on Caloric Density
        if "Caloric Density" in df.columns:
            if row["Caloric Density"] == "High-Calorie":
                question = "User: What are some high-calorie foods?"
                answer = f"Bot: {row['Description']}."
                training_data.append(f"{question}\n{answer}<|endoftext|>")

            elif row["Caloric Density"] == "Moderate-Calorie":
                question = "User: What are some moderate-calorie foods?"
                answer = f"Bot: {row['Description']}."
                training_data.append(f"{question}\n{answer}<|endoftext|>")

            elif row["Caloric Density"] == "Low-Calorie":
                question = "User: What are some low-calorie foods?"
                answer = f"Bot: {row['Description']}."
                training_data.append(f"{question}\n{answer}<|endoftext|>")

        # Generate questions based on other food categories (if applicable)
        if "Food Type" in df.columns:
            if row["Food Type"] == "Fruit":
                question = "User: What are some fruits?"
                answer = f"Bot: {row['Description']}."
                training_data.append(f"{question}\n{answer}<|endoftext|>")

    return training_data

# Apply the function to your dataset
training_texts = generate_conversational_data(food_df_with_serving_size)

# Save the training data to a text file
with open("conversational_training_data.txt", "w") as file:
    file.write("\n".join(training_texts))

print(f"Generated {len(training_texts)} conversational training examples.")


Generated 620 conversational training examples.


## Fine Tuning using generated conversational training examples

In [36]:
#pip install transformers datasets torch


In [37]:
from datasets import load_dataset

# Load the text dataset
dataset = load_dataset("text", data_files={"train": "conversational_training_data.txt"})


Generating train split: 0 examples [00:00, ? examples/s]

In [43]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    encodings = tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length",  #
    )
    encodings["labels"] = encodings["input_ids"].copy()  # Add labels
    return encodings

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])


# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])


Map:   0%|          | 0/1240 [00:00<?, ? examples/s]

In [44]:
from transformers import GPT2LMHeadModel

# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")


In [48]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments


# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    report_to=[],  # Disable wandb and other reporting tools
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,  # Ensure tokenizer includes pad_token
)

# Fine-tune the model
trainer.train()



  trainer = Trainer(


Step,Training Loss
100,0.0377
200,0.0339
300,0.0352
400,0.0313
500,0.0311
600,0.0324
700,0.0327
800,0.0381
900,0.0322
1000,0.0326


TrainOutput(global_step=1550, training_loss=0.032992726833589614, metrics={'train_runtime': 312.6605, 'train_samples_per_second': 19.83, 'train_steps_per_second': 4.957, 'total_flos': 405002649600000.0, 'train_loss': 0.032992726833589614, 'epoch': 5.0})

In [50]:
# Save the model after training
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

In [51]:
# Load the fine-tuned model
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

def generate_response(prompt):
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(inputs, max_length=50, num_return_sequences=1, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the chatbot
print(generate_response("User: What are some high-protein foods?\nBot:"))
print(generate_response("User: What are low-calorie foods?\nBot:"))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


User: What are some high-protein foods?
Bot: Beans  Dry.
User: What are low-calorie foods?
Bot: Beans  Dry.


In [54]:
# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

# Ensure pad_token is set
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 uses eos_token for padding

# Generate response with well-structured prompts and advanced generation settings
def generate_response(prompt):
    # Crafting the prompt for better user interaction
    structured_prompt = f"User: {prompt}\nBot:"

    # Tokenize and prepare the input
    inputs = tokenizer.encode(structured_prompt, return_tensors="pt", padding=True, truncation=True)
    attention_mask = inputs.ne(tokenizer.pad_token_id).to(inputs.device)  # Create attention mask

    # Generate response with temperature, sampling, and pad_token_id adjustments
    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_length=50,  # Adjust the max length of the output response
        num_return_sequences=3,  # Number of responses to generate
        temperature=0.9,  # Control randomness (higher = more diverse output)
        do_sample=True,  # Enable sampling for varied responses
        pad_token_id=tokenizer.eos_token_id  # Use eos_token_id as pad_token to prevent issues
    )

    # Decode and return the response, skipping special tokens
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the chatbot with different queries
print(generate_response("What are some high-protein foods?"))
print(generate_response("What are low-calorie foods?"))
print(generate_response("Can you suggest some balanced foods?"))

User: What are some high-protein foods?
Bot: Fish  cod.
User: What are low-calorie foods?
Bot: Almond milk  unsweetened.
User: Can you suggest some balanced foods?
Bot: Fish  haddock.


## GPT 2 Part

In [None]:
# pip install transformers datasets accelerate

In [None]:
# Convert your JSONL file to a dataset with input_text and target_text

import pandas as pd

# Load your JSONL fine-tuning data
data = []
with open("fine_tuning_data.jsonl", "r") as f:
    for line in f:
        data.append(json.loads(line))

# Convert to DataFrame for easier handling
df = pd.DataFrame(data)

# Save in Hugging Face-friendly format
df.to_csv("fine_tuning_data.csv", index=False)
print("Dataset saved as fine_tuning_data.csv!")


Dataset saved as fine_tuning_data.csv!


In [None]:
#pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset("csv", data_files="fine_tuning_data.csv")
train_dataset = dataset["train"]

print(train_dataset[0])


Generating train split: 0 examples [00:00, ? examples/s]

{'prompt': 'I want to lose weight. Is Hummus, commercial good for me?', 'completion': 'Hummus, commercial is not good for weight loss. It has 6.76 kcal per gram.'}


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("/content/fine_tuned_distilgpt2")
tokenizer = AutoTokenizer.from_pretrained("/content/fine_tuned_distilgpt2")

# Function to generate responses
def generate_response(prompt):
    # Tokenize input with padding and truncation
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256)

    # Generate response with improved parameters
    outputs = model.generate(
        **inputs,
        max_length=150,
        temperature=0.7,
        do_sample=True,  # Enable sampling
        top_k=50,  # Limit to top 50 words
        top_p=0.9,  # Nucleus sampling
        repetition_penalty=2.0,  # Penalize repetition
        pad_token_id=tokenizer.pad_token_id  # Proper handling of padding
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the model
prompt = "I want to lose weight. Is hummus good for me?"
response = generate_response(prompt)
print(response)



I want to lose weight. Is hummus good for me? No, but I'm going back on that diet because it's a great way to get the most out of my body and not have any issues."
"It works," said Pauline in response: "But don't try trying hard at all! You know what you're doing is just getting rid (of) fat cells; they are making up your muscle tissue!"


### Results not so good.

### Let's add structure to my prompts for better context.

## Prompt 2

### Added better context + clean response

In [None]:
def generate_response2(food, goal, question):

    # Create structured prompt
    prompt = f"Food: {food}\nGoal: {goal}\nQuestion: {question}"

    # Tokenize input with padding and truncation
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256)

    # Generate response with improved parameters
    outputs = model.generate(
        **inputs,
        max_length=150,
        temperature=0.7,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=2.0,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode the output
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


In [None]:
# Test the function
response = generate_response2(
    food="Hummus",
    goal="Weight Loss",
    question="Is hummus good for me?"
)
print(response)


Food: Hummus
Goal: Weight Loss
Question: Is hummus good for me?

.I've had it and I'm happy with the results, but still think that if you're not eating a lot of food then your body won't be able to use up all its energy because there's just too much starch in this meal (which is why most people are using beans). If we eat enough grains or pasta every day instead so that our bodies can get used after having eaten whole foods before getting tired from them my weight will go down as well! It'll also help us lose fat faster than any other type diet . Also on top...that being said , don´t expect some kind mass gain during exercise - especially when compared


In [None]:
def clean_response(response):
    # Split response into sentences
    sentences = response.split(".")

    # Return the first two sentences for brevity
    cleaned_response = ". ".join(sentences[:2]).strip()
    return cleaned_response

# Test the clean-up
response = generate_response2(
    food="Hummus",
    goal="Weight Loss",
    question="Is hummus good for me?"
)
cleaned_response = clean_response(response)
print(cleaned_response)


Food: Hummus
Goal: Weight Loss
Question: Is hummus good for me? Answer: Well, I'm a big fan of the stuff it provides.  But when you're not trying to lose weight or make changes in your diet that would be great news! If so then this is an easy recipe with no need on my part (except maybe one) and can help keep people from getting sick over time if they don't want their health restored before long!!


### Testing Prompt 2

In [None]:
# Example 1
print(generate_response2(
    food="Bacon",
    goal="Weight Loss",
    question="Is bacon good for my diet?"
))

# Example 2
print(generate_response2(
    food="Eggs",
    goal="Muscle Gain",
    question="How much protein do eggs have?"
))

# Example 3
print(generate_response2(
    food="Chicken",
    goal="Weight Loss",
    question="Is chicken low in calories?"
))


Food: Bacon
Goal: Weight Loss
Question: Is bacon good for my diet? Answer to question 1 of 3. (Click here to download)


Food: Eggs
Goal: Muscle Gain
Question: How much protein do eggs have? Can they help you lose weight and maintain your healthy body mass, or is it something that we should avoid doing at all costs. (Answer) We can't give up on our natural eating habits if there's not enough evidence to support those claims! In fact this has been shown in some studies where women were given a diet consisting of 8% carbohydrate instead thereof which caused them considerable pain for several weeks after the experiment was started due simply because their bodies didn´t metabolize carbs properly by themselves - although I think even more research needs being done regarding how fat intake affects muscle growth... Also please consider what kind "caffeine" foods are available from reputable
Food: Chicken
Goal: Weight Loss
Question: Is chicken low in calories? Answer : Yes. However, the high a

Analysis of Issues

Verbose and Unstructured Output: the model generates overly long and irrelevant sentences because it lacks focus. The prompt structure is not effectively guiding the model's behavior.

Lack of Domain-Specific Knowledge:the model relies too heavily on pre-trained general knowledge rather than fine-tuned task-specific examples.

Unclear Training Signal: the fine-tuning data may not have enough diversity or explicit examples to enforce concise, factual, and relevant outputs.

# Prompt 3

### Added explicit instructions

In [None]:
def generate_response3(food, goal, question):
    # Create structured and constrained prompt
    prompt = (
        "Here is an example:\n"
        "Food: Bacon\n"
        "Goal: Weight Loss\n"
        "Question: Is bacon good for weight loss?\n"
        "Answer: No, bacon is high in calories and fat, making it a poor choice for weight loss.\n\n"
        "Now, answer the following:\n"
        f"Food: {food}\n"
        f"Goal: {goal}\n"
        f"Question: Is {food} good for {goal}?\n"
        "Answer:"
    )

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256)

    # Generate response with constraints
    outputs = model.generate(
        **inputs,
        max_length=150,
        temperature=0.7,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=2.5,
        pad_token_id=tokenizer.pad_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
def clean_response2(response):
    # Extract sentences and look for a clear answer
    sentences = response.split(". ")

    # Return only the first sentence containing the key context
    for sentence in sentences:
        if "calorie" in sentence.lower() or "protein" in sentence.lower() or "fat" in sentence.lower():
            return sentence.strip()

    # Fallback: Return the first sentence if no key context is found
    return sentences[0].strip()


In [None]:
# Test the function with the new structure
response = generate_response3(
    food="Hummus",
    goal="Weight Loss",
    question="Is hummus good for me?"
)
cleaned_response = clean_response2(response)
print("Cleaned Response:", cleaned_response)


Cleaned Response: Here is an example:
Food: Bacon
Goal: Weight Loss
Question: Is bacon good for weight loss?
Answer: No, bacon is high in calories and fat, making it a poor choice for weight loss.

Now, answer the following:
Food: Hummus
Goal: Weight Loss
Question: Is Hummus good for Weight Loss?
Answer: Yes – I don't think so! It's not even close to that level of calorie restriction you're talking about here…it really isn`t like eating too much food at once or anything!! All we have left are two options (no more than 2 meals per day) with one being very low-carb while another could be either lean meatless veggies


## Prompt 4

### Added facts

In [None]:
def generate_response_with_facts(food, facts, goal):
    # Add structured prompt with facts
    prompt = (
        f"Food: {food}\n"
        f"Facts: {facts}\n"
        f"Goal: {goal}\n"
        f"Question: Is {food} good for {goal}?\n"
        "Answer:"
    )


    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256)

    # Generate response
    outputs = model.generate(
        **inputs,
        max_length=100,
        temperature=0.7,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=2.5,
        pad_token_id=tokenizer.pad_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with factual data
facts = "Hummus contains 166 calories and 7.9g of protein per serving."
response = generate_response_with_facts(food="Hummus", facts=facts, goal="Weight Loss")
print("Response:", response)


Response: Food: Hummus
Facts: Hummus contains 166 calories and 7.9g of protein per serving.
Goal: Weight Loss
Question: Is Hummus good for Weight Loss?
Answer: It does not cause any weight loss, but it may lead to a temporary decrease in your body's ability "to digest fats" or produce energy that is lost when you lose fat (in other words gain muscle mass). In addition there are various types available which will help reduce the amount spent


In [None]:
def clean_response_contextual(response):
    # Split response into sentences
    sentences = response.split(". ")

    # Look for key context in sentences
    for sentence in sentences:
        if any(keyword in sentence.lower() for keyword in ["yes", "no", "calorie", "protein", "fat"]):
            return sentence.strip()

    # Default: Return the first sentence
    return sentences[0].strip()


In [None]:
facts = "Hummus contains 166 calories and 7.9g of protein per serving."
response = generate_response_with_facts(food="Hummus", facts=facts, goal="Weight Loss")
cleaned_response = clean_response_contextual(response)
print("Cleaned Response:", cleaned_response)


Cleaned Response: Food: Hummus
Facts: Hummus contains 166 calories and 7.9g of protein per serving.
Goal: Weight Loss
Question: Is Hummus good for Weight Loss?
Answer: It is an excellent source, particularly if you're looking to lose weight in a healthy way or when working out hard (no one knows how it works)


### Facts really helped the answers.

In [None]:
import json

# Load dataset
with open("foundationDownload.json", "r") as f:
    data = json.load(f)

# Extract the list of foods
foods = data["FoundationFoods"]

# Function to extract key nutrients
def process_food_data(foods):
    processed_data = []
    for food in foods:
        description = food.get("description", "Unknown")
        nutrients = food.get("foodNutrients", [])

        # Extract nutrient amounts
        nutrient_dict = {n["nutrient"]["name"]: n["amount"] for n in nutrients if "amount" in n}

        # Extract relevant nutrients
        important_nutrients = {
            "Description": description,
            "Calories": nutrient_dict.get("Energy", 0),
            "Protein": nutrient_dict.get("Protein", 0),
            "Fat": nutrient_dict.get("Total lipid (fat)", 0),
        }
        processed_data.append(important_nutrients)

    return processed_data

# Process food data
processed_foods = process_food_data(foods)




In [None]:
processed_foods[0]

{'Description': 'Hummus, commercial',
 'Calories': 229,
 'Protein': 7.35,
 'Fat': 17.1}

In [None]:
def generate_prompts(processed_foods):
    prompts = []
    for food in processed_foods:
        description = food["Description"]
        calories = food["Calories"]
        protein = food["Protein"]
        fat = food["Fat"]

        # Generate prompt for weight loss
        weight_loss_prompt = {
            "prompt": f"Food: {description}\nFacts: {calories} calories, {protein}g protein, {fat}g fat per serving.\nGoal: Weight Loss\nQuestion: Is {description} good for weight loss?\nAnswer:",
            "completion": f" Yes, {description} is low in calories and high in protein, making it a good choice for weight loss when eaten in moderation."
        }

        # Generate prompt for muscle gain
        muscle_gain_prompt = {
            "prompt": f"Food: {description}\nFacts: {calories} calories, {protein}g protein, {fat}g fat per serving.\nGoal: Muscle Gain\nQuestion: Is {description} good for muscle gain?\nAnswer:",
            "completion": f" Yes, {description} is high in protein, making it an excellent choice for muscle gain."
        }

        prompts.extend([weight_loss_prompt, muscle_gain_prompt])

    return prompts

# Generate prompts
prompts = generate_prompts(processed_foods)

# Save to JSONL file
with open("fine_tuning_prompts.jsonl", "w") as f:
    for item in prompts:
        json.dump(item, f)
        f.write("\n")

print("Prompts saved to fine_tuning_prompts.jsonl")


Prompts saved to fine_tuning_prompts.jsonl


In [None]:
from datasets import load_dataset

# Load the JSONL dataset
dataset = load_dataset("json", data_files={"train": "fine_tuning_prompts.jsonl"})

# Split into train and validation sets (90% train, 10% validation)
dataset = dataset["train"].train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

print(train_dataset[0])  # Check the structure


FileNotFoundError: Unable to find '/content/fine_tuning_prompts.jsonl'

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load pre-trained model and tokenizer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add padding token if not present
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))


Embedding(50258, 768)

In [None]:
def tokenize_function(examples):
    # Combine prompt and completion with a separator
    full_text = [f"{prompt} [SEP] {completion}" for prompt, completion in zip(examples["prompt"], examples["completion"])]
    return tokenizer(
        full_text,
        truncation=True,  # Truncate sequences longer than max_length
        max_length=512,  # Ensure consistent token lengths
        padding="max_length"  # Add padding for consistent lengths
    )



In [None]:
# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["prompt", "completion"])
eval_dataset = eval_dataset.remove_columns(["prompt", "completion"])

# Set format for PyTorch
train_dataset.set_format("torch")
eval_dataset.set_format("torch")

NameError: name 'train_dataset' is not defined