## NLP PROJECT #2

### Student: Jefferson Roesler

# Dataset

In [31]:
import json
import pandas as pd

# Load the JSON file
with open('foundationDownload.json', 'r') as file:
    data = json.load(file)

In [32]:
# Extract the list of foods
foods = data["FoundationFoods"]

# Flatten the data and include serving size
def process_food_data_with_serving_size(foods):
    processed_data = []
    for food in foods:
        # Extract food description
        description = food.get("description", "Unknown")

        # Extract serving size (from 'foodPortions')
        food_portions = food.get("foodPortions", [])
        if food_portions:
            # Assume the first portion is the standard serving size
            serving_size = food_portions[0].get("gramWeight", 0)  # Weight in grams
        else:
            serving_size = 0  # Default if no portion info available

        # Extract nutrients
        nutrients = food.get("foodNutrients", [])
        nutrient_dict = {n["nutrient"]["name"]: n["amount"] for n in nutrients if "amount" in n}

        # Keep only key nutrients and serving size
        important_nutrients = {
            "Description": description,
            "Serving Size (g)": serving_size,
            "Calories": nutrient_dict.get("Energy", 0),
            "Protein": nutrient_dict.get("Protein", 0),
            "Carbohydrates": nutrient_dict.get("Carbohydrate, by difference", 0),
            "Fat": nutrient_dict.get("Total lipid (fat)", 0)
        }
        processed_data.append(important_nutrients)

    return processed_data

# Process the food data with serving size
processed_foods_with_serving_size = process_food_data_with_serving_size(foods)

# Convert to a Pandas DataFrame for easier handling
food_df_with_serving_size = pd.DataFrame(processed_foods_with_serving_size)

# Display the first few rows
print(food_df_with_serving_size.head())


                                         Description  Serving Size (g)  \
0                                 Hummus, commercial              33.9   
1                               Tomatoes, grape, raw              49.7   
2  Beans, snap, green, canned, regular pack, drai...             129.0   
3                        Frankfurter, beef, unheated              48.6   
4        Nuts, almonds, dry roasted, with salt added             135.0   

   Calories  Protein  Carbohydrates    Fat  
0     229.0     7.35          14.90  17.10  
1     113.0     0.83           5.51   0.63  
2      86.0     1.04           4.11   0.39  
3    1310.0    11.70           2.89  28.00  
4    2590.0    20.40          16.20  57.80  


## Add columns for calories/protein per gram.


In [33]:
# # Add new columns for calories and protein per gram
# food_df_with_serving_size["Calories per Gram"] = food_df_with_serving_size["Calories"] / food_df_with_serving_size["Serving Size (g)"]
# food_df_with_serving_size["Protein per Gram"] = food_df_with_serving_size["Protein"] / food_df_with_serving_size["Serving Size (g)"]

# # Replace infinite or NaN values (e.g., where serving size is 0)
# food_df_with_serving_size.replace([float('inf'), float('-inf')], 0, inplace=True)
# food_df_with_serving_size.fillna(0, inplace=True)

# # Display the updated DataFrame
# print(food_df_with_serving_size.head())


## New Function to add Protein and Calories columns

In [34]:
# Replace zero Serving Size (g) with NaN for proper handling
food_df_with_serving_size["Serving Size (g)"] = food_df_with_serving_size["Serving Size (g)"].replace(0, pd.NA)

# Replace NaN in Serving Size (g) with median or group-based imputed value if not already done
# Example: Global median
median_serving_size = food_df_with_serving_size["Serving Size (g)"].median()
food_df_with_serving_size["Serving Size (g)"].fillna(median_serving_size, inplace=True)

# Recalculate Calories per Gram
food_df_with_serving_size["Calories per Gram"] = food_df_with_serving_size["Calories"] / food_df_with_serving_size["Serving Size (g)"]

# Recalculate Protein per Gram
food_df_with_serving_size["Protein per Gram"] = food_df_with_serving_size["Protein"] / food_df_with_serving_size["Serving Size (g)"]

# Replace infinite or NaN values in the derived columns
food_df_with_serving_size.replace([float('inf'), float('-inf')], 0, inplace=True)
food_df_with_serving_size.fillna(0, inplace=True)

# Display the updated DataFrame
print("Updated DataFrame with recalculated values:")
print(food_df_with_serving_size[["Description", "Serving Size (g)", "Calories per Gram", "Protein per Gram"]].head())


Updated DataFrame with recalculated values:
                                         Description  Serving Size (g)  \
0                                 Hummus, commercial              33.9   
1                               Tomatoes, grape, raw              49.7   
2  Beans, snap, green, canned, regular pack, drai...             129.0   
3                        Frankfurter, beef, unheated              48.6   
4        Nuts, almonds, dry roasted, with salt added             135.0   

   Calories per Gram  Protein per Gram  
0           6.755162          0.216814  
1           2.273642          0.016700  
2           0.666667          0.008062  
3          26.954733          0.240741  
4          19.185185          0.151111  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  food_df_with_serving_size["Serving Size (g)"].fillna(median_serving_size, inplace=True)
  food_df_with_serving_size["Serving Size (g)"].fillna(median_serving_size, inplace=True)


# Preprocessing

## Handling Food Description

Description is too long and has useless info. I want to help the chatbot find later which food the user is talking about.

In [35]:
# Function to simplify and remove commas in food descriptions
def simplify_description(description):
    # Split by commas and join the first two parts without commas
    parts = description.split(",")
    return " ".join(parts[:2]).strip() if len(parts) > 1 else description.strip()

# Apply the simplification function to the Description column
food_df_with_serving_size["Description"] = food_df_with_serving_size["Description"].apply(simplify_description)

# Display the updated DataFrame
print(food_df_with_serving_size.head())




          Description  Serving Size (g)  Calories  Protein  Carbohydrates  \
0  Hummus  commercial              33.9     229.0     7.35          14.90   
1     Tomatoes  grape              49.7     113.0     0.83           5.51   
2         Beans  snap             129.0      86.0     1.04           4.11   
3   Frankfurter  beef              48.6    1310.0    11.70           2.89   
4       Nuts  almonds             135.0    2590.0    20.40          16.20   

     Fat  Calories per Gram  Protein per Gram  
0  17.10           6.755162          0.216814  
1   0.63           2.273642          0.016700  
2   0.39           0.666667          0.008062  
3  28.00          26.954733          0.240741  
4  57.80          19.185185          0.151111  


In [36]:
# Display all columns in the DataFrame
print("Columns in the dataset:")
print(food_df_with_serving_size.columns)

# Optionally, display the first few rows to inspect the data
print("\nSample data:")
print(food_df_with_serving_size.head())


Columns in the dataset:
Index(['Description', 'Serving Size (g)', 'Calories', 'Protein',
       'Carbohydrates', 'Fat', 'Calories per Gram', 'Protein per Gram'],
      dtype='object')

Sample data:
          Description  Serving Size (g)  Calories  Protein  Carbohydrates  \
0  Hummus  commercial              33.9     229.0     7.35          14.90   
1     Tomatoes  grape              49.7     113.0     0.83           5.51   
2         Beans  snap             129.0      86.0     1.04           4.11   
3   Frankfurter  beef              48.6    1310.0    11.70           2.89   
4       Nuts  almonds             135.0    2590.0    20.40          16.20   

     Fat  Calories per Gram  Protein per Gram  
0  17.10           6.755162          0.216814  
1   0.63           2.273642          0.016700  
2   0.39           0.666667          0.008062  
3  28.00          26.954733          0.240741  
4  57.80          19.185185          0.151111  


## Missing Values

In [37]:
# Check for missing values
missing_values = food_df_with_serving_size.isnull().sum()

# Calculate the percentage of missing values
missing_percentage = (missing_values / len(food_df_with_serving_size)) * 100

# Combine into a DataFrame for better readability
missing_summary = pd.DataFrame({
    "Column": food_df_with_serving_size.columns,
    "Missing Values": missing_values,
    "Percentage (%)": missing_percentage
}).sort_values(by="Percentage (%)", ascending=False)

# Display the missing value summary
print(missing_summary)


                              Column  Missing Values  Percentage (%)
Description              Description               0             0.0
Serving Size (g)    Serving Size (g)               0             0.0
Calories                    Calories               0             0.0
Protein                      Protein               0             0.0
Carbohydrates          Carbohydrates               0             0.0
Fat                              Fat               0             0.0
Calories per Gram  Calories per Gram               0             0.0
Protein per Gram    Protein per Gram               0             0.0


## Zero Values and Outliers

In [38]:
# Count rows with Serving Size (g) equal to zero
zero_serving_size_count = (food_df_with_serving_size["Serving Size (g)"] == 0).sum()
print(f"Number of foods with zero serving size: {zero_serving_size_count}")


Number of foods with zero serving size: 0


In [39]:
# Check for zero values in numeric columns
zero_values_summary = (food_df_with_serving_size == 0).sum()

# Calculate the percentage of zero values
zero_values_percentage = (zero_values_summary / len(food_df_with_serving_size)) * 100

# Combine into a DataFrame for readability
zero_summary = pd.DataFrame({
    "Column": food_df_with_serving_size.columns,
    "Zero Values": zero_values_summary,
    "Percentage (%)": zero_values_percentage
}).sort_values(by="Percentage (%)", ascending=False)

# Display the zero value summary
print("Zero Value Summary:")
print(zero_summary)



Zero Value Summary:
                              Column  Zero Values  Percentage (%)
Calories                    Calories          219       69.303797
Calories per Gram  Calories per Gram          219       69.303797
Carbohydrates          Carbohydrates           59       18.670886
Protein                      Protein           14        4.430380
Protein per Gram    Protein per Gram           14        4.430380
Fat                              Fat           10        3.164557
Description              Description            0        0.000000
Serving Size (g)    Serving Size (g)            0        0.000000


## Handling Calories Column = zero

In [40]:
# Filter rows where Calories is equal to zero
zero_calories_rows = food_df_with_serving_size[food_df_with_serving_size["Calories"] == 0]

# Display the rows with zero Calories
print("Rows with zero Calories:")
print(zero_calories_rows)


Rows with zero Calories:
              Description  Serving Size (g)  Calories  Protein  Carbohydrates  \
61            Salt  table               6.1       0.0     0.00            0.0   
74             Beans  Dry              97.3       0.0    25.50            0.0   
75             Beans  Dry              97.3       0.0    21.30            0.0   
76             Beans  Dry              97.3       0.0    23.30            0.0   
77             Beans  Dry              97.3       0.0    25.60            0.0   
..                    ...               ...       ...      ...            ...   
311   Sorghum bran  white              97.3       0.0    11.20           68.7   
312  Sorghum flour  white              97.3       0.0    10.20           73.5   
313  Sorghum grain  white              97.3       0.0    10.20           74.9   
314  Sorghum  whole grain              97.3       0.0    10.10           73.6   
315   Plantains  overripe              97.3       0.0     1.17           29.2   

  

In [41]:
# Display the first 10 rows with zero Calories
print(zero_calories_rows.head(10))

# Optionally, export the filtered rows to a CSV for analysis
zero_calories_rows.to_csv("zero_calories_rows.csv", index=False)


    Description  Serving Size (g)  Calories  Protein  Carbohydrates   Fat  \
61  Salt  table               6.1       0.0      0.0            0.0  0.00   
74   Beans  Dry              97.3       0.0     25.5            0.0  1.04   
75   Beans  Dry              97.3       0.0     21.3            0.0  1.16   
76   Beans  Dry              97.3       0.0     23.3            0.0  0.86   
77   Beans  Dry              97.3       0.0     25.6            0.0  1.12   
78   Beans  Dry              97.3       0.0     26.8            0.0  1.14   
79   Beans  Dry              97.3       0.0     24.6            0.0  1.28   
80   Beans  Dry              97.3       0.0     25.2            0.0  1.44   
81   Beans  Dry              97.3       0.0     24.4            0.0  1.23   
82   Beans  Dry              97.3       0.0     25.0            0.0  1.03   

    Calories per Gram  Protein per Gram  
61                0.0          0.000000  
74                0.0          0.262076  
75                0.0     

1. Validate Zero Calories:
Foods like salt are valid with zero calories, so these can be excluded from further processing.
For the remaining rows, we can estimate calories based on macronutrient values using the standard formula:

Calories
=
4
×

Protein (g)
+
4
×

Carbohydrates (g)
+
9
×
Fat (g)

Calories=4×Protein (g)+4×Carbohydrates (g)+9×Fat (g)

2. Fill Missing Calories:
Replace zero calorie values with the calculated estimates.


In [42]:
# Identify rows where Calories is zero but Protein, Carbohydrates, or Fat are non-zero
non_salt_rows = food_df_with_serving_size[
    (food_df_with_serving_size["Calories"] == 0) &
    ((food_df_with_serving_size["Protein"] > 0) |
     (food_df_with_serving_size["Carbohydrates"] > 0) |
     (food_df_with_serving_size["Fat"] > 0))
]

# Calculate calories using the macronutrient formula
food_df_with_serving_size.loc[non_salt_rows.index, "Calories"] = (
    4 * food_df_with_serving_size.loc[non_salt_rows.index, "Protein"] +
    4 * food_df_with_serving_size.loc[non_salt_rows.index, "Carbohydrates"] +
    9 * food_df_with_serving_size.loc[non_salt_rows.index, "Fat"]
)

# Recalculate Calories per Gram
food_df_with_serving_size["Calories per Gram"] = (
    food_df_with_serving_size["Calories"] / food_df_with_serving_size["Serving Size (g)"]
)

# Replace infinite or NaN values
food_df_with_serving_size.replace([float('inf'), float('-inf')], 0, inplace=True)
food_df_with_serving_size.fillna(0, inplace=True)

# Display the updated DataFrame
print("Updated rows with previously zero Calories:")
print(food_df_with_serving_size.loc[non_salt_rows.index])


Updated rows with previously zero Calories:
              Description  Serving Size (g)  Calories  Protein  Carbohydrates  \
74             Beans  Dry              97.3    111.36    25.50            0.0   
75             Beans  Dry              97.3     95.64    21.30            0.0   
76             Beans  Dry              97.3    100.94    23.30            0.0   
77             Beans  Dry              97.3    112.48    25.60            0.0   
78             Beans  Dry              97.3    117.46    26.80            0.0   
..                    ...               ...       ...      ...            ...   
311   Sorghum bran  white              97.3    402.94    11.20           68.7   
312  Sorghum flour  white              97.3    363.96    10.20           73.5   
313  Sorghum grain  white              97.3    369.74    10.20           74.9   
314  Sorghum  whole grain              97.3    372.78    10.10           73.6   
315   Plantains  overripe              97.3    130.39     1.17   

In [43]:
# Filter rows where Protein and Fat are both zero
zero_protein_fat = food_df_with_serving_size[
    (food_df_with_serving_size["Protein"] == 0) &
    (food_df_with_serving_size["Fat"] == 0)
]

# Display these rows for review
print("Rows with zero Protein and zero Fat:")
print(zero_protein_fat)


Rows with zero Protein and zero Fat:
        Description  Serving Size (g)  Calories  Protein  Carbohydrates  Fat  \
61      Salt  table               6.1       0.0      0.0            0.0  0.0   
95      Oil  canola              90.9       0.0      0.0            0.0  0.0   
96        Oil  corn              91.3       0.0      0.0            0.0  0.0   
97     Oil  soybean              91.3       0.0      0.0            0.0  0.0   
98       Oil  olive              90.7       0.0      0.0            0.0  0.0   
126     Oil  peanut              97.3       0.0      0.0            0.0  0.0   
127  Oil  sunflower              97.3       0.0      0.0            0.0  0.0   
128  Oil  safflower              97.3       0.0      0.0            0.0  0.0   
129      Oil  olive              97.3       0.0      0.0            0.0  0.0   

     Calories per Gram  Protein per Gram  
61                 0.0               0.0  
95                 0.0               0.0  
96                 0.0           

## Drop Rows with All Zero Values

In [44]:
# Drop rows where Protein, Fat, and Calories are all zero
cleaned_df = food_df_with_serving_size[
    ~((food_df_with_serving_size["Protein"] == 0) &
      (food_df_with_serving_size["Fat"] == 0) &
      (food_df_with_serving_size["Calories"] == 0))
]

# Display the number of rows after cleaning
print(f"Number of rows after dropping invalid rows: {len(cleaned_df)}")
print(cleaned_df.head())


Number of rows after dropping invalid rows: 307
          Description  Serving Size (g)  Calories  Protein  Carbohydrates  \
0  Hummus  commercial              33.9     229.0     7.35          14.90   
1     Tomatoes  grape              49.7     113.0     0.83           5.51   
2         Beans  snap             129.0      86.0     1.04           4.11   
3   Frankfurter  beef              48.6    1310.0    11.70           2.89   
4       Nuts  almonds             135.0    2590.0    20.40          16.20   

     Fat  Calories per Gram  Protein per Gram  
0  17.10           6.755162          0.216814  
1   0.63           2.273642          0.016700  
2   0.39           0.666667          0.008062  
3  28.00          26.954733          0.240741  
4  57.80          19.185185          0.151111  


In [45]:
print(cleaned_df[
    (cleaned_df["Protein"] == 0) &
    (cleaned_df["Fat"] == 0) &
    (cleaned_df["Calories"] == 0)
])


Empty DataFrame
Columns: [Description, Serving Size (g), Calories, Protein, Carbohydrates, Fat, Calories per Gram, Protein per Gram]
Index: []


In [46]:
print(f"Rows before cleaning: {len(food_df_with_serving_size)}")
print(f"Rows after cleaning: {len(cleaned_df)}")


Rows before cleaning: 316
Rows after cleaning: 307


# Dividing foods in High-Protein, Low-Carb, High-Fat and Balanced.

In [47]:
# Define thresholds for macronutrient-based categorization
def macronutrient_category(row):
    if row["Protein"] > 15:
        return "High-Protein"
    elif row["Carbohydrates"] < 5:
        return "Low-Carb"
    elif row["Fat"] > 10:
        return "High-Fat"
    else:
        return "Balanced"

# Apply the function to assign macronutrient profiles
food_df_with_serving_size["Macronutrient Profile"] = food_df_with_serving_size.apply(macronutrient_category, axis=1)

# Display a sample of foods with macronutrient profiles
print(food_df_with_serving_size[["Description", "Macronutrient Profile"]].head())


          Description Macronutrient Profile
0  Hummus  commercial              High-Fat
1     Tomatoes  grape              Balanced
2         Beans  snap              Low-Carb
3   Frankfurter  beef              Low-Carb
4       Nuts  almonds          High-Protein


# Simple ChatBot to test

In [48]:
# Function to simulate chatbot responses
def chatbot_response(query):
    query = query.lower()

    if "high-protein" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Macronutrient Profile"] == "High-Protein"
        ]["Description"].tolist()
        return f"Here are some high-protein foods: {', '.join(result[:10])}..."

    elif "low-carb" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Macronutrient Profile"] == "Low-Carb"
        ]["Description"].tolist()
        return f"Here are some low-carb foods: {', '.join(result[:10])}..."

    elif "balanced" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Macronutrient Profile"] == "Balanced"
        ]["Description"].tolist()
        return f"Here are some foods with balanced macronutrients: {', '.join(result[:10])}..."

    elif "high-protein and low-carb" in query:
        result = food_df_with_serving_size[
            (food_df_with_serving_size["Macronutrient Profile"] == "High-Protein") &
            (food_df_with_serving_size["Carbohydrates"] < 5)
        ]["Description"].tolist()
        return f"Here are some high-protein and low-carb foods: {', '.join(result[:10])}..."

    else:
        return "I'm sorry, I didn't understand that. Can you ask about high-protein, low-carb, or balanced foods?"




In [49]:
# Test example queries
print(chatbot_response("What are some high-protein foods?"))
print(chatbot_response("List low-carb options."))
print(chatbot_response("What foods are balanced in macronutrients?"))
print(chatbot_response("Can you suggest high-protein and low-carb foods?"))

Here are some high-protein foods: Nuts  almonds, Egg  white, Cheese  parmesan, Cheese  pasteurized process, Seeds  sunflower seed kernels, Cheese  cheddar, Cheese  mozzarella, Egg  whole, Egg  yolk, Egg  yolk...
Here are some low-carb foods: Beans  snap, Frankfurter  beef, Kale  raw, Egg  whole, Egg  white, Pickles  cucumber, Cheese  cottage, Yogurt  Greek, Oil  coconut, Olives  green...
Here are some foods with balanced macronutrients: Tomatoes  grape, Grapefruit juice  white, Peaches  yellow, Bread  white, Kale  frozen, Mustard  prepared, Kiwifruit  green, Nectarines  raw, Yogurt  Greek, Sauce  pasta...
Here are some high-protein foods: Nuts  almonds, Egg  white, Cheese  parmesan, Cheese  pasteurized process, Seeds  sunflower seed kernels, Cheese  cheddar, Cheese  mozzarella, Egg  whole, Egg  yolk, Egg  yolk...


In [50]:
# More examples

print(chatbot_response("Tell me which high-protein food I can eat?"))
print(chatbot_response("List low-carb vegetables."))

Here are some high-protein foods: Nuts  almonds, Egg  white, Cheese  parmesan, Cheese  pasteurized process, Seeds  sunflower seed kernels, Cheese  cheddar, Cheese  mozzarella, Egg  whole, Egg  yolk, Egg  yolk...
Here are some low-carb foods: Beans  snap, Frankfurter  beef, Kale  raw, Egg  whole, Egg  white, Pickles  cucumber, Cheese  cottage, Yogurt  Greek, Oil  coconut, Olives  green...


# Increasing Base Knownledge


In [51]:
# Function to categorize foods based on calorie content
def caloric_density(row):
    if row["Calories per Gram"] > 2:
        return "High-Calorie"
    elif row["Calories per Gram"] > 1:
        return "Moderate-Calorie"
    else:
        return "Low-Calorie"

food_df_with_serving_size["Caloric Density"] = food_df_with_serving_size.apply(caloric_density, axis=1)

# Display a sample of the updated dataset
print("Sample of foods with Calorie Category:")
print(food_df_with_serving_size[["Description", "Calories", "Caloric Density"]].head())


Sample of foods with Calorie Category:
          Description  Calories Caloric Density
0  Hummus  commercial     229.0    High-Calorie
1     Tomatoes  grape     113.0    High-Calorie
2         Beans  snap      86.0     Low-Calorie
3   Frankfurter  beef    1310.0    High-Calorie
4       Nuts  almonds    2590.0    High-Calorie


In [52]:
# Function to categorize foods based on protein content
def protein_category(row):
    if row["Protein"] > 25:
        return "Very High-Protein"
    elif row["Protein"] > 15:
        return "High-Protein"
    elif row["Protein"] > 5:
        return "Moderate-Protein"
    else:
        return "Low-Protein"

# Apply the categorization function to the dataset
food_df_with_serving_size["Protein Category"] = food_df_with_serving_size.apply(protein_category, axis=1)

# Display a sample of the updated dataset
print("Sample of foods with Protein Category:")
print(food_df_with_serving_size[["Description", "Protein", "Protein Category"]].head())


Sample of foods with Protein Category:
          Description  Protein  Protein Category
0  Hummus  commercial     7.35  Moderate-Protein
1     Tomatoes  grape     0.83       Low-Protein
2         Beans  snap     1.04       Low-Protein
3   Frankfurter  beef    11.70  Moderate-Protein
4       Nuts  almonds    20.40      High-Protein


In [53]:
print("Columns in the dataset:")
cleaned_df.columns

Columns in the dataset:


Index(['Description', 'Serving Size (g)', 'Calories', 'Protein',
       'Carbohydrates', 'Fat', 'Calories per Gram', 'Protein per Gram'],
      dtype='object')

# Updated Chatbot

In [54]:
# Function to handle chatbot responses for macronutrient, protein, and caloric density queries
def chatbot_response(query):
    query = query.lower()

    # Macronutrient Profile Queries
    if "high-protein and low-carb" in query:
        result = food_df_with_serving_size[
            (food_df_with_serving_size["Macronutrient Profile"] == "High-Protein") &
            (food_df_with_serving_size["Carbohydrates"] < 5)
        ]["Description"].tolist()
        return f"Here are some high-protein and low-carb foods: {', '.join(result[:10])}..."

    elif "high-protein" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Macronutrient Profile"] == "High-Protein"
        ]["Description"].tolist()
        return f"Here are some high-protein foods: {', '.join(result[:10])}..."

    elif "low-carb" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Macronutrient Profile"] == "Low-Carb"
        ]["Description"].tolist()
        return f"Here are some low-carb foods: {', '.join(result[:10])}..."

    elif "balanced" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Macronutrient Profile"] == "Balanced"
        ]["Description"].tolist()
        return f"Here are some foods with balanced macronutrients: {', '.join(result[:10])}..."

    # Protein Category Queries
    elif "very high-protein" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Protein Category"] == "Very High-Protein"
        ]["Description"].tolist()
        return f"Here are some very high-protein foods: {', '.join(result[:10])}..."

    elif "high-protein" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Protein Category"] == "High-Protein"
        ]["Description"].tolist()
        return f"Here are some high-protein foods: {', '.join(result[:10])}..."

    elif "moderate-protein" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Protein Category"] == "Moderate-Protein"
        ]["Description"].tolist()
        return f"Here are some moderate-protein foods: {', '.join(result[:10])}..."

    elif "low-protein" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Protein Category"] == "Low-Protein"
        ]["Description"].tolist()
        return f"Here are some low-protein foods: {', '.join(result[:10])}..."

    # Caloric Density Queries
    elif "high-calorie" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Caloric Density"] == "High-Calorie"
        ]["Description"].tolist()
        return f"Here are some high-calorie foods: {', '.join(result[:10])}..."

    elif "moderate-calorie" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Caloric Density"] == "Moderate-Calorie"
        ]["Description"].tolist()
        return f"Here are some moderate-calorie foods: {', '.join(result[:10])}..."

    elif "low-calorie" in query:
        result = food_df_with_serving_size[
            food_df_with_serving_size["Caloric Density"] == "Low-Calorie"
        ]["Description"].tolist()
        return f"Here are some low-calorie foods: {', '.join(result[:10])}..."

    # Default Response
    else:
        return "I'm sorry, I didn't understand that. Can you ask about specific protein, macronutrient, or caloric density levels?"


In [55]:
# Test macronutrient profile queries
print(chatbot_response("What are some high-protein foods?"))
print(chatbot_response("List low-carb options."))
print(chatbot_response("What foods are balanced in macronutrients?"))
print(chatbot_response("Can you suggest high-protein and low-carb foods?"))

# Test protein category queries
print(chatbot_response("What are very high-protein foods?"))
print(chatbot_response("What foods are low in protein?"))

# Test calorie category queries
print(chatbot_response("What are some high-calorie foods?"))
print(chatbot_response("What are moderate-calorie foods?"))
print(chatbot_response("Can you suggest low-calorie foods?"))


Here are some high-protein foods: Nuts  almonds, Egg  white, Cheese  parmesan, Cheese  pasteurized process, Seeds  sunflower seed kernels, Cheese  cheddar, Cheese  mozzarella, Egg  whole, Egg  yolk, Egg  yolk...
Here are some low-carb foods: Beans  snap, Frankfurter  beef, Kale  raw, Egg  whole, Egg  white, Pickles  cucumber, Cheese  cottage, Yogurt  Greek, Oil  coconut, Olives  green...
Here are some foods with balanced macronutrients: Tomatoes  grape, Grapefruit juice  white, Peaches  yellow, Bread  white, Kale  frozen, Mustard  prepared, Kiwifruit  green, Nectarines  raw, Yogurt  Greek, Sauce  pasta...
Here are some high-protein and low-carb foods: Cheese  cheddar, Cheese  mozzarella, Egg  whole, Egg  yolk, Egg  yolk, Chicken  broilers or fryers, Chicken  broiler or fryers, Ham  sliced, Fish  haddock, Fish  tuna...
Here are some high-protein foods: Nuts  almonds, Egg  white, Cheese  parmesan, Cheese  pasteurized process, Seeds  sunflower seed kernels, Cheese  cheddar, Cheese  mozzar

# Fine-Tune GPT-2

Generate Conversational Training Data

In [57]:
import random

def generate_templates():
    # Base components for question templates
    question_starters = [
        "What foods are", "Can you recommend foods that are",
        "Which foods should I eat to", "What are the best foods for",
        "Can you suggest foods with", "What should I eat to",
        "Do you know foods that are", "What are some options for"
    ]
    nutrient_focus = [
        "high in protein", "low in carbohydrates", "balanced in macronutrients",
        "rich in healthy fats", "low in calories", "high in calories",
        "suitable for weight loss", "ideal for muscle building",
        "good for heart health", "helpful for boosting energy"
    ]
    comparisons = [
        "higher protein than {other_food}", "fewer carbs than {other_food}",
        "more calories per gram than {other_food}", "less fat than {other_food}"
    ]
    serving_contexts = [
        "in a 100g serving", "per gram", "in a standard portion size",
        "in a typical serving size"
    ]

    # Generate 1000 unique question templates
    question_templates = []
    for _ in range(500):  # Base templates
        question_templates.append(
            f"{random.choice(question_starters)} {random.choice(nutrient_focus)}?"
        )
    for _ in range(500):  # Comparison-based templates
        question_templates.append(
            f"{random.choice(question_starters)} {random.choice(comparisons)}?"
        )

    # Base components for answer templates
    answer_starters = [
        "{food} is a great choice because it contains", "You can try {food}, which provides",
        "Consider {food}, as it offers", "{food} has", "A serving of {food} includes"
    ]
    nutrient_values = [
        "{value}g of protein", "{value}g of carbohydrates", "{value}g of fat",
        "{value} calories", "{value} calories per gram"
    ]
    contexts = [
        "per serving", "in a 100g portion", "for every gram", "in its typical portion size"
    ]
    health_benefits = [
        "which is perfect for building muscle", "great for a low-carb diet",
        "ideal for weight loss", "helpful for sustained energy",
        "recommended for heart health"
    ]

    # Generate 1000 unique answer templates
    answer_templates = []
    for _ in range(500):  # Base answers
        answer_templates.append(
            f"{random.choice(answer_starters)} {random.choice(nutrient_values)} {random.choice(contexts)}."
        )
    for _ in range(500):  # Contextualized answers
        answer_templates.append(
            f"{random.choice(answer_starters)} {random.choice(nutrient_values)} {random.choice(contexts)}, {random.choice(health_benefits)}."
        )

    return question_templates, answer_templates

# Generate the templates
questions, answers = generate_templates()

# Save templates to files
with open("question_templates.txt", "w") as q_file:
    q_file.writelines([q + "\n" for q in questions])

with open("answer_templates.txt", "w") as a_file:
    a_file.writelines([a + "\n" for a in answers])

print(f"Generated {len(questions)} question templates and {len(answers)} answer templates.")



Generated 1000 question templates and 1000 answer templates.


In [59]:
import random

def generate_training_data(question_file, answer_file, dataset):
    # Load question and answer templates
    with open(question_file, "r") as qf:
        question_templates = [line.strip() for line in qf.readlines()]

    with open(answer_file, "r") as af:
        answer_templates = [line.strip() for line in af.readlines()]

    training_data = []

    # Iterate through the dataset and pair questions and answers
    for _, row in dataset.iterrows():
        food = row["Description"]
        serving_size = row["Serving Size (g)"]
        calories = row["Calories"]
        protein = row["Protein"]
        carbs = row["Carbohydrates"]
        fat = row["Fat"]

        # Randomly select a question and answer template
        for _ in range(5):  # Generate multiple conversations per food item
            question_template = random.choice(question_templates)
            answer_template = random.choice(answer_templates)

            # Replace placeholders with actual data
            question = question_template.format(
                food=food,
                value=f"{random.choice([protein, calories, carbs, fat])}g",
                other_food=random.choice(dataset["Description"])
            )
            answer = answer_template.format(
                food=food,
                value=f"{protein}g of protein, {carbs}g of carbohydrates, {fat}g of fat",
                serving_size=f"{serving_size}g",
                context="building muscle"
            )

            # Format as a conversation
            conversation = f"User: {question}\nBot: {answer}\n<|endoftext|>"
            training_data.append(conversation)

    return training_data

# Example Usage
training_conversations = generate_training_data(
    question_file="question_templates.txt",
    answer_file="answer_templates.txt",
    dataset=food_df_with_serving_size  # Replace with your DataFrame variable
)

# Save to a training data file
with open("training_data_conversations.txt", "w") as file:
    file.writelines(training_conversations)

print(f"Generated {len(training_conversations)} training examples.")


Generated 1580 training examples.


## Fine Tuning using generated conversational training examples

In [92]:
#pip install transformers datasets torch


In [60]:
from datasets import load_dataset

# Load the text dataset
dataset = load_dataset("text", data_files={"train": "training_data_conversations.txt"})


Generating train split: 0 examples [00:00, ? examples/s]

In [61]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    encodings = tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length",  #
    )
    encodings["labels"] = encodings["input_ids"].copy()  # Add labels
    return encodings

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])


# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/3161 [00:00<?, ? examples/s]

In [62]:
from transformers import GPT2LMHeadModel

# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments


# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    report_to=[],  # Disable wandb and other reporting tools
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,  # Ensure tokenizer includes pad_token
)

# Fine-tune the model
trainer.train()



  trainer = Trainer(


Step,Training Loss
100,0.5521
200,0.2263
300,0.2417
400,0.1927
500,0.1875
600,0.1848
700,0.1747
800,0.1617
900,0.1637
1000,0.1477


In [101]:
# Save the model after training
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

In [102]:
# Load the fine-tuned model
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

def generate_response(prompt):
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(inputs, max_length=50, num_return_sequences=1, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the chatbot
print(generate_response("User: What are some high-protein foods?\nBot:"))
print(generate_response("User: What are low-calorie foods?\nBot:"))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


User: What are some high-protein foods?
Bot: Beans  Dry.
User: What are low-calorie foods?
Bot: Beans  Dry.


In [103]:
# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

# Ensure pad_token is set
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 uses eos_token for padding

# Generate response with well-structured prompts and advanced generation settings
def generate_response(prompt):
    # Crafting the prompt for better user interaction
    structured_prompt = f"User: {prompt}\nBot:"

    # Tokenize and prepare the input
    inputs = tokenizer.encode(structured_prompt, return_tensors="pt", padding=True, truncation=True)
    attention_mask = inputs.ne(tokenizer.pad_token_id).to(inputs.device)  # Create attention mask

    # Generate response with temperature, sampling, and pad_token_id adjustments
    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_length=50,  # Adjust the max length of the output response
        num_return_sequences=3,  # Number of responses to generate
        temperature=0.9,  # Control randomness (higher = more diverse output)
        do_sample=True,  # Enable sampling for varied responses
        pad_token_id=tokenizer.eos_token_id  # Use eos_token_id as pad_token to prevent issues
    )

    # Decode and return the response, skipping special tokens
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the chatbot with different queries
print(generate_response("What are some high-protein foods?"))
print(generate_response("What are low-calorie foods?"))
print(generate_response("Can you suggest some balanced foods?"))

User: What are some high-protein foods?
Bot: Beans  dry.
User: What are low-calorie foods?
Bot: Beans  cannellini.
User: Can you suggest some balanced foods?
Bot: Beans  Dry.


In [104]:
# Prompt Engineering

print(generate_response("Can you provide a detailed list of foods that are good for a muscle-building diet? Please include a variety of options and mention their nutritional benefits."))


User: Can you provide a detailed list of foods that are good for a muscle-building diet? Please include a variety of options and mention their nutritional benefits.
Bot: Rice  red.


In [107]:
# Function to generate responses based on user queries
def generate_dynamic_response(user_input):
    # Define the prompt template that will instruct GPT-2 to generate detailed answers
    prompt = f"User: {user_input}\nBot:"

    # Tokenize the prompt
    inputs = tokenizer.encode(prompt, return_tensors="pt")

    # Generate a response
    outputs = model.generate(
        inputs,
        max_length=150,  # You can adjust this to control the length of the response
        num_return_sequences=1,
        temperature=0.7,  # Adjust this for response creativity (higher = more diverse)
        top_p=0.9,       # Top-p sampling for more controlled randomness
        do_sample=True,  # Enable sampling for more diverse responses
        pad_token_id=tokenizer.eos_token_id  # Use eos_token as pad_token
    )

    # Decode and return the generated response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example of using the function with user input
user_input = "What do you recommend to eat to lose weight?"
response = generate_dynamic_response(user_input)
print(response)

User: What do you recommend to eat to lose weight?
Bot: Flour  cassava.
