In [None]:
import pandas as pd
import ast
import random

# Locations of the two input CSV files.
food_file = "food_dataset_with_nutriition.csv"
disease_file = "real_disease_names.csv"

# Recipe table: the "NER" column stores a Python literal as text, which is
# parsed into a real Python object and kept in the new "NER_list" column.
food_df = pd.read_csv(food_file)
food_df["NER_list"] = food_df["NER"].map(ast.literal_eval)
print("Total meals in food dataset:", food_df.shape[0])

# Disease table: only the "Disease" column is used, as a plain Python list.
disease_df = pd.read_csv(disease_file)
disease_names = disease_df["Disease"].to_list()
print("Number of real disease names loaded:", len(disease_names))

# Each nutrient column is paired with the direction ("low" or "high") used for
# its "best" label; the "worst" label always uses the opposite direction.
_best_direction = [
    ("calories", "low"),
    ("Total fats", "low"),
    ("Carbohydrate", "low"),
    ("Fiber", "high"),
    ("Protein", "high"),
    ("Cholesterol", "low"),
    ("Calcium", "high"),
    ("Iron", "high"),
    ("Magnesium", "high"),
    ("Potassium", "high"),
    ("Sodium", "low"),
    ("Vitamin C", "high"),
]
_opposite = {"low": "high", "high": "low"}

# nutrient name -> (best label, worst label); labels use the lower-cased name.
nutrition_mapping = {
    nutrient: (
        f"{best} {nutrient.lower()}",
        f"{_opposite[best]} {nutrient.lower()}",
    )
    for nutrient, best in _best_direction
}
# Nutrient names in table order.
nutrition_facts = [nutrient for nutrient, _ in _best_direction]

def _pick_two_or_three(items):
    """Return a random selection of 2 or 3 entries from *items*.

    The target size is drawn uniformly from {2, 3}. When *items* holds fewer
    entries than the target, every entry is returned instead, so short or
    empty inputs never raise.
    """
    target = random.randint(2, 3)
    if len(items) >= target:
        return random.sample(items, target)
    return list(items)


# NOTE: every pairing below is drawn at random, so the resulting table is
# synthetic; nothing in this loop looks at the disease itself.
mapping_data = []
for disease in disease_names:
    # "Best" foods: 2-3 ingredients taken from one randomly chosen meal.
    best_meal = food_df.sample(n=1).iloc[0]
    best_food_sample = _pick_two_or_three(best_meal["NER_list"])

    # "Worst" foods: draw another meal, retrying (at most 11 draws in total)
    # until its ingredient list differs from the best meal's. If every draw
    # matches, the last one is used anyway rather than failing.
    for _ in range(11):
        worst_meal = food_df.sample(n=1).iloc[0]
        if worst_meal["NER_list"] != best_meal["NER_list"]:
            break
    worst_food_sample = _pick_two_or_three(worst_meal["NER_list"])

    # "Best" nutrition: 2-3 nutrients, labelled with their favourable direction.
    best_nutrition_sample = _pick_two_or_three(nutrition_facts)
    best_nutrition_list = [nutrition_mapping[nf][0] for nf in best_nutrition_sample]

    # "Worst" nutrition comes only from nutrients not already used above.
    # An order-preserving filter is used instead of a set difference, whose
    # arbitrary element order would make the sample irreproducible even with
    # a fixed random seed.
    remaining_nutrition = [
        nf for nf in nutrition_facts if nf not in best_nutrition_sample
    ]
    worst_nutrition_sample = _pick_two_or_three(remaining_nutrition)
    worst_nutrition_list = [nutrition_mapping[nf][1] for nf in worst_nutrition_sample]

    # Each list is stored as the string form of its sorted contents.
    mapping_data.append({
        "Disease": disease,
        "Best_Foods": str(sorted(best_food_sample)),
        "Worst_Foods": str(sorted(worst_food_sample)),
        "Best_Nutrition": str(sorted(best_nutrition_list)),
        "Worst_Nutrition": str(sorted(worst_nutrition_list)),
    })

# Write one row per disease to CSV, without the DataFrame index.
mapping_df = pd.DataFrame(mapping_data)
output_filename = "disease_food_nutrition_mapping.csv"
mapping_df.to_csv(output_filename, index=False)
print(f"CSV file '{output_filename}' created with {len(mapping_df)} rows.")


Total meals in food dataset: 9882
Number of real disease names loaded: 11837
CSV file 'disease_food_nutrition_mapping.csv' created with 11837 rows.
