# In [None]:
import pandas as pd
import requests
import json
import time
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict

# Credentials and endpoint for the USDA food search API.
API_KEY = "your api"
SEARCH_URL = "https://api.nal.usda.gov/fdc/v1/foods/search"

# In-memory cache shared by the functions below:
# search query -> {nutrient name: value}.
nutrition_cache = dict()

# Nutrient name as looked up in a cached API result (key) mapped to the
# column name used in the output (value).
desired_nutrients = {
    "Energy": "calories",
    "Total lipid (fat)": "Total fats",
    "Carbohydrate, by difference": "Carbohydrate",
    "Fiber, total dietary": "Fiber",
    "Protein": "Protein",
    "Cholesterol": "Cholesterol",
    "Calcium, Ca": "Calcium",
    "Iron, Fe": "Iron",
    "Magnesium, Mg": "Magnesium",
    "Potassium, K": "Potassium",
    "Sodium, Na": "Sodium",
    "Vitamin C, total ascorbic acid": "Vitamin C",
}

def fetch_nutrition_data(query):
    """Return the nutrients of the top search hit for ``query``.

    The result is a ``{nutrient name: value}`` dict built from the first
    entry of the response's ``foods`` list. Results are memoised in
    ``nutrition_cache``; a query that yields no hit, a non-retryable HTTP
    status, or that exhausts its retries is cached as an empty dict, so it
    is not requested again.

    HTTP 429 responses, request exceptions and undecodable response bodies
    are retried, up to three attempts in total, with a delay that starts at
    one second and doubles after each failed attempt.

    Args:
        query: Search text sent as the ``query`` parameter.

    Returns:
        dict mapping nutrient name to value; empty when nothing was found.
    """
    if query in nutrition_cache:
        return nutrition_cache[query]
    params = {
        'api_key': API_KEY,
        'query': query,
        'pageSize': 1
    }
    max_retries = 3
    retry_delay = 1
    for attempt in range(max_retries):
        try:
            response = requests.get(SEARCH_URL, params=params, timeout=10)
            if response.status_code == 200:
                data = response.json()
                # Guard against a JSON body that is not an object.
                foods = data.get('foods') if isinstance(data, dict) else None
                if foods:
                    nutrients = _collect_nutrients(foods[0])
                    nutrition_cache[query] = nutrients
                    return nutrients
                print(f"No results found for '{query}'.")
                break
            if response.status_code != 429:
                print(f"Error fetching data for '{query}': HTTP {response.status_code}")
                break
            # HTTP 429: fall through to the back-off below and try again.
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers response.json() on releases of requests
            # whose decode error is not a RequestException subclass.
            print(f"Request exception for '{query}': {e}")
        # Back off only when another attempt will actually follow.
        if attempt < max_retries - 1:
            time.sleep(retry_delay)
            retry_delay *= 2
    empty_result = {}
    nutrition_cache[query] = empty_result
    return empty_result


def _collect_nutrients(food_item):
    """Flatten one search-result food into a ``{nutrient name: value}`` dict."""
    nutrients = {}
    for nutrient in food_item.get('foodNutrients') or []:
        name = nutrient.get('nutrientName')
        value = nutrient.get('value', 0)
        if not name or value is None:
            continue
        # The same nutrient name can be listed more than once with different
        # units (e.g. "Energy" in kcal and in kJ). Do not let a kJ entry
        # replace a value that is already stored, otherwise "calories" may
        # end up holding kilojoules.
        # NOTE(review): relies on the per-nutrient 'unitName' field of the
        # search response; when it is absent this check is a no-op and the
        # last entry wins, as before.
        unit = str(nutrient.get('unitName') or '').upper()
        if unit == 'KJ' and name in nutrients:
            continue
        nutrients[name] = value
    return nutrients

def extract_unique_ingredients(df):
    """Collect the distinct ingredient names found in the ``NER`` column.

    Each cell is expected to hold a JSON-encoded list of ingredient strings;
    cells that already contain a list, tuple or set are used as they are.
    Any other cell (for example a missing value) is skipped. Names are
    stripped of surrounding whitespace; blank names and non-string list
    items are ignored.

    Args:
        df: DataFrame with an ``NER`` column.

    Returns:
        set of stripped, non-empty ingredient names.

    Raises:
        KeyError: if ``df`` has no ``NER`` column.
    """
    unique_ingredients = set()
    for idx, ner_entry in df['NER'].items():
        if isinstance(ner_entry, str):
            try:
                ingredients = json.loads(ner_entry)
            except json.JSONDecodeError as e:
                print(f"Error parsing row {idx}: {e}")
                continue
            # A JSON string or object would otherwise be iterated as
            # characters or keys and pollute the result.
            if not isinstance(ingredients, list):
                print(f"Error parsing row {idx}: expected a JSON list, "
                      f"got {type(ingredients).__name__}")
                continue
        elif isinstance(ner_entry, (list, tuple, set)):
            # Already-parsed entries are accepted, matching aggregate_nutrition.
            ingredients = ner_entry
        else:
            continue
        for ingredient in ingredients:
            if not isinstance(ingredient, str):
                continue
            cleaned = ingredient.strip()
            if cleaned:
                unique_ingredients.add(cleaned)
    return unique_ingredients

def prefetch_ingredients(ingredients):
    """Populate ``nutrition_cache`` for every non-empty ingredient.

    Lookups run on a pool of five worker threads; results are collected in
    submission order and written to the cache from the calling thread.

    Args:
        ingredients: Sized iterable of ingredient names.
    """

    def lookup(ingredient):
        # Blank names and names that are already cached need no request.
        if not ingredient or ingredient in nutrition_cache:
            return None, None
        data = fetch_nutrition_data(ingredient)
        print(f"Fetched data for '{ingredient}'")
        # Pause after each lookup (presumably to throttle the request rate).
        time.sleep(0.5)
        return ingredient, data

    print(f"Prefetching nutrition data for {len(ingredients)} unique ingredients...")
    with ThreadPoolExecutor(max_workers=5) as pool:
        pending = []
        for candidate in ingredients:
            if candidate:
                pending.append(pool.submit(lookup, candidate))
        for task in pending:
            name, data = task.result()
            if name:
                nutrition_cache[name] = data
    print(f"Prefetching complete. Cached {len(nutrition_cache)} ingredients.")

def aggregate_nutrition(ner_entry):
    """Sum the cached nutrient values of every ingredient in one NER entry.

    Args:
        ner_entry: A JSON-encoded list of ingredient names, an
            already-parsed iterable of names, or a missing value
            (``None`` / NaN).

    Returns:
        dict keyed by the values of ``desired_nutrients``. Each total
        starts at 0 and accumulates what ``nutrition_cache`` holds for each
        ingredient; ingredients that are not cached contribute nothing.
    """
    if isinstance(ner_entry, str):
        try:
            ingredients = json.loads(ner_entry)
        except json.JSONDecodeError as e:
            print("Error parsing NER entry:", e)
            ingredients = []
        else:
            # A JSON string or object would otherwise be iterated as
            # characters or keys.
            if not isinstance(ingredients, list):
                print("Error parsing NER entry:",
                      f"expected a JSON list, got {type(ingredients).__name__}")
                ingredients = []
    elif ner_entry is None or isinstance(ner_entry, float):
        # Missing cells arrive as None or as a NaN float, neither of which
        # can be iterated; treat them as "no ingredients".
        ingredients = []
    else:
        ingredients = ner_entry
    aggregated = {std_key: 0 for std_key in desired_nutrients.values()}
    for ingredient in ingredients:
        # Skip list items that are not names (numbers, nulls, ...).
        if not isinstance(ingredient, str):
            continue
        ingredient = ingredient.strip()
        if not ingredient:
            continue
        nutrient_data = nutrition_cache.get(ingredient, {})
        for usda_name, std_key in desired_nutrients.items():
            aggregated[std_key] += nutrient_data.get(usda_name, 0)
    return aggregated

def process_in_batches(df, batch_size=500):
    """Add aggregated nutrition columns to ``df``, one batch at a time.

    For every row, ``aggregate_nutrition`` is applied to the ``NER`` value.
    The resulting dict is stored in a ``nutrition`` column and also expanded
    into one column per nutrient. After each batch, all rows processed so
    far are written to ``food_dataset_with_nutrition_partial_<end>.csv``.

    Args:
        df: DataFrame with an ``NER`` column.
        batch_size: Number of rows per batch; must be at least 1.

    Returns:
        A new DataFrame (index reset) holding every processed row, or an
        empty DataFrame when ``df`` has no rows.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() yield nothing and silently drop
    # every row; a zero step already raised ValueError inside range().
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    # Column order of the expanded nutrient columns (loop-invariant).
    nutrient_order = [
        "calories", "Total fats", "Carbohydrate", "Fiber",
        "Protein", "Cholesterol", "Calcium", "Iron",
        "Magnesium", "Potassium", "Sodium", "Vitamin C"
    ]
    total_rows = len(df)
    processed_batches = []
    result_df = pd.DataFrame()
    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        print(f"Processing batch {start_idx+1} to {end_idx} of {total_rows}...")
        batch = df.iloc[start_idx:end_idx].copy()
        rows_in_batch = end_idx - start_idx
        aggregated_nutrition_list = []
        # Count by position rather than by index label, so progress is
        # reported correctly for any index, not only a default RangeIndex.
        for position, ner_entry in enumerate(batch['NER'], start=1):
            aggregated_nutrition_list.append(aggregate_nutrition(ner_entry))
            if position % 100 == 0:
                print(f"  Processed {position}/{rows_in_batch} rows in current batch")
        batch['nutrition'] = aggregated_nutrition_list
        nutrition_df = batch['nutrition'].apply(pd.Series)[nutrient_order]
        batch = pd.concat([batch, nutrition_df], axis=1)
        # Concatenate only real batches; seeding the concat with an empty
        # DataFrame relies on pandas ignoring empty entries when it picks
        # result dtypes, which is deprecated.
        processed_batches.append(batch)
        result_df = pd.concat(processed_batches, ignore_index=True)
        result_df.to_csv(f'food_dataset_with_nutrition_partial_{end_idx}.csv', index=False)
        print(f"Saved intermediate results up to row {end_idx}")
    return result_df

def main():
    """Run the full pipeline: load recipes, prefetch nutrition, aggregate, save.

    Reads ``filtered_recipes_data.csv``, prefetches nutrition data for every
    unique ingredient, computes per-row nutrient totals in batches, writes
    the final CSV and prints timing information.
    """
    start_time = time.time()
    print("Loading dataset...")
    df = pd.read_csv('filtered_recipes_data.csv')
    print("Extracting unique ingredients...")
    unique_ingredients = extract_unique_ingredients(df)
    print(f"Found {len(unique_ingredients)} unique ingredients")
    prefetch_ingredients(unique_ingredients)
    result_df = process_in_batches(df)
    # NOTE(review): the output filename is misspelled ("nutriition"). It is
    # kept unchanged because other code may read the file under this name.
    result_df.to_csv('food_dataset_with_nutriition.csv', index=False)
    execution_time = time.time() - start_time
    print(f"Completed in {execution_time:.2f} seconds")
    # An empty dataset has no per-row average; skip it instead of dividing
    # by zero.
    if len(df):
        print(f"Average time per row: {execution_time/len(df):.2f} seconds")
    print("Updated CSV with the selected nutrition facts has been saved.")

if __name__ == "__main__":
    main()
