In [1]:
import pandas as pd
import json
import requests
import re

# Notebook banner for this cell.
print("ASSIGNMENT 1 - TASKS 2 & 3")

# ============================================================================
# TASK 2: NUTRITIONAL VALUES
# ============================================================================

# Section heading framed by two 60-character rules (blank line before it).
print(f"\n{'=' * 60}")
print("TASK 2: NUTRITIONAL VALUES")
print(f"{'=' * 60}")

# Annotated ingredient/amount table; read relative to the working directory.
df = pd.read_csv("gemma_annotation.csv")

def call_nutrition_api(ingredient, amount_g):
    """Ask the nutrition calculator service about one ingredient.

    Sends ``{"ingredient": ingredient, "amount": amount_g}`` as a JSON POST
    body (10 second timeout) and never raises: every failure is reported
    through the returned dict instead.

    Args:
        ingredient: Ingredient name to look up.
        amount_g: Amount passed to the service in the ``amount`` field.

    Returns:
        A dict with two keys:
        ``status`` -- ``"success"``, ``"not_recognized"`` (HTTP 200 but the
        decoded body contains the text "Nicht erkannt") or ``"error"``;
        ``data`` -- the decoded JSON body for the first two statuses, or a
        string (``"HTTP <code>"`` or the exception text) for ``"error"``.
    """
    endpoint = "https://smarthome.uni-regensburg.de/naehrwertrechner/"
    body = {"ingredient": ingredient, "amount": amount_g}

    try:
        reply = requests.post(endpoint, json=body, timeout=10)

        # Anything other than 200 is reported with its status code only.
        if reply.status_code != 200:
            return {"status": "error", "data": f"HTTP {reply.status_code}"}

        parsed = reply.json()
        # The marker text is searched in the stringified body, wherever it
        # appears in the response structure.
        if "Nicht erkannt" in str(parsed):
            status = "not_recognized"
        else:
            status = "success"
        return {"status": status, "data": parsed}
    except Exception as exc:
        # Network failures, timeouts and undecodable bodies all end up here.
        return {"status": "error", "data": str(exc)}

print("Processing nutritional values...")
print("Note: This will make API calls for all ingredients")
print("To skip, comment out the processing loop")

# Number of leading rows to send to the API; set to None to process them all.
_SAMPLE_SIZE = 50

# Weight assumed when the amount text has no recognisable unit or number.
_DEFAULT_AMOUNT_G = 100

# Rough gram equivalents per unit. The spoon and piece values are the same
# approximations used before (tablespoon ~ 15 g, teaspoon ~ 5 g, piece ~ 50 g).
_GRAMS_PER_UNIT = {
    'g': 1, 'gr': 1, 'gramm': 1,
    'kg': 1000, 'kilogramm': 1000,
    'el': 15, 'esslöffel': 15,
    'tl': 5, 'teelöffel': 5,
    'stück': 50, 'stücke': 50, 'stk': 50,
}
_WEIGHT_UNITS = frozenset({'g', 'gr', 'gramm', 'kg', 'kilogramm'})

# Whole alphabetic tokens, so "g" no longer matches inside "kg" or "glas"
# and "el" no longer matches inside "teelöffel".
_UNIT_TOKEN_RE = re.compile(r'[a-zäöüß]+')

# Either a (mixed) fraction such as "1/2" or "1 1/2", or a decimal number
# written with a dot or a German decimal comma.
_QUANTITY_RE = re.compile(r'(?:(\d+)\s+)?(\d+)\s*/\s*(\d+)|(\d+(?:[.,]\d+)?)')


def _extract_ingredient_name(row):
    """Return the cleaned ingredient name for a row.

    Looks for a non-empty string under the ``zutat`` key of the JSON stored
    in ``ingr_annotation`` or ``amount_annotation`` (in that order) and falls
    back to the raw ``ingredient`` column when neither yields one.
    """
    for column in ('ingr_annotation', 'amount_annotation'):
        try:
            parsed = json.loads(str(row[column]))
        except (KeyError, ValueError):
            # Column missing, or cell is not valid JSON (e.g. NaN / free text).
            continue
        if isinstance(parsed, dict):
            name = parsed.get('zutat')
            if isinstance(name, str) and name.strip():
                return name
    return row['ingredient']


def _parse_quantity(text):
    """Return the first number in ``text`` as a float, or None if absent."""
    match = _QUANTITY_RE.search(text)
    if match is None:
        return None
    whole, numerator, denominator, decimal = match.groups()
    if decimal is not None:
        return float(decimal.replace(',', '.'))
    if int(denominator) == 0:
        return None
    return int(whole or 0) + int(numerator) / int(denominator)


def _estimate_grams(amount_text):
    """Approximate the weight in grams described by a German amount string.

    The first recognised unit token decides the conversion factor and the
    first number in the text is the quantity. Without a recognised unit, or
    for a weight unit without a number, the default weight is returned; a
    spoon/piece unit without a number counts as one unit.
    """
    text = str(amount_text).lower()
    unit = next(
        (token for token in _UNIT_TOKEN_RE.findall(text)
         if token in _GRAMS_PER_UNIT),
        None,
    )
    if unit is None:
        return _DEFAULT_AMOUNT_G

    quantity = _parse_quantity(text)
    if quantity is None:
        if unit in _WEIGHT_UNITS:
            return _DEFAULT_AMOUNT_G
        quantity = 1
    return quantity * _GRAMS_PER_UNIT[unit]


nutrition_results = []
failed_ingredients = []

rows_to_process = df if _SAMPLE_SIZE is None else df.head(_SAMPLE_SIZE)

# A running counter is used for the progress output so it does not depend on
# the DataFrame index being a 0-based integer range.
for position, (_, row) in enumerate(rows_to_process.iterrows(), start=1):
    ingredient_name = _extract_ingredient_name(row)
    amount_g = _estimate_grams(row['amount'])

    # API call
    result = call_nutrition_api(ingredient_name, amount_g)

    # Store result
    entry = {
        'original_ingredient': row['ingredient'],
        'original_amount': row['amount'],
        'cleaned_ingredient': ingredient_name,
        'amount_grams': amount_g,
        'api_status': result['status']
    }

    if result['status'] == 'not_recognized':
        failed_ingredients.append({
            'ingredient': ingredient_name,
            'amount_grams': amount_g,
            'api_response': result['data']
        })
    elif result['status'] == 'success':
        entry['nutrition_data'] = result['data']
    elif result['status'] == 'error':
        # Keep the failure reason; otherwise a run in which every call fails
        # leaves no trace of why.
        entry['error_detail'] = result['data']

    nutrition_results.append(entry)

    print(f"  {position}: {ingredient_name} - {result['status']}")

# Write the collected Task 2 output; each file is only created when there is
# something to put in it.
result_count = len(nutrition_results)
if result_count:
    # One CSV row per processed ingredient, without the DataFrame index.
    nutrition_df = pd.DataFrame(nutrition_results)
    nutrition_df.to_csv('nutritional_results.csv', index=False)
    print(f"\n✅ Saved nutritional_results.csv with {result_count} entries")

failed_count = len(failed_ingredients)
if failed_count:
    # Ingredients the service answered but did not recognise, kept as
    # readable UTF-8 JSON (umlauts are not escaped).
    with open('unrecognized_ingredients.json', 'w', encoding='utf-8') as out_file:
        json.dump(failed_ingredients, out_file, ensure_ascii=False, indent=2)
    print(f"✅ Saved unrecognized_ingredients.json with {failed_count} entries")

# ============================================================================
# TASK 3: IMPROVED PROMPTS
# ============================================================================

print("\n" + "="*60)
print("TASK 3: IMPROVED PROMPTS")
print("="*60)

print("\nAnalyzing patterns from original data...")


def _inflected_word_re(stems):
    """Compile a pattern matching any stem as a whole word.

    An optional German adjective ending (-e, -em, -en, -er, -es) is allowed,
    so "rot" matches "rot" and "rote" but not "karotte", "brot" or "rotwein".
    """
    alternation = '|'.join(re.escape(stem) for stem in stems)
    return re.compile(rf'\b(?:{alternation})(?:e[mnrs]?)?\b')


_PROCESSING_RE = _inflected_word_re(['frisch', 'getrocknet', 'gemahlen', 'gerieben'])
_COLOUR_RE = _inflected_word_re(['rot', 'grün', 'gelb', 'weiß'])
_PACKAGING_WORDS = ('päckchen', 'dose', 'glas', 'pack', 'packung')

# Counters for wording patterns seen in the first 100 rows.
patterns = {
    'multi_word': 0,
    'with_adjectives': 0,
    'with_packaging': 0,
    'with_processing': 0
}

for _, row in df.head(100).iterrows():
    ing = str(row['ingredient']).lower()
    amt = str(row['amount']).lower()

    # Ingredient text: more than two whitespace-separated words.
    if len(ing.split()) > 2:
        patterns['multi_word'] += 1
    # Whole-word matches only; plain substring tests counted compounds such
    # as "frischkäse" and unrelated words such as "karotte".
    if _PROCESSING_RE.search(ing):
        patterns['with_processing'] += 1
    if _COLOUR_RE.search(ing):
        patterns['with_adjectives'] += 1

    # Amount text: packaging terms are matched as substrings so that plural
    # forms such as "dosen" are still counted.
    if any(word in amt for word in _PACKAGING_WORDS):
        patterns['with_packaging'] += 1

print("Patterns found in data:")
for key, value in patterns.items():
    print(f"  {key}: {value} occurrences")

# Create improved prompts based on analysis.
#
# Both templates end with a "{text}" placeholder for the raw input. They also
# contain literal JSON braces, so str.format() would misread them; fill the
# placeholder with prompt.replace("{text}", value) instead.

# Ingredient extraction: reduce a German ingredient phrase to its singular
# core noun and return it as {"zutat": ...}.
IMPROVED_INGREDIENT_PROMPT = """Extract cooking ingredient from German text.

Rules:
1. Output only the main ingredient in singular form
2. Remove adjectives (rot, frisch, getrocknet)
3. Remove brand names
4. Remove packaging terms (Päckchen, Dose)
5. Output JSON: {"zutat": "ingredient"}

Examples:
"rote Paprika" → {"zutat": "Paprika"}
"frische Tomaten" → {"zutat": "Tomate"}
"1 Päckchen Vanillezucker" → {"zutat": "Vanillezucker"}

Input: {text}"""

# Amount extraction: classify the measurement as weight, volume or count.
# Rule 3 used to say "Include unit exactly as written", which contradicted
# rule 1 and every example (all of them emit lowercase units); it now asks
# for the lowercase form the examples show.
IMPROVED_AMOUNT_PROMPT = """Extract measurement from German cooking text.

Rules:
1. Output JSON with correct type:
   - Weight: {"gewicht": value, "einheit": "g"/"kg"}
   - Volume: {"volumen": value, "einheit": "ml"/"l"/"el"/"tl"}
   - Count: {"anzahl": value, "einheit": "stück"/"scheibe"}
2. Convert fractions (1/2 → 0.5)
3. Output the unit in lowercase (EL → el, Stück → stück)

Examples:
"200 g" → {"gewicht": 200, "einheit": "g"}
"2 EL" → {"volumen": 2, "einheit": "el"}
"1/2 TL" → {"volumen": 0.5, "einheit": "tl"}
"3 Stück" → {"anzahl": 3, "einheit": "stück"}

Input: {text}"""

print("\n✅ Improved prompts created")
print("\nKey improvements:")
for improvement in (
    "1. Removes adjectives and processing terms",
    "2. Handles packaging terms",
    "3. Better unit classification",
    "4. Fraction support",
):
    print(improvement)

# Bundle the pattern counts and both prompt templates into one JSON document.
prompts_data = dict(
    analysis_of_original_data=patterns,
    improved_ingredient_prompt=IMPROVED_INGREDIENT_PROMPT,
    improved_amount_prompt=IMPROVED_AMOUNT_PROMPT,
    explanation="Prompts improved to handle common patterns found in cooking data",
)

# UTF-8 with ensure_ascii=False keeps umlauts and arrows readable in the file.
with open('improved_prompts.json', 'w', encoding='utf-8') as prompts_file:
    json.dump(prompts_data, prompts_file, ensure_ascii=False, indent=2)

print("\n✅ Task 3 complete")
print("   File created: improved_prompts.json")

# Closing checklist of the files expected for submission.
print(f"\n{'=' * 60}")
print("SUBMISSION FILES")
print(f"{'=' * 60}")
for summary_line in (
    "For Task 2:",
    "- nutritional_results.csv (API results)",
    "- unrecognized_ingredients.json (failed items)",
    "\nFor Task 3:",
    "- improved_prompts.json (analysis & improved prompts)",
    "\nFor Task 1 (from earlier):",
    "- normalized_data.csv",
    "- unit_analysis.csv",
    "- conversion_table.csv",
):
    print(summary_line)



ASSIGNMENT 1 - TASKS 2 & 3

TASK 2: NUTRITIONAL VALUES
Processing nutritional values...
Note: This will make API calls for all ingredients
To skip, comment out the processing loop
  1: Zucchini - error
  2: Paprika - error
  3: Feta - error
  4: Schinken - error
  5: Zwiebel - error
  6: Tomate - error
  7: Hähnchenbrust - error
  8: Zwiebel - error
  9: Austernpilz - error
  10: Schmand - error
  11: Frischkäse - error
  12: Öl - error
  13: Butter - error
  14: Tomatenmark - error
  15: Majoran - error
  16: Hähnchenschlegel - error
  17: Hähnchen - error
  18: Schuss - error
  19: Zwiebel - error
  20: Rosenkohl - error
  21: Karotte - error
  22: Lauch - error
  23: Wein - error
  24: Champignon - error
  25: Sahne - error
  26: Butter - error
  27: Mehl - error
  28: Frischkäse - error
  29: Milch - error
  30: Zucker - error
  31: Heidelbeere - error
  32: Schokoriegel - error
  33: Amarettini - error
  34: Schirmchen - error
  35: Möhre - error
  36: Kohlrabi - error
  37: Papri