In [1]:
import pandas as pd
import json
import re

print("TASK 3: IMPROVED PROMPTS")

# Load the data to analyze patterns
df = pd.read_csv("gemma_annotation.csv")

print("\nAnalyzing patterns in original data...")

# Analyze common issues in the data
patterns = {
    'ingredient_patterns': {},
    'amount_patterns': {},
    'common_issues': []
}

# Sample analysis of first 100 rows
for idx, row in df.head(100).iterrows():
    ingredient = str(row['ingredient']).lower()
    amount = str(row['amount']).lower()
    
    # Check ingredient patterns
    words = ingredient.split()
    if len(words) > 2:
        patterns['common_issues'].append("Multi-word ingredients")
    
    if any(word in ingredient for word in ['frisch', 'getrocknet', 'gemahlen']):
        patterns['common_issues'].append("Processing terms in ingredients")
    
    if any(word in ingredient for word in ['rot', 'grün', 'gelb', 'weiß']):
        patterns['common_issues'].append("Color adjectives")
    
    if any(word in ingredient for word in ['päckchen', 'dose', 'glas', 'pack']):
        patterns['common_issues'].append("Packaging terms in ingredients")
    
    # Check amount patterns
    if '/' in amount or '½' in amount or '¼' in amount:
        patterns['common_issues'].append("Fractions in amounts")
    
    if any(word in amount for word in ['ca.', 'etwa', 'circa']):
        patterns['common_issues'].append("Approximation terms")

# Count pattern frequencies
pattern_counts = {}
for issue in patterns['common_issues']:
    pattern_counts[issue] = pattern_counts.get(issue, 0) + 1

print("\nCommon issues found in data:")
for issue, count in pattern_counts.items():
    print(f"  {issue}: {count} occurrences")

# Create improved prompts based on analysis
print("\nCreating improved prompts...")

IMPROVED_INGREDIENT_PROMPT = """Extract cooking ingredient from German text and output valid JSON.

RULES:
1. Extract only the main ingredient
2. Convert to singular nominative form
3. Remove:
   - Brand names (Ja!, Dr. Oetker)
   - Packaging terms (Päckchen, Dose, Glas, Pack)
   - Processing terms (frisch, getrocknet, gemahlen)
   - Color adjectives (rot, grün, gelb)
4. Output format: {"zutat": "INGREDIENT"}

EXAMPLES:
Input: "frische Tomaten" → {"zutat": "Tomate"}
Input: "rote Paprika" → {"zutat": "Paprika"}
Input: "1 Päckchen Vanillezucker" → {"zutat": "Vanillezucker"}
Input: "Ja! Tomatenmark" → {"zutat": "Tomatenmark"}
Input: "gemahlener Pfeffer" → {"zutat": "Pfeffer"}

Now process: {ingredient_text}"""

IMPROVED_AMOUNT_PROMPT = """Extract measurement from German cooking text and output valid JSON.

RULES:
1. Identify measurement type:
   - Weight → use "gewicht" (for g, kg, mg)
   - Volume → use "volumen" (for ml, l, el, tl, tasse, becher)
   - Count → use "anzahl" (for Stück, Scheibe, Zehe, Bund)
2. Always include "einheit" with the exact unit
3. Convert fractions:
   - 1/2 → 0.5
   - ½ → 0.5
   - ¼ → 0.25
4. Remove approximation terms (ca., etwa, circa)

EXAMPLES:
Input: "200 g" → {"gewicht": 200, "einheit": "g"}
Input: "2 EL" → {"volumen": 2, "einheit": "el"}
Input: "1/2 TL" → {"volumen": 0.5, "einheit": "tl"}
Input: "ca. 3 Stück" → {"anzahl": 3, "einheit": "stück"}
Input: "1 ½ Tasse" → {"volumen": 1.5, "einheit": "tasse"}

Now process: {amount_text}"""

print("\n✅ Improved prompts created")
print("\nKey improvements:")
print("1. Handles fractions and approximations")
print("2. Removes brand names and packaging terms")
print("3. Better unit classification")
print("4. Clearer examples for common cases")

# Test the prompts with sample data
print("\nTesting improved prompts on sample data:")

test_cases = [
    ("frische Tomaten", "IMPROVED: Should extract 'Tomate' (removes 'frische')"),
    ("rote Paprika", "IMPROVED: Should extract 'Paprika' (removes 'rote')"),
    ("1 Päckchen Vanillezucker", "IMPROVED: Should extract 'Vanillezucker' (removes packaging)"),
    ("200 g", "IMPROVED: Should output {'gewicht': 200, 'einheit': 'g'}"),
    ("1/2 TL", "IMPROVED: Should convert fraction: {'volumen': 0.5, 'einheit': 'tl'}"),
    ("ca. 3 Stück", "IMPROVED: Should remove 'ca.': {'anzahl': 3, 'einheit': 'stück'}"),
]

for text, explanation in test_cases:
    print(f"\n  Input: '{text}'")
    print(f"  {explanation}")

# Save the improved prompts
improved_prompts = {
    "analysis_of_original_data": {
        "sample_size": 100,
        "common_patterns_found": pattern_counts,
        "data_source": "gemma_annotation.csv"
    },
    "improved_ingredient_prompt": IMPROVED_INGREDIENT_PROMPT,
    "improved_amount_prompt": IMPROVED_AMOUNT_PROMPT,
    "reasoning": "Prompts improved to handle common patterns found in cooking ingredient data: fractions, packaging terms, brand names, and approximation terms."
}

with open('improved_prompts_task3.json', 'w', encoding='utf-8') as f:
    json.dump(improved_prompts, f, ensure_ascii=False, indent=2)

print("\n" + "="*60)
print("TASK 3 COMPLETE")
print("="*60)
print("\n✅ Created file: improved_prompts_task3.json")
print("\nThis file contains:")
print("1. Analysis of patterns in original data")
print("2. Improved ingredient extraction prompt")
print("3. Improved amount extraction prompt")
print("4. Explanation of improvements")

TASK 3: IMPROVED PROMPTS

Analyzing patterns in original data...

Common issues found in data:
  Color adjectives: 7 occurrences
  Processing terms in ingredients: 20 occurrences
  Multi-word ingredients: 13 occurrences
  Fractions in amounts: 1 occurrences

Creating improved prompts...

✅ Improved prompts created

Key improvements:
1. Handles fractions and approximations
2. Removes brand names and packaging terms
3. Better unit classification
4. Clearer examples for common cases

Testing improved prompts on sample data:

  Input: 'frische Tomaten'
  IMPROVED: Should extract 'Tomate' (removes 'frische')

  Input: 'rote Paprika'
  IMPROVED: Should extract 'Paprika' (removes 'rote')

  Input: '1 Päckchen Vanillezucker'
  IMPROVED: Should extract 'Vanillezucker' (removes packaging)

  Input: '200 g'
  IMPROVED: Should output {'gewicht': 200, 'einheit': 'g'}

  Input: '1/2 TL'
  IMPROVED: Should convert fraction: {'volumen': 0.5, 'einheit': 'tl'}

  Input: 'ca. 3 Stück'
  IMPROVED: Should 