In [2]:
import pandas as pd
import json
import numpy as np

df = pd.read_csv("gemma_annotation.csv")

def check_column_content(text):
    if pd.isna(text):
        return 'unknown'
    try:
        data = json.loads(str(text))
        if 'zutat' in data:
            return 'ingredient'
        if any(key in data for key in ['gewicht', 'volumen', 'anzahl', 'menge']):
            return 'amount'
        return 'unknown'
    except:
        return 'unknown'

sample_check = df.head(20)
ingr_types = sample_check['ingr_annotation'].apply(check_column_content).value_counts()
amt_types = sample_check['amount_annotation'].apply(check_column_content).value_counts()

if ingr_types.get('amount', 0) > ingr_types.get('ingredient', 0):
    df = df.rename(columns={
        'ingr_annotation': 'amount_annotation',
        'amount_annotation': 'ingr_annotation'
    })

unit_data = {}
for idx, row in df.iterrows():
    try:
        amt_json = row['amount_annotation']
        if pd.isna(amt_json):
            continue
            
        amt_data = json.loads(str(amt_json))
        unit = str(amt_data.get('einheit', '')).lower().strip()
        if not unit:
            continue
            
        if unit not in unit_data:
            unit_data[unit] = {'count': 0, 'values': [], 'ingredients': set()}
        
        unit_data[unit]['count'] += 1
        
        for key in ['gewicht', 'volumen', 'anzahl', 'menge']:
            if key in amt_data and amt_data[key] is not None:
                try:
                    val = str(amt_data[key]).replace(',', '.')
                    if '/' in val:
                        num, den = val.split('/')
                        value = float(num) / float(den)
                    else:
                        value = float(val)
                    unit_data[unit]['values'].append(value)
                    break
                except:
                    continue
        
        try:
            ingr_json = row['ingr_annotation']
            ingr_data = json.loads(str(ingr_json))
            ingredient = ingr_data.get('zutat', row['ingredient'])
            unit_data[unit]['ingredients'].add(ingredient)
        except:
            unit_data[unit]['ingredients'].add(row['ingredient'])
            
    except:
        continue

unit_factors = {
    'g': 1, 'gramm': 1, 'gr': 1,
    'kg': 1000,
    'ml': 1,
    'l': 1000, 'liter': 1000,
    'el': 15, 'esslöffel': 15,
    'tl': 5, 'teelöffel': 5,
    'tasse': 125,
    'becher': 150,
    'glas': 200,
    'bund': 20, 'bündel': 20,
    'prise': 0.5,
    'msp': 1,
    'schuss': 20,
    'spritzer': 2,
    'dose': 400,
    'päckchen': 100, 'pck': 100,
    'stück': 1, 'stk': 1, 'st': 1,
    'scheibe': 1,
    'zehe': 1,
    'blatt': 1,
    'kopf': 1,
    'stange': 1,
    'würfel': 1,
}

def normalize_measurement(amount_json):
    result = {'normalized_value': None, 'normalized_unit': None, 'error': None}
    
    if pd.isna(amount_json):
        result['error'] = 'no data'
        return result
    
    try:
        data = json.loads(str(amount_json))
    except:
        result['error'] = 'json error'
        return result
    
    unit = str(data.get('einheit', '')).lower().strip()
    
    value = None
    mtype = None
    for key in ['gewicht', 'volumen', 'anzahl', 'menge']:
        if key in data and data[key] is not None:
            try:
                val = str(data[key]).replace(',', '.')
                if '/' in val:
                    num, den = val.split('/')
                    value = float(num) / float(den)
                else:
                    value = float(val)
                mtype = key
                break
            except:
                continue
    
    if value is None:
        result['error'] = 'no value'
        return result
    
    if mtype == 'gewicht':
        factor = unit_factors.get(unit, 1)
        result['normalized_value'] = value * factor
        result['normalized_unit'] = 'grams'
    
    elif mtype == 'volumen':
        factor = unit_factors.get(unit, 1)
        ml_value = value * factor
        result['normalized_value'] = ml_value / 1000.0
        result['normalized_unit'] = 'liters'
    
    else:
        if unit in unit_factors:
            factor = unit_factors[unit]
            if unit in ['g', 'gramm', 'kg', 'gr', 'bund', 'päckchen', 'dose']:
                result['normalized_value'] = value * factor
                result['normalized_unit'] = 'grams'
            elif unit in ['ml', 'l', 'liter', 'el', 'tl', 'tasse', 'becher', 'glas']:
                ml_value = value * factor
                result['normalized_value'] = ml_value / 1000.0
                result['normalized_unit'] = 'liters'
            else:
                result['normalized_value'] = value * factor
                result['normalized_unit'] = 'pieces'
        else:
            result['normalized_value'] = value
            result['normalized_unit'] = 'pieces'
    
    return result

df['normalization'] = df['amount_annotation'].apply(normalize_measurement)

df['normalized_value'] = df['normalization'].apply(lambda x: x.get('normalized_value'))
df['normalized_unit'] = df['normalization'].apply(lambda x: x.get('normalized_unit'))
df['error'] = df['normalization'].apply(lambda x: x.get('error'))

df[['ingredient', 'amount', 'normalized_value', 'normalized_unit']].to_csv('normalized_data.csv', index=False)

analysis_data = []
for unit, info in unit_data.items():
    if info['values']:
        avg_val = np.mean(info['values'])
        analysis_data.append({
            'unit': unit,
            'count': info['count'],
            'average_value': round(avg_val, 2),
            'ingredient_count': len(info['ingredients'])
        })

analysis_df = pd.DataFrame(analysis_data)
analysis_df = analysis_df.sort_values('count', ascending=False)
analysis_df.to_csv('unit_analysis.csv', index=False)

conversion_table = []
for unit, factor in unit_factors.items():
    if unit in unit_data:
        count = unit_data[unit]['count']
    else:
        count = 0
    
    if unit in ['g', 'gramm', 'kg', 'gr', 'bund', 'päckchen', 'dose', 'prise', 'msp']:
        target = 'grams'
    elif unit in ['ml', 'l', 'liter', 'el', 'tl', 'tasse', 'becher', 'glas', 'schuss', 'spritzer']:
        target = 'liters'
    else:
        target = 'pieces'
    
    conversion_table.append({
        'unit': unit,
        'factor': factor,
        'normalizes_to': target,
        'occurrences': count
    })

conversion_df = pd.DataFrame(conversion_table)
conversion_df = conversion_df.sort_values('occurrences', ascending=False)
conversion_df.to_csv('conversion_table.csv', index=False)

print("Task 1 complete.")
print(f"Rows processed: {len(df)}")
print(f"Normalized units: {df['normalized_unit'].value_counts().to_dict()}")
print(f"Files created: normalized_data.csv, unit_analysis.csv, conversion_table.csv")

Task 1 complete.
Rows processed: 910
Normalized units: {'pieces': 409, 'liters': 251, 'grams': 240}
Files created: normalized_data.csv, unit_analysis.csv, conversion_table.csv
