In [None]:
import pandas as pd
import glob

# Glob pattern for the raw export files (one CSV per file).
# NOTE(review): absolute, machine-specific path - move this to a configurable
# data directory before sharing the notebook.
DATA_GLOB = r'C:\Users\jonas\Desktop\Studium\Master\SS 2025\Anwendungsfelder Business Analytics\Daten\*.csv'

# Positional layout assumed for every file: a two-column "field,value" sheet
# whose metadata sits in rows 1..17 and whose measurements start at row 22.
# NOTE(review): these offsets are taken from the original slices; confirm
# they hold for all export files.
METADATA_ROWS = slice(1, 18)
MEASUREMENT_START_ROW = 22

# glob returns matches in arbitrary order; sorting makes the row order of the
# combined frame reproducible across runs and machines.
csv_files = sorted(glob.glob(DATA_GLOB))
if not csv_files:
    # pd.concat([]) would fail further down with the much less helpful
    # "No objects to concatenate"; fail early and name the pattern instead.
    raise ValueError(f"No CSV files matched pattern: {DATA_GLOB}")

# One DataFrame per input file, concatenated once at the end.
all_dataframes = []

# Metadata fields copied from each file's header section onto its rows.
general_info_keys = [
    'measures_from', 'measures_to', 'measurement_category',
    'measurement_unit', 'measurement_type', 'measurement_value_type',
    'measurement_frequency', 'location_id', 'category', 'usage',
    'usage_detail', 'country', 'federal_state', 'city', 'post_code',
    'area', 'construction_year'
]

for file in csv_files:
    df = pd.read_csv(file, header=None, names=["field", "value"])

    # Extract the metadata section; rows with a missing field or value are
    # dropped, so such keys end up as None below.
    general_info_section = df.iloc[METADATA_ROWS].dropna()
    found = {}
    for field, value in zip(general_info_section['field'], general_info_section['value']):
        # setdefault keeps the first occurrence if a field is listed twice.
        found.setdefault(field, value)
    general_info = {key: found.get(key) for key in general_info_keys}

    # Read the measurement rows and give the two columns their final names.
    measurements = df.iloc[MEASUREMENT_START_ROW:].dropna().reset_index(drop=True)
    measurements.columns = ['Zeitstempel', 'Messwert']

    # Broadcast the file-level metadata onto every measurement row.
    measurements = measurements.assign(**general_info)

    all_dataframes.append(measurements)

# Combine all files into one frame with a fresh 0..n-1 index.
final_df = pd.concat(all_dataframes, ignore_index=True)

In [64]:
def optimize_object_columns(df: pd.DataFrame, convert_numeric: bool = False) -> pd.DataFrame:
    """Reduce the memory footprint of the object-dtype columns of ``df``.

    Each object column is converted according to how repetitive it is:

    * fewer than 50 % distinct values -> ``category``
    * otherwise, if ``convert_numeric`` is true -> ``pd.to_numeric``; columns
      that cannot be parsed fall back to the ``string`` dtype
    * otherwise -> ``string`` dtype

    The memory usage before and after is printed in MB.

    Args:
        df: Frame to optimise. It is modified **in place**; the converted
            columns replace the originals on the passed object.
        convert_numeric: Try a numeric conversion for columns that are not
            repetitive enough to become categorical.

    Returns:
        The same ``df`` object that was passed in, after conversion.
    """
    initial_memory = df.memory_usage(deep=True).sum() / 1024 ** 2

    # All columns share the frame's length, so compute it once.
    num_total_values = len(df)

    # With zero rows the uniqueness ratio would divide by zero and there is
    # nothing to optimise anyway, so the conversion loop is skipped entirely.
    if num_total_values:
        object_columns = df.select_dtypes(include=['object']).columns
    else:
        object_columns = []

    for col in object_columns:
        num_unique_values = df[col].nunique()

        # Many repeated values: a categorical stores each distinct value once.
        if num_unique_values / num_total_values < 0.5:
            df[col] = df[col].astype('category')
        elif convert_numeric:
            # Try to parse as float/int (e.g. "123" -> 123); keep the column
            # as text if any value is not numeric.
            try:
                df[col] = pd.to_numeric(df[col])
            except (ValueError, TypeError):
                df[col] = df[col].astype('string')
        else:
            df[col] = df[col].astype('string')

    optimized_memory = df.memory_usage(deep=True).sum() / 1024 ** 2
    print(f"Speicher vor Optimierung: {initial_memory:.2f} MB")
    print(f"Speicher nach Optimierung: {optimized_memory:.2f} MB")
    print(f"Reduziert um: {(initial_memory - optimized_memory):.2f} MB")

    return df

# optimize_object_columns converts the columns on the frame it receives and
# returns that same object, so `optimized_df` and `final_df` refer to one
# DataFrame after this call (no unoptimised copy is kept).
optimized_df = optimize_object_columns(final_df, convert_numeric=True)

Speicher vor Optimierung: 4765.96 MB
Speicher nach Optimierung: 185.95 MB
Reduziert um: 4580.02 MB


In [65]:
# Persist the combined measurements; the relative path resolves against the
# kernel's current working directory.
# NOTE(review): to_csv writes the row index by default, so the file gets an
# extra unnamed first column - pass index=False if consumers do not need it.
optimized_df.to_csv("AlleMessdaten2000_2024.csv")