In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import pandas as pd
import time

start_time = time.time()

%run "/usr/local/spark/notebooks/00-spark-connection.ipynb"

# Read enriched departures from Parquet file
df = spark.read.parquet("data/enriched_01.parquet")
print(f"Loaded Parquet data in {time.time() - start_time:.2f} seconds")

In [None]:
# Time one global aggregation: the mean of the `delay` column.
start_time = time.time()
# An aggregation without groupBy yields a single Row whose only field is the mean.
mean_rows = df.agg({'delay': 'mean'}).collect()
avg_delay = mean_rows[0][0]
execution_duration = time.time() - start_time

# Report the result and the wall-clock time the Spark action took.
print(
    f"Durchschnittliche Verspätung: {avg_delay} Sekunden",
    f"-- Ausführungsdauer: {round(execution_duration, 2)} Sekunden",
    sep="\n",
)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Count how many rows share each distinct delay value; the result holds one Row
# per distinct value, sorted by delay, and is collected to the driver.
delay_counts = df.groupBy("delay").count().orderBy("delay").collect()

# Delay values whose share of all rows is below this fraction are merged into a
# single "Other" slice so the pie chart stays readable.
OTHER_THRESHOLD_FRACTION = 0.03  # 3 % threshold

# Prepare the data for the pie chart.
total_count = sum(row['count'] for row in delay_counts)
threshold = OTHER_THRESHOLD_FRACTION * total_count

# One record per distinct delay value for the table printed further down.
# With an empty result the comprehension body never runs, so the division by
# total_count cannot hit zero.
delay_data = [
    {
        "Verspätung (s)": row['delay'],
        "Anzahl": row['count'],
        "Prozent": row['count'] / total_count * 100,
    }
    for row in delay_counts
]

labels = []
sizes = []
other_count = 0

for row in delay_counts:
    delay = row['delay']
    count = row['count']
    if count >= threshold:
        # Numeric delays get an "s" suffix; anything else (e.g. a null delay)
        # falls back to its plain string representation.
        labels.append(f"{delay}s" if isinstance(delay, (int, float)) else str(delay))
        sizes.append(count)
    else:
        other_count += count

if other_count > 0:
    labels.append("Other")
    sizes.append(other_count)

# Create the pie chart. Skip it for an empty dataset: there is nothing to draw
# and an empty figure would only be confusing.
if total_count > 0:
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
    ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    ax.set_title('Verteilung der Verspätungen')
    plt.show()
else:
    print("Keine Datensätze vorhanden - Tortendiagramm wird übersprungen.")

# Build and print the per-delay table. The explicit column list keeps the
# schema visible even when there are no records.
df_delays = pd.DataFrame(delay_data, columns=["Verspätung (s)", "Anzahl", "Prozent"])
# to_string() already renders every row regardless of this option; the global
# setting is kept because later cells may rely on it being set.
pd.set_option('display.max_rows', None)  # show all rows
print(df_delays.to_string(index=False, float_format=lambda x: f"{x:.2f}"))

print(f"\nGesamtanzahl der Datensätze: {total_count}")
print(f"Anzahl unterschiedlicher Verspätungswerte: {len(delay_counts)}")