# Modul 4: Input- und Output-Operationen

## 4.1. Setup und Dataset laden

In [0]:
%run "./Helper/_config"

In [0]:
# Three-level table names (catalog.schema.table) — these are table identifiers,
# not DBFS file paths. CATALOG and SCHEMA are not defined in this cell;
# presumably they come from the "./Helper/_config" notebook run above — verify there.
DATA_PATH = f"{CATALOG}.{SCHEMA}.yellow_tripdata_2025_01"
LOOKUP_PATH = f"{CATALOG}.{SCHEMA}.taxi_zone_lookup"

# Load both source tables as DataFrames
df_taxi = spark.read.table(DATA_PATH)
df_lookup = spark.read.table(LOOKUP_PATH)

# Reset the target volume so every run starts from an empty location.
# IF EXISTS keeps this cell runnable on a fresh workspace (Restart & Run All):
# a plain DROP VOLUME raises when the volume has never been created.
spark.sql(f"DROP VOLUME IF EXISTS {CATALOG}.{SCHEMA}.taxi_volume")
spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.taxi_volume")

# Root directory of the volume. No trailing slash: the read/write cells append
# "/<folder>" themselves, which would otherwise produce "taxi_volume//<folder>".
VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/taxi_volume"

## 4.2. Speicherung im Parquet-Format

### 4.2.1. Speichern

In [0]:
# Persist both DataFrames as Parquet folders inside the volume.
# mode("overwrite") replaces whatever an earlier run left at the same path.

# Trip records
(
    df_taxi.write
    .format("parquet")
    .mode("overwrite")
    .save(f"{VOLUME_PATH}/taxi_parquet")
)

# Zone lookup
(
    df_lookup.write
    .format("parquet")
    .mode("overwrite")
    .save(f"{VOLUME_PATH}/lookup_parquet")
)


### 4.2.2. Auslesen

In [0]:
# Load the two Parquet folders written above back into DataFrames
df_taxi_parquet = spark.read.format("parquet").load(f"{VOLUME_PATH}/taxi_parquet")
df_lookup_parquet = spark.read.format("parquet").load(f"{VOLUME_PATH}/lookup_parquet")

# Print the first five rows of each as a quick sanity check
df_taxi_parquet.show(n=5)
df_lookup_parquet.show(n=5)


## 4.3. Speicherung im CSV-Format

### 4.3.1. Speichern

In [0]:
# Export both DataFrames as CSV folders inside the volume.
# header=True writes the column names as the first line of each file;
# mode="overwrite" replaces output from an earlier run.

# Trip records
df_taxi.write.csv(f"{VOLUME_PATH}/taxi_csv", mode="overwrite", header=True)

# Zone lookup
df_lookup.write.csv(f"{VOLUME_PATH}/lookup_csv", mode="overwrite", header=True)


### 4.3.2. Auslesen

In [0]:
# Read the CSV folders back; header=True takes the column names from the first line.
# No schema is passed and schema inference is not enabled, so the CSV reader
# returns every column as a string here.
df_taxi_csv = spark.read.csv(f"{VOLUME_PATH}/taxi_csv", header=True)
df_lookup_csv = spark.read.csv(f"{VOLUME_PATH}/lookup_csv", header=True)

# Print the first five rows of each as a quick sanity check
df_taxi_csv.show(n=5)
df_lookup_csv.show(n=5)


## 4.4. Speicherung im Delta-Format

### 4.4.1. Speichern

In [0]:
# Store both DataFrames in Delta format at paths inside the volume.
# mode="overwrite" replaces output from an earlier run.

# Trip records
df_taxi.write.save(f"{VOLUME_PATH}/taxi_delta", format="delta", mode="overwrite")

# Zone lookup
df_lookup.write.save(f"{VOLUME_PATH}/lookup_delta", format="delta", mode="overwrite")


### 4.4.2. Auslesen

In [0]:
# Load the Delta data written above back into DataFrames
df_taxi_delta = spark.read.load(f"{VOLUME_PATH}/taxi_delta", format="delta")
df_lookup_delta = spark.read.load(f"{VOLUME_PATH}/lookup_delta", format="delta")

# Print the first five rows of each as a quick sanity check
df_taxi_delta.show(n=5)
df_lookup_delta.show(n=5)


## 4.5. Speicherung im JSON-Format

### 4.5.1. Speichern

In [0]:
# Export both DataFrames as JSON folders inside the volume.
# mode("overwrite") replaces output from an earlier run.

# Trip records
(
    df_taxi.write
    .format("json")
    .mode("overwrite")
    .save(f"{VOLUME_PATH}/taxi_json")
)

# Zone lookup
(
    df_lookup.write
    .format("json")
    .mode("overwrite")
    .save(f"{VOLUME_PATH}/lookup_json")
)


### 4.5.2. Auslesen

In [0]:
# Load the JSON folders written above back into DataFrames
df_taxi_json = spark.read.format("json").load(f"{VOLUME_PATH}/taxi_json")
df_lookup_json = spark.read.format("json").load(f"{VOLUME_PATH}/lookup_json")

# Print the first five rows of each as a quick sanity check
df_taxi_json.show(n=5)
df_lookup_json.show(n=5)
