In [None]:
# ❌ NEVER collect data into pandas - keep the work in Spark!
# ❌ WRONG:   df.toPandas().groupby('col').sum()
# ✅ CORRECT: df.groupBy('col').sum()

# Imports needed by this cell (StorageLevel was previously used without being imported)
from pyspark import StorageLevel
from pyspark.sql.functions import broadcast

# Partition the written files by the columns that queries usually filter on
df.write.partitionBy("year", "month").parquet("path/to/data")

# Broadcast join for small tables (<200MB)
large_df.join(broadcast(small_df), "key").show()

# Repartition vs Coalesce
# DataFrames are immutable: both calls return a NEW DataFrame, so keep the result.
df_repartitioned = df.repartition(200)  # Redistributes rows evenly (expensive: full shuffle)
df_coalesced = df.coalesce(50)          # Reduces the partition count without a shuffle (fast)

# Cache on different storage tiers
# Spark ignores persist() on a DataFrame that is already cached (it only logs a
# warning), so call unpersist() before switching to another storage level.
df.cache()                            # MEMORY_AND_DISK
df.unpersist()
df.persist(StorageLevel.DISK_ONLY)    # Disk only
df.unpersist()
df.persist(StorageLevel.MEMORY_ONLY)  # Memory only

# Checkpoint to cut long lineages
# checkpoint() fails unless a checkpoint directory has been configured, and it
# returns the checkpointed DataFrame rather than modifying df in place.
# NOTE(review): spark.sparkContext is not reachable on every Databricks access mode - confirm for your cluster.
spark.sparkContext.setCheckpointDir("path/to/checkpoints")
df_checkpointed = df.checkpoint()  # Saves to disk and truncates the lineage

# Session settings for performance (Adaptive Query Execution)
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")

## 🚀 Performance Tips (Solo Spark!)

In [None]:
# Slowly Changing Dimension (SCD Type 2)
from pyspark.sql.functions import current_timestamp, when, col

def scd_type2_merge(target_table, source_df, key_cols, compare_cols):
    """Apply a Slowly Changing Dimension (Type 2) update to a Delta table.

    For every source row whose key matches a *current* target row and whose
    compared columns differ, the current target row is closed
    (``is_current = false``, ``end_date = now``) and the source row is inserted
    as the new current version (``is_current = true``, ``start_date = now``,
    ``end_date = NULL``).

    Source rows whose key has no current row in the target are NOT inserted:
    like the original implementation, only changed records are processed.

    Args:
        target_table: ``delta.tables.DeltaTable`` holding the dimension. It must
            expose ``is_current``, ``start_date`` and ``end_date`` columns in
            addition to the key and compared columns.
        source_df: DataFrame with the incoming rows. It should contain at most
            one row per key, otherwise the Delta merge rejects the ambiguous match.
        key_cols: Non-empty list of column names identifying a dimension member.
        compare_cols: Non-empty list of column names whose change triggers a new version.

    Raises:
        ValueError: If ``key_cols`` or ``compare_cols`` is empty.
    """
    if not key_cols:
        raise ValueError("key_cols must contain at least one column name")
    if not compare_cols:
        raise ValueError("compare_cols must contain at least one column name")

    # Function-scope imports keep the helper self-contained: `reduce` is not a
    # builtin in Python 3 and `lit` was never imported next to this definition.
    from functools import reduce
    from pyspark.sql.functions import col, current_timestamp, lit

    # The "source."/"target." prefixes used below only resolve when both sides
    # of the join carry those aliases. DeltaTable has no filter(), hence toDF().
    source = source_df.alias("source")
    current_target = target_table.toDF().filter(col("is_current")).alias("target")

    keys_match = reduce(
        lambda left, right: left & right,
        [col(f"source.{k}") == col(f"target.{k}") for k in key_cols],
    )
    # Null-safe comparison: a plain != evaluates to NULL when either side is
    # NULL, which would silently hide NULL <-> value changes.
    any_change = reduce(
        lambda left, right: left | right,
        [~col(f"source.{c}").eqNullSafe(col(f"target.{c}")) for c in compare_cols],
    )

    # Identify changed records and materialize them NOW. The DataFrame is lazy
    # and reads the target table; if it were re-evaluated after the closing
    # merge below, the rows would no longer be current and nothing would be inserted.
    changed_records = (
        source.join(current_target, keys_match, "inner")
        .where(any_change)
        .select("source.*")
        .localCheckpoint()
    )

    # Close the old versions. Restricting the match to current rows prevents
    # already-closed history from having its end_date overwritten.
    close_condition = " AND ".join(
        [f"target.{k} = source.{k}" for k in key_cols] + ["target.is_current = true"]
    )
    target_table.alias("target").merge(
        changed_records.alias("source"),
        close_condition
    ).whenMatchedUpdate(set={
        "is_current": lit(False),
        "end_date": current_timestamp()
    }).execute()

    # Insert the new versions. whenNotMatchedInsertAll expects the source to
    # provide the target's columns, so end_date is supplied explicitly as NULL.
    # NOTE(review): assumes end_date is a timestamp column - confirm against the table schema.
    new_records = (
        changed_records
        .withColumn("start_date", current_timestamp())
        .withColumn("end_date", lit(None).cast("timestamp"))
        .withColumn("is_current", lit(True))
    )
    target_table.alias("target").merge(
        new_records.alias("source"),
        "1=2"  # Never match, always insert
    ).whenNotMatchedInsertAll().execute()

# Window functions for ranking
from pyspark.sql.window import Window
# `desc` is used below, so it is imported explicitly instead of relying on a star import from another cell
from pyspark.sql.functions import desc, row_number, rank, dense_rank

# Number each customer's orders from the most recent to the oldest
window_spec = Window.partitionBy("customer_id").orderBy(desc("order_date"))
df.withColumn("row_num", row_number().over(window_spec)).show()

# Deduplication
# NOTE(review): dropDuplicates does not let you choose which duplicate survives;
# filter on row_num == 1 from the window above when a specific row must be kept.
df.dropDuplicates(["customer_id", "email"]).show()

## 🔄 ETL Patterns Avanzati

In [None]:
# Mount Azure Data Lake Storage (ADLS)
# Credentials are read from a Databricks secret scope: literals typed into a
# notebook end up in revision history, exports and shared copies.
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": dbutils.secrets.get(scope="your_secret_scope", key="adls-client-id"),
    "fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope="your_secret_scope", key="adls-client-secret"),
    "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/your_tenant_id/oauth2/token"
}

# Mounting an already-mounted path raises an error, so check first to keep the cell re-runnable
existing_mount_points = {mount.mountPoint for mount in dbutils.fs.mounts()}

if "/mnt/datalake" not in existing_mount_points:
    dbutils.fs.mount(
        source = "abfss://container@storageaccount.dfs.core.windows.net/",
        mount_point = "/mnt/datalake",
        extra_configs = configs
    )

# Mount AWS S3 (keys also come from the secret scope)
if "/mnt/s3bucket" not in existing_mount_points:
    dbutils.fs.mount(
        source = "s3a://your-bucket-name",
        mount_point = "/mnt/s3bucket",
        extra_configs = {
            "fs.s3a.access.key": dbutils.secrets.get(scope="your_secret_scope", key="aws-access-key"),
            "fs.s3a.secret.key": dbutils.secrets.get(scope="your_secret_scope", key="aws-secret-key")
        }
    )

# List mount points (display() is needed because this is not the last expression of the cell)
display(dbutils.fs.mounts())

# Unmount
dbutils.fs.unmount("/mnt/datalake")

## 🔗 Storage Mounting (ADLS/S3)

In [None]:
# Save a DataFrame as a Delta table
delta_writer = df.write.format("delta")
delta_writer.saveAsTable("my_database.delta_table")

# Merge (UPSERT): update the row when the key already exists, insert it otherwise
from delta.tables import DeltaTable

delta_table = DeltaTable.forName(spark, "my_database.customers")

upsert = delta_table.alias("target").merge(
    new_data.alias("source"),
    "target.customer_id = source.customer_id"
)
upsert = upsert.whenMatchedUpdateAll()
upsert = upsert.whenNotMatchedInsertAll()
upsert.execute()

# Time travel: query an earlier version, by version number or by timestamp
for time_travel_query in (
    "SELECT * FROM my_database.customers VERSION AS OF 1",
    "SELECT * FROM my_database.customers TIMESTAMP AS OF '2024-01-01'",
):
    spark.sql(time_travel_query).show()

# Table maintenance, in order:
#   1. OPTIMIZE          - compacts small files
#   2. OPTIMIZE ZORDER   - co-locates data by customer_id for faster queries
#   3. VACUUM            - removes old files (default retention: 7 days)
for maintenance_statement in (
    "OPTIMIZE my_database.customers",
    "OPTIMIZE my_database.customers ZORDER BY (customer_id)",
    "VACUUM my_database.customers",
):
    spark.sql(maintenance_statement)

## 🔼 Delta Lake Operations

In [None]:
# Create the database when it is missing, then make it the current one
for setup_statement in (
    "CREATE DATABASE IF NOT EXISTS my_database",
    "USE my_database",
):
    spark.sql(setup_statement)

# Create a table from a DataFrame
df.write.saveAsTable(name="my_database.customers")

# Create an empty table with an explicit schema
create_orders_sql = """
    CREATE TABLE IF NOT EXISTS my_database.orders (
        order_id INT,
        customer_id INT,
        order_date DATE,
        amount DECIMAL(10,2)
    )
"""
spark.sql(create_orders_sql)

# List the tables available in the database
tables_listing = spark.sql("SHOW TABLES IN my_database")
tables_listing.show()

# Describe the columns of a table
customers_description = spark.sql("DESCRIBE my_database.customers")
customers_description.show()

## 🗄️ Unity Catalog - Database e Tabelle

In [None]:
# Namespaced import: the template no longer depends on a star import executed
# in another cell, and Python builtins such as sum/min/max stay untouched.
from pyspark.sql import functions as F

# 1. Read the data
df = spark.read.csv("path/to/data.csv", header=True, inferSchema=True)

# 2. Explore
df.printSchema()
df.show(5)
print(f"Righe totali: {df.count()}")

# 3. Clean (if needed) - "column" is a placeholder for the column to check
df_clean = df.filter(F.col("column").isNotNull())

# 4. Transform
df_transformed = df_clean.withColumn("new_col", F.col("old_col") * 2)

# 5. Analyse
result = df_transformed.groupBy("category").agg(
    F.count("*").alias("count"),
    F.avg("value").alias("avg_value")
)

# 6. Show the result
result.show()

## 🎯 Pattern Comuni per Esercizi

### 📝 Template Base Esercizio

In [None]:
# Expose the DataFrame to SQL under a temporary view name
df.createOrReplaceTempView("my_data")

# The view can now be queried with plain SQL
adults_query = "SELECT * FROM my_data WHERE age > 25"
result = spark.sql(adults_query)
result.show()

# A more complex query: per-category record count and average price,
# ordered from the highest average price to the lowest
category_stats_query = """
    SELECT category, 
           COUNT(*) as total_records,
           AVG(price) as avg_price
    FROM my_data 
    GROUP BY category
    ORDER BY avg_price DESC
"""
category_stats = spark.sql(category_stats_query)
category_stats.show()

## 🗂️ SQL Magic

In [None]:
# Save as a managed table
df.write.saveAsTable(name="my_database.my_table")

# Save as Parquet files
parquet_writer = df.write.format("parquet")
parquet_writer.save("path/to/output.parquet")

# Save as CSV, with a header row
csv_writer = df.write.option("header", True)
csv_writer.csv("path/to/output.csv")

# Write modes
df.write.saveAsTable("table", mode="overwrite")   # Replaces the existing contents
df.write.saveAsTable("table", mode="append")      # Adds rows to the existing contents

# Caching for performance
df.cache()       # Keeps the data cached for reuse
df.persist()     # Same idea; accepts a storage level when more control is needed
df.unpersist()   # Removes the data from the cache

## 💾 Salvataggio

In [None]:
# Join types (the join column and the join type are passed by keyword)
df1.join(df2, on="common_column").show()                 # Inner join (the default)
df1.join(df2, on="common_column", how="inner").show()    # Explicit inner join
df1.join(df2, on="common_column", how="left").show()     # Left join
df1.join(df2, on="common_column", how="right").show()    # Right join
df1.join(df2, on="common_column", how="outer").show()    # Full outer join

# Join on columns that have different names on each side
id_matches_user = df1.id == df2.user_id
df1.join(df2, on=id_matches_user).show()

# Chained joins across three DataFrames
first_join = df1.join(df2, on="id")
first_join.join(df3, on="category_id").show()

## 🔗 Join

In [None]:
# Namespaced import: without it, `sum` below is the Python builtin unless a star
# import from another cell has already replaced it.
from pyspark.sql import functions as F

# Basic group by
df.groupBy("category").count().show()
df.groupBy("category").sum("amount").show()
df.groupBy("category").avg("price").show()
# The GroupedData.max()/min() shortcuts accept numeric columns only, so a date
# column has to go through agg() with the column functions instead.
df.groupBy("category").agg(F.max("date")).show()
df.groupBy("category").agg(F.min("date")).show()

# Multiple aggregations at once
df.groupBy("category").agg(
    F.count("*").alias("total_records"),
    F.sum("amount").alias("total_amount"),
    F.avg("price").alias("avg_price")
).show()

# Without groupBy: aggregate over the whole DataFrame
df.agg(F.count("*"), F.sum("amount"), F.avg("price")).show()

## 📊 Aggregazioni

In [None]:
# NOTE(review): this star import binds every public name of pyspark.sql.functions,
# including ones that share a name with Python builtins (e.g. sum, min, max).
# Other cells appear to rely on names it provides (col, lit, desc, count, avg),
# so it is left in place - consider `from pyspark.sql import functions as F`.
from pyspark.sql.functions import *

# Add / modify columns
df.withColumn("new_col", lit("value")).show()          # Constant column
df.withColumn("age_plus_10", col("age") + 10).show()   # Computed column
df.withColumnRenamed("old_name", "new_name").show()    # Rename

# Sorting
df.orderBy("age").show()                # Ascending
df.orderBy(desc("age")).show()          # Descending

# Drop columns
df.drop("unwanted_col").show()

# Cast types
df.withColumn("age_string", col("age").cast("string")).show()

## 🔄 Trasformazioni

In [None]:
# Column selection
df.select(["col1", "col2"]).show()
df.select("*").show()                    # Every column

# Filters (columns referenced with bracket syntax)
df.filter(df["age"] > 18).show()         # Single condition

is_adult_in_rome = (df["age"] > 18) & (df["city"] == "Rome")
df.filter(is_adult_in_rome).show()       # AND

is_minor_or_senior = (df["age"] < 18) | (df["age"] > 65)
df.filter(is_minor_or_senior).show()     # OR

# where() behaves exactly like filter()
df.where(df["status"] == "active").show()

# Limit
first_hundred = df.limit(100)            # First 100 rows
first_hundred.show()

## 🎯 Selezione e Filtri

In [None]:
# Schema and basic information
df.printSchema()           # Column structure
df.show(n=20)              # First 20 rows (what show() prints by default)
df.show(n=5)               # First 5 rows
df.count()                 # Number of rows
df.columns                 # List of column names

# Descriptive statistics
basic_stats = df.describe()       # Count, mean, stddev, min, max
basic_stats.show()
detailed_stats = df.summary()     # More detailed
detailed_stats.show()

# Information about specific columns
unique_values = df.select("column_name").distinct()   # Unique values
unique_values.show()
counts_per_group = df.groupBy("column").count()       # Row count per group
counts_per_group.show()

## 🔍 Esplorazione Dati

In [None]:
# Read a CSV file: first line is the header, column types are inferred
df = spark.read.options(header=True, inferSchema=True).csv("path/to/file.csv")

# Read JSON
df = spark.read.format("json").load("path/to/file.json")

# Read Parquet
df = spark.read.format("parquet").load("path/to/file.parquet")

# Read from a table registered in the catalog
df = spark.table("database.table_name")

# 🔧 Databricks - Funzioni Essenziali per Esercizi

## 📊 Lettura Dati

# 📊 Databricks: Da Noob a Worker

## 🎯 Roadmap Completa per Diventare Esperto Databricks

### 📋 Indice:
1. **Setup & Basics** - Ambiente e concetti base
2. **Apache Spark Fundamentals** - Core engine 
3. **Data Engineering** - ETL e pipeline
4. **Machine Learning** - MLflow e AutoML
5. **Advanced Topics** - Performance tuning
6. **Best Practices** - Patterns produttivi

# 📊 Databricks: Da Noob a Worker

## 🎯 Roadmap Completa per Diventare Esperto Databricks

### 📋 Indice:
1. **Setup & Basics** - Ambiente e concetti base
2. **Apache Spark Fundamentals** - Core engine 
3. **Data Engineering** - ETL e pipeline
4. **Machine Learning** - MLflow e AutoML
5. **Advanced Topics** - Performance tuning
6. **Best Practices** - Patterns produttivi

In [None]:
# Load the retail dataset with the JSON reader and print the schema Spark infers for it.
# NOTE(review): confirm the files under this directory really are JSON - TODO verify.
retail_df = spark.read.json("data/retail-data/all/")
retail_df.printSchema()