In [None]:
# Production ETL Configuration
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from datetime import datetime, timedelta
import json

# Build (or reuse) the Spark session; both adaptive-execution settings are
# passed as the string "true".
spark = (
    SparkSession.builder
    .appName("ProductionDailyETL")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .getOrCreate()
)

# Run-level settings shared by the cells below. run_id is stamped once, here,
# from the driver's local clock (datetime.now() is timezone-naive).
PROD_CONFIG = dict(
    environment="PRODUCTION",
    run_id=datetime.now().strftime("%Y%m%d_%H%M%S"),
    source_lakehouse="ProductionData",
    retry_count=3,
    alert_on_failure=True,
)

# Start-of-run banner
print(f"üè≠ Production ETL Started")
print(f"üìÖ Run ID: {PROD_CONFIG['run_id']}")
print(f"‚öôÔ∏è Environment: {PROD_CONFIG['environment']}")

## 🔐 Data Quality Gates

In [None]:
class DataQualityChecker:
    """Production data quality validation framework.

    Runs individual checks against ``df`` and records one human-readable line
    per outcome in ``checks_passed`` or ``checks_failed``. ``report()`` prints
    both lists and tells the caller whether the failure list is empty.

    Args:
        df: DataFrame-like object. The checks call ``count()``, ``filter()``,
            ``select()`` and ``distinct()`` on it.
        name: Label printed in the report heading.
    """

    def __init__(self, df, name):
        self.df = df
        self.name = name
        self.checks_passed = []
        self.checks_failed = []
        # (frame, row_count) pair from the most recent full count, or None.
        self._row_count_cache = None

    def _total_rows(self):
        """Return ``self.df.count()``, computed at most once per frame object.

        Every check needs the total row count. Counting is a full job on the
        frame, so the result is memoised and only recomputed when ``self.df``
        has been rebound to a different object.
        """
        cached = getattr(self, "_row_count_cache", None)
        if cached is None or cached[0] is not self.df:
            cached = (self.df, self.df.count())
            self._row_count_cache = cached
        return cached[1]

    def check_row_count(self, min_rows=1):
        """Ensure the frame holds at least ``min_rows`` rows.

        Returns:
            True when the row count is >= ``min_rows``, otherwise False. The
            outcome is also appended to the passed/failed list.
        """
        count = self._total_rows()
        if count >= min_rows:
            self.checks_passed.append(f"Row count: {count} >= {min_rows}")
            return True
        self.checks_failed.append(f"Row count: {count} < {min_rows}")
        return False

    def check_null_percentage(self, column, max_null_pct=5):
        """Check that at most ``max_null_pct`` percent of ``column`` is null.

        An empty frame is treated as 0% null, so it passes any threshold.

        Returns:
            True when the null percentage is <= ``max_null_pct``, otherwise
            False. The outcome is also appended to the passed/failed list.
        """
        total = self._total_rows()
        nulls = self.df.filter(col(column).isNull()).count()
        # Guard the division: an empty frame has no nulls to report.
        null_pct = (nulls / total) * 100 if total > 0 else 0

        if null_pct <= max_null_pct:
            self.checks_passed.append(f"{column} null%: {null_pct:.2f}% <= {max_null_pct}%")
            return True
        self.checks_failed.append(f"{column} null%: {null_pct:.2f}% > {max_null_pct}%")
        return False

    def check_unique(self, column):
        """Verify that ``column`` has as many distinct values as there are rows.

        Returns:
            True when the distinct count equals the row count, otherwise
            False. The failure line reports how many rows are surplus.
        """
        total = self._total_rows()
        unique = self.df.select(column).distinct().count()

        if total == unique:
            self.checks_passed.append(f"{column} is unique")
            return True
        self.checks_failed.append(f"{column} has duplicates: {total - unique}")
        return False

    def report(self):
        """Print every recorded outcome and summarise the result.

        Returns:
            True when no check has failed. This is also True when no check
            has been run at all.
        """
        print(f"\nüìã Data Quality Report: {self.name}")
        print("=" * 50)
        print(f"‚úÖ Passed: {len(self.checks_passed)}")
        for check in self.checks_passed:
            print(f"   ‚Ä¢ {check}")
        print(f"‚ùå Failed: {len(self.checks_failed)}")
        for check in self.checks_failed:
            print(f"   ‚Ä¢ {check}")

        return len(self.checks_failed) == 0

print("‚úÖ Data Quality Framework initialized")

## 📊 Load and Validate Source Data

In [None]:
# Read the sales transactions Delta table; later cells reuse df_transactions.
df_transactions = (
    spark.read
    .format("delta")
    .load("Tables/sales_transactions")
)

# Quality gate: each check appends its outcome to the checker's pass/fail lists.
checker = DataQualityChecker(df=df_transactions, name="Sales Transactions")
checker.check_row_count(10)
checker.check_null_percentage(column="customer_id", max_null_pct=1)
checker.check_null_percentage(column="total_amount", max_null_pct=0)
checker.check_unique(column="transaction_id")

# report() prints the summary and is True only when nothing failed.
quality_passed = checker.report()

# Stop the notebook here rather than publish data that failed validation.
if not quality_passed:
    raise Exception("‚ùå Data quality checks failed! Pipeline halted.")

## 🔄 Incremental Processing

In [None]:
# Get watermark for incremental load.
# Fallback used when no watermark has been recorded yet; it predates any real
# data, so the filter below keeps every non-null processed_timestamp.
DEFAULT_WATERMARK = "1900-01-01"

try:
    watermark_df = spark.read.format("delta").load("Tables/etl_watermarks")
    # `max` here is pyspark.sql.functions.max (brought in by the star import),
    # not the Python builtin.
    last_watermark = watermark_df.filter(col("pipeline_name") == "daily_etl") \
        .select(max("watermark_value")).collect()[0][0]
except Exception as exc:
    # Best-effort lookup: fall back to a full load, but say why, so a
    # transient read failure is not mistaken for a first run.
    print(f"WARNING: could not read watermark ({type(exc).__name__}: {exc}); "
          f"falling back to {DEFAULT_WATERMARK}")
    last_watermark = DEFAULT_WATERMARK

# The aggregate yields None when the table has no row for this pipeline.
# Comparing a column against None is never true, which would silently filter
# out every record, so treat "no watermark recorded" as a first run.
if last_watermark is None:
    last_watermark = DEFAULT_WATERMARK

print(f"üìç Last watermark: {last_watermark}")

# Filter for new/updated records
df_incremental = df_transactions.filter(
    col("processed_timestamp") > last_watermark
)

new_records = df_incremental.count()
print(f"üìà New records to process: {new_records}")

# Informational only: the downstream cell checks new_records before writing.
if new_records == 0:
    print("‚ÑπÔ∏è No new records to process. Pipeline complete.")

## 📤 Update Production Tables

In [None]:
# Create aggregated views for reporting.
# The summary is rebuilt from the full transaction set and overwritten, so the
# incremental count only decides WHETHER to refresh, not what goes into it.
if new_records > 0:
    # Daily summary: one row per calendar date of transaction_date
    daily_summary = df_transactions.groupBy(
        to_date(col("transaction_date")).alias("date")
    ).agg(
        count("*").alias("transaction_count"),
        sum("total_amount").alias("daily_revenue"),
        countDistinct("customer_id").alias("unique_customers"),
        avg("total_amount").alias("avg_transaction")
    ).orderBy("date")

    # Write to production tables
    daily_summary.write \
        .format("delta") \
        .mode("overwrite") \
        .save("Tables/daily_revenue_summary")

    print("‚úÖ Production tables updated successfully")

# Update watermark
new_watermark = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Persist the watermark so the next run's incremental filter actually advances.
# Without this write the lookup cell reads the same old value on every run.
# The row uses the two columns the lookup cell reads (pipeline_name,
# watermark_value); appending keeps history and the reader takes the max.
# NOTE(review): assumes watermark_value is stored as a string in this format,
# and that any other columns in etl_watermarks are nullable -- confirm against
# the existing table schema.
# NOTE(review): the value is the driver's wall-clock time, not the latest
# processed_timestamp seen; confirm late-stamped records cannot be skipped.
watermark_row = spark.createDataFrame(
    [("daily_etl", new_watermark)],
    ["pipeline_name", "watermark_value"],
)
watermark_row.write \
    .format("delta") \
    .mode("append") \
    .save("Tables/etl_watermarks")

print(f"üìç New watermark: {new_watermark}")

## 📧 Pipeline Completion Report

In [None]:
# Assemble the end-of-run summary from values produced by earlier cells.
# This cell is only reached when nothing above raised, hence the fixed status.
completion_report = dict(
    pipeline_name="Production Daily ETL",
    run_id=PROD_CONFIG['run_id'],
    status="SUCCESS",
    records_processed=new_records,
    quality_checks_passed=quality_passed,
    completion_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
)

# Print the summary framed by 60-character rules, one "key: value" per line.
_rule = "=" * 60
print("\n" + _rule)
print("üè≠ PRODUCTION ETL COMPLETION REPORT")
print(_rule)
for key, value in completion_report.items():
    print(f"  {key}: {value}")
print(_rule)
print("\nüéâ Production ETL completed successfully!")