In [3]:
"""
GOLD PHASE - ECONOMIC EXTENSION ONLY - FIXED VERSION
Fixes ambiguous column reference error
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from datetime import datetime

# Reuse the active Spark session if one exists, otherwise start one.
spark = SparkSession.builder.getOrCreate()

# Announce the run with its wall-clock start time, then a divider line.
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Started Gold Economic Extension (FIXED): {started_at}")
print("=" * 80)

# ============================================================================
# 1. CHECK EXISTING GOLD TABLES
# ============================================================================
print("üìã CHECKING EXISTING GOLD TABLES...")
print("-" * 80)

# Names of every table returned by SHOW TABLES. The later sections consult
# `all_tables` and `gold_tables` to decide what to build or repair.
all_tables = [t.tableName for t in spark.sql("SHOW TABLES").collect()]
gold_tables = [t for t in all_tables if t.startswith('gold_')]

print(f"You have {len(gold_tables)} Gold tables:")
for table in sorted(gold_tables):
    # Named `row_count`, not `count`: the star import from
    # pyspark.sql.functions exposes a `count` function, and binding a
    # module-level `count` to an int would overwrite it for the rest of the
    # script.
    try:
        row_count = spark.table(table).count()
    except Exception:
        # Best-effort inventory: an unreadable table is reported, not fatal.
        # `except Exception` (rather than a bare `except`) still lets
        # KeyboardInterrupt / SystemExit stop the run.
        print(f"  ‚Ä¢ {table:30} {'ERROR':>10}")
    else:
        print(f"  ‚Ä¢ {table:30} {row_count:>10,} rows")

# ============================================================================
# 2. CHECK NEW SILVER TABLES
# ============================================================================
print("\nüì• CHECKING NEW ECONOMIC SILVER TABLES...")
print("-" * 80)

# Silver tables this section verifies before the Gold steps run.
silver_tables_needed = ["silver_gdp_cleaned", "silver_fx_cleaned"]
for table in silver_tables_needed:
    # `row_count` avoids rebinding the `count` function that the star import
    # from pyspark.sql.functions placed in this namespace.
    try:
        row_count = spark.table(table).count()
    except Exception:
        # Any failure to read/count the table is reported as "missing".
        # `except Exception` keeps KeyboardInterrupt / SystemExit working.
        print(f"‚ùå {table:25} MISSING - Run Silver transformation first!")
    else:
        print(f"‚úÖ {table:25} {row_count:>10,} rows")

# ============================================================================
# 3. CREATE GOLD DIMENSION: GDP
# ============================================================================
print("\n" + "="*80)
print("üåç CREATING GOLD DIMENSION: GDP")
print("="*80)

# Build the dimension only when it is not already listed among the Gold
# tables and its Silver source is present.
if "gold_dim_gdp" not in gold_tables and "silver_gdp_cleaned" in all_tables:
    try:
        # Read from Silver
        silver_gdp = spark.table("silver_gdp_cleaned")
        print(f"Silver GDP loaded: {silver_gdp.count():,} rows")

        # Create Gold dimension: pass the Silver columns through and add a
        # generated key plus a load timestamp.
        gold_dim_gdp = silver_gdp.select(
            monotonically_increasing_id().alias("gdp_key"),
            col("year"),
            col("country_code"),
            col("country_name"),
            col("gdp_usd"),
            col("indicator_name"),
            col("source"),
            current_timestamp().alias("created_at"),
        ).orderBy("year")

        # Save to Gold
        gold_dim_gdp.write.mode("overwrite").format("delta").saveAsTable("gold_dim_gdp")

        # Report from the persisted table rather than from `gold_dim_gdp`:
        # the DataFrame is lazy, so counting/showing it again would re-run
        # the query (fresh current_timestamp(), keys regenerated) instead of
        # confirming what was actually written.
        saved_gdp = spark.table("gold_dim_gdp")
        print(f"‚úÖ Created gold_dim_gdp: {saved_gdp.count():,} rows")
        saved_gdp.orderBy("year").show(5, truncate=False)

    except Exception as e:
        # Best-effort step: print a truncated message and let the remaining
        # sections run.
        print(f"‚ùå Error creating gold_dim_gdp: {str(e)[:200]}")
else:
    reason = 'already exists' if 'gold_dim_gdp' in gold_tables else 'needs silver_gdp_cleaned'
    print(f"‚ÑπÔ∏è gold_dim_gdp {reason}")

# ============================================================================
# 4. VERIFY GOLD DIMENSION: FX
# ============================================================================
print("\n" + "="*80)
print("üí± VERIFYING GOLD DIMENSION: FX")
print("="*80)

if "gold_dim_fx" in gold_tables:
    try:
        gold_fx = spark.table("gold_dim_fx")
        print(f"‚úÖ gold_dim_fx: {gold_fx.count():,} rows")

        # One aggregation for both bounds instead of two separate jobs.
        # `min` / `max` here are the pyspark.sql.functions aggregates brought
        # in by the star import, not the Python builtins.
        first_date, last_date = gold_fx.agg(min("rate_date"), max("rate_date")).first()
        print(f"   Date range: {first_date} to {last_date}")
    except Exception as e:
        # Verification only: report a truncated message and carry on.
        print(f"‚ùå Error checking gold_dim_fx: {str(e)[:200]}")
else:
    print("‚ÑπÔ∏è gold_dim_fx not found in Gold layer")

# ============================================================================
# 5. FIX GOLD FACT: REVENUE EUR - COMPLETELY FIXED VERSION
# ============================================================================
print("\n" + "="*80)
print("üí∞ FIXING GOLD FACT: REVENUE EUR")
print("="*80)

# Rebuilds gold_fact_revenue_eur from the daily taxi fact, the date dimension
# and the FX dimension, but only when the existing table has NULL EUR revenue.
if "gold_fact_revenue_eur" in gold_tables and "gold_dim_fx" in gold_tables:
    try:
        # Load tables
        gold_revenue = spark.table("gold_fact_revenue_eur")
        gold_fx = spark.table("gold_dim_fx")
        gold_taxi = spark.table("gold_fact_taxi_daily")
        dim_date = spark.table("gold_dim_date")

        # Count once and reuse: avoids a second Spark job and lets the
        # percentage below be guarded against an empty table.
        total_rows = gold_revenue.count()
        print(f"Current gold_fact_revenue_eur: {total_rows:,} rows")

        # Check NULL EUR values
        null_eur_count = gold_revenue.filter(col("total_revenue_eur").isNull()).count()
        null_pct = null_eur_count / total_rows * 100 if total_rows else 0.0
        print(f"Rows with NULL EUR revenue: {null_eur_count:,} ({null_pct:.1f}%)")

        if null_eur_count > 0:
            print("üîÑ Fixing NULL EUR values...")

            # Derive an integer yyyyMMdd key from rate_date so FX rows can be
            # matched against date_key.
            # NOTE(review): assumes date_key is an integer in yyyyMMdd form
            # and that gold_dim_fx has one row per rate_date - confirm.
            fx_with_key = gold_fx.withColumn(
                "fx_date_key",
                date_format(col("rate_date"), "yyyyMMdd").cast("integer")
            )

            taxi_with_dates = gold_taxi.join(
                dim_date,
                gold_taxi["date_key"] == dim_date["date_key"],
                "inner"
            )

            # Columns are taken from the *source* DataFrames. The joined
            # frame carries `date_key` from both gold_taxi and dim_date, so
            # looking it up by name on `taxi_with_dates` is ambiguous.
            taxi_date_key = gold_taxi["date_key"]
            total_fare = gold_taxi["total_fare"]
            avg_fare = gold_taxi["avg_fare"]
            usd_eur_rate = fx_with_key["usd_eur_rate"]

            # A NULL rate (no FX match in the left join) propagates through
            # the arithmetic, so the EUR columns come out NULL without an
            # explicit when/otherwise guard.
            # NOTE(review): dividing by usd_eur_rate presumes the rate is
            # quoted as USD per EUR - confirm against gold_dim_fx.
            fixed_revenue = taxi_with_dates.join(
                fx_with_key,
                taxi_date_key == fx_with_key["fx_date_key"],
                "left"
            ).select(
                taxi_date_key.alias("date_key"),
                dim_date["calendar_date"].alias("trip_date"),
                gold_taxi["total_trips"].alias("trip_count"),
                total_fare.alias("total_revenue_usd"),
                round(total_fare / usd_eur_rate, 2).alias("total_revenue_eur"),
                avg_fare.alias("avg_fare_usd"),
                round(avg_fare / usd_eur_rate, 2).alias("avg_fare_eur"),
                usd_eur_rate.alias("conversion_rate"),
                # Percentage difference between the EUR and USD amounts.
                round(
                    (total_fare / usd_eur_rate - total_fare) / total_fare * 100,
                    2
                ).alias("revenue_variance_pct"),
            ).orderBy("date_key")

            # Save fixed version
            fixed_revenue.write.mode("overwrite").format("delta").saveAsTable("gold_fact_revenue_eur")
            print(f"‚úÖ Fixed gold_fact_revenue_eur: {fixed_revenue.count():,} rows")
            print("Fixed revenue sample:")
            fixed_revenue.show(5, truncate=False)
        else:
            print("‚úÖ All EUR values already populated - no fix needed")

    except Exception as e:
        # Best-effort step: report the failure and let later sections run.
        print(f"‚ùå Error fixing revenue: {str(e)}")
else:
    missing = [
        name
        for name in ("gold_fact_revenue_eur", "gold_dim_fx")
        if name not in gold_tables
    ]
    print(f"‚ÑπÔ∏è Cannot fix revenue - missing: {missing}")

# ============================================================================
# 6. CREATE ECONOMIC CONTEXT TABLE - FIXED VERSION
# ============================================================================
print("\n" + "="*80)
print("üìà CREATING ECONOMIC CONTEXT TABLE")
print("="*80)

# Joins the daily taxi fact to the date dimension and then to GDP by year,
# and writes the result to gold_fact_economic_context when it is non-empty.
try:
    # Load tables
    dim_date = spark.table("gold_dim_date")
    dim_gdp = spark.table("gold_dim_gdp")
    fact_taxi = spark.table("gold_fact_taxi_daily")

    # NOTE(review): the GDP join is on year only. If gold_dim_gdp holds more
    # than one country per year, each taxi day appears once per country -
    # confirm this is intended.
    joined = fact_taxi.join(
        dim_date,
        fact_taxi["date_key"] == dim_date["date_key"],
        "inner"
    ).join(
        dim_gdp,
        dim_date["year"] == dim_gdp["year"],
        "left"
    ).filter(
        # Applied before the projection, while gdp_usd is still a column of
        # the frame. Rows without a GDP figure are dropped here, so the ratio
        # below needs no NULL guard.
        dim_gdp["gdp_usd"].isNotNull()
    )

    economic_context = joined.select(
        fact_taxi["date_key"].alias("date_key"),
        dim_date["year"].alias("year"),
        dim_date["month"].alias("month"),
        fact_taxi["total_trips"].alias("total_trips"),
        fact_taxi["total_fare"].alias("daily_revenue_usd"),
        dim_gdp["gdp_usd"].alias("annual_gdp_usd"),
        # The day's fare is scaled by 365 before being divided by annual GDP,
        # then expressed as a percentage.
        round(
            fact_taxi["total_fare"] * 365 / dim_gdp["gdp_usd"] * 100, 6
        ).alias("daily_revenue_as_%_of_gdp"),
        dim_gdp["country_name"].alias("country_name"),
        current_timestamp().alias("calculated_at"),
    ).distinct().orderBy("date_key")

    # Save if not empty; count once and reuse it for the report.
    row_count = economic_context.count()
    if row_count > 0:
        economic_context.write.mode("overwrite").format("delta").saveAsTable("gold_fact_economic_context")
        print(f"‚úÖ Created gold_fact_economic_context: {row_count:,} rows")
        economic_context.show(5, truncate=False)
    else:
        print("‚ÑπÔ∏è No overlapping data for economic context")

except Exception as e:
    # Best-effort step: print a truncated message and continue.
    print(f"‚ùå Error creating economic context: {str(e)[:200]}")

# ============================================================================
# 7. FINAL VERIFICATION
# ============================================================================
# NOTE: this banner is printed unconditionally; the earlier sections catch
# and print their own errors, so "COMPLETE" does not imply every step worked.
print(f"""
{'='*80}
‚úÖ GOLD ECONOMIC EXTENSION COMPLETE!
{'='*80}

üìä FINAL GOLD LAYER INVENTORY:
""")

# Re-query the catalog so tables created by this run are included.
final_gold = [t for t in spark.sql("SHOW TABLES").collect() if t.tableName.startswith('gold_')]
for table in sorted(final_gold, key=lambda x: x.tableName):
    try:
        row_count = spark.table(table.tableName).count()
    except Exception:
        # Best-effort inventory: an unreadable table is listed as ERROR.
        # `except Exception` (not a bare `except`) still lets
        # KeyboardInterrupt / SystemExit stop the run.
        print(f"‚Ä¢ {table.tableName:35} {'ERROR':>10}")
    else:
        print(f"‚Ä¢ {table.tableName:35} {row_count:>10,} rows")

print(f"""
üéØ PROJECT COMPLETION STATUS:
‚Ä¢ ‚úÖ Bronze Layer: Complete
‚Ä¢ ‚úÖ Silver Layer: Complete  
‚Ä¢ ‚úÖ Gold Layer: Complete with economic data
‚Ä¢ ‚úÖ All 4 data sources integrated (Taxi, Air Quality, GDP, FX)
‚Ä¢ ‚úÖ Star schema ready for analytics

üìà READY FOR PHASE 4 - ANALYTICS & VISUALIZATION:

Build these Power BI dashboards:
1. Mobility Dashboard - Taxi trips, revenue, patterns
2. Air Quality Dashboard - Pollution trends, hotspots  
3. Economic Impact Dashboard - USD/EUR revenue, GDP context
4. Correlation Dashboard - Taxi vs. Pollution vs. Economy

üîó GOLD TABLES READY FOR POWER BI:
‚Ä¢ gold_fact_taxi_daily - Daily taxi metrics
‚Ä¢ gold_fact_air_quality_daily - Daily pollution levels
‚Ä¢ gold_fact_revenue_eur - Revenue in USD/EUR
‚Ä¢ gold_fact_economic_context - Revenue in GDP context
‚Ä¢ gold_bridge_taxi_air_quality - Correlation analysis

üèÅ Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
{'='*80}
""")

StatementMeta(, 4a26e0e0-7953-4807-a188-1a2488265e8a, 5, Finished, Available, Finished)

Started Gold Economic Extension (FIXED): 2025-12-20 16:20:00
üìã CHECKING EXISTING GOLD TABLES...
--------------------------------------------------------------------------------
You have 12 Gold tables:
  ‚Ä¢ gold_bridge_taxi_air_quality           37 rows
  ‚Ä¢ gold_dim_date                         307 rows
  ‚Ä¢ gold_dim_fx                         6,904 rows
  ‚Ä¢ gold_dim_gdp                           50 rows
  ‚Ä¢ gold_dim_location                   1,896 rows
  ‚Ä¢ gold_dim_pollutant                      4 rows
  ‚Ä¢ gold_dim_taxi_zone                    262 rows
  ‚Ä¢ gold_dim_zone                         514 rows
  ‚Ä¢ gold_fact_air_quality_daily         1,732 rows
  ‚Ä¢ gold_fact_economic_context             35 rows
  ‚Ä¢ gold_fact_revenue_eur                  35 rows
  ‚Ä¢ gold_fact_taxi_daily                   35 rows

üì• CHECKING NEW ECONOMIC SILVER TABLES...
--------------------------------------------------------------------------------
‚úÖ silver_gdp_cleaned           