# Apache Sedona - Ultimate FactoryException Fix

This notebook includes a robust Spark initialization that handles GeoAPI classpath issues.

**If you're getting FactoryException errors, run the cells in order.**

In [None]:
# Step 1: Check JAR availability and Java environment
import os
import glob
import subprocess

print("🔍 Environment Check:")
# Report the two environment variables, falling back to 'Not set'.
for env_name in ("JAVA_HOME", "SPARK_HOME"):
    print(f"{env_name}: {os.environ.get(env_name, 'Not set')}")

# Directory that is scanned for the required JARs
spark_jars_dir = "/opt/spark/jars/"
if not os.path.exists(spark_jars_dir):
    print("❌ Spark jars directory not found")
else:
    all_jars = glob.glob(spark_jars_dir + "*.jar")

    # Pair every path with its lower-cased file name so each category
    # below is a simple case-insensitive substring match on the file name.
    named_jars = [(path, os.path.basename(path).lower()) for path in all_jars]
    sedona_jars = [path for path, name in named_jars if 'sedona' in name]
    geoapi_jars = [path for path, name in named_jars if 'geoapi' in name]
    jts_jars = [path for path, name in named_jars if 'jts' in name]
    unit_jars = [path for path, name in named_jars if 'unit-api' in name]

    print("\n📦 JAR Status:")
    jar_report = (
        ("Sedona JARs", sedona_jars),
        ("GeoAPI JARs", geoapi_jars),
        ("JTS JARs", jts_jars),
        ("Unit API JARs", unit_jars),
    )
    for label, matches in jar_report:
        print(f"  {label}: {len(matches)} found")
        for path in matches:
            print(f"    - {os.path.basename(path)}")

    # Only the GeoAPI category decides the final verdict.
    if geoapi_jars:
        print("\n✅ GeoAPI JARs found - should work now!")
    else:
        print("\n❌ Missing GeoAPI JARs! This will cause FactoryException.")
        print("💡 Run: ./ultimate-geoapi-fix.sh to fix this issue")

In [None]:
# Step 2: Initialize Spark with explicit classpath configuration
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

def create_robust_spark_session():
    """Build a SparkConf for Sedona and return the resulting SparkSession.

    The configuration sets the application name, Kryo serialization with
    the Sedona registrator, the Sedona SQL extensions, memory limits and
    adaptive query execution. When ``/opt/spark/jars/`` exists, every
    ``*.jar`` file in it is listed in ``spark.jars``.

    Returns:
        The session from ``SparkSession.builder...getOrCreate()``, with
        its log level set to ``WARN``.
    """
    settings = (
        # Basic Spark settings
        ("spark.app.name", "SedonaRobustExample"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ("spark.kryo.registrator", "org.apache.sedona.core.serde.SedonaKryoRegistrator"),
        ("spark.sql.extensions", "org.apache.sedona.viz.sql.SedonaVizExtensions,org.apache.sedona.sql.SedonaSqlExtensions"),
        # Memory settings
        ("spark.driver.memory", "2g"),
        ("spark.executor.memory", "2g"),
        ("spark.driver.maxResultSize", "1g"),
        # Adaptive query execution
        ("spark.sql.adaptive.enabled", "true"),
        ("spark.sql.adaptive.coalescePartitions.enabled", "true"),
    )

    conf = SparkConf()
    for key, value in settings:
        conf.set(key, value)

    # Explicit JAR configuration: list every JAR found in the directory.
    jar_dir = "/opt/spark/jars/"
    if os.path.exists(jar_dir):
        jar_paths = glob.glob(jar_dir + "*.jar")
        conf.set("spark.jars", ",".join(jar_paths))
        print(f"📦 Configured {len(jar_paths)} JARs in classpath")

    session = SparkSession.builder.config(conf=conf).getOrCreate()
    session.sparkContext.setLogLevel("WARN")
    return session

# Create Spark session
print("🚀 Creating robust Spark session...")
try:
    spark = create_robust_spark_session()
    print("✅ Spark session created successfully!")
    print(f"   Spark Version: {spark.version}")
    application_id = spark.sparkContext.applicationId
    print(f"   Application ID: {application_id}")
except Exception as exc:
    # Report the failure, then re-raise so the cell still stops with the
    # original error.
    print(f"❌ Failed to create Spark session: {exc}")
    raise

In [None]:
# Step 3: Register Sedona with comprehensive error handling
def register_sedona_robust(spark):
    """Register Sedona on ``spark`` and report the outcome.

    Args:
        spark: The session passed to ``SedonaRegistrator.registerAll``.

    Returns:
        True when the import and registration complete; False when any
        exception is raised, after printing the error and its traceback.
    """
    try:
        # Imported here so a missing ``sedona`` package is reported like
        # any other registration failure.
        from sedona.register import SedonaRegistrator

        print("🔧 Registering Sedona functions...")
        SedonaRegistrator.registerAll(spark)
        print("✅ Sedona functions registered successfully!")
    except Exception as exc:
        import traceback

        print(f"❌ Failed to register Sedona: {exc}")
        print("\nDetailed error:")
        traceback.print_exc()
        return False

    return True

# Register Sedona
sedona_registered = register_sedona_robust(spark)

# One status line, chosen by the registration result.
print(
    "\n🎯 Sedona is ready for use!"
    if sedona_registered
    else "\n⚠️ Sedona registration failed - check error messages above"
)

In [None]:
# Step 4: Progressive testing to isolate FactoryException
def test_sedona_progressively(spark):
    """Run a fixed series of Sedona SQL queries and record each outcome.

    Every query is executed with ``spark.sql(...).collect()`` and its
    first row is printed. A failing query is reported and does not stop
    the queries after it.

    Args:
        spark: Object providing ``sql(query)`` whose result has ``collect()``.

    Returns:
        A list with one dict per query: ``{'test', 'status': 'SUCCESS',
        'result'}`` on success, or ``{'test', 'status': 'FAILED',
        'error'}`` on failure.
    """
    # Each case is (name, sql, description).
    cases = [
        (
            'Basic Point Creation',
            'SELECT ST_Point(1.0, 1.0) as point',
            'Creates a simple point geometry',
        ),
        (
            'Point Coordinates',
            'SELECT ST_X(ST_Point(-74.0, 40.7)) as x, ST_Y(ST_Point(-74.0, 40.7)) as y',
            'Extracts coordinates from point',
        ),
        (
            'Distance Calculation',
            'SELECT ST_Distance(ST_Point(0.0, 0.0), ST_Point(1.0, 1.0)) as distance',
            'Calculates Euclidean distance (may trigger FactoryException)',
        ),
        (
            'Polygon Creation',
            'SELECT ST_GeomFromWKT("POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))") as polygon',
            'Creates polygon from WKT',
        ),
        (
            'Point in Polygon',
            '''SELECT ST_Within(
                ST_Point(0.5, 0.5), 
                ST_GeomFromWKT("POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))")
            ) as within''',
            'Tests spatial relationship (may trigger FactoryException)',
        ),
    ]

    outcomes = []

    for number, (name, sql, description) in enumerate(cases, start=1):
        print(f"\n🧪 Test {number}: {name}")
        print(f"   Description: {description}")
        print(f"   SQL: {sql}")

        try:
            # Indexing stays inside the try so an empty result is
            # recorded as a failure too.
            first_row = spark.sql(sql).collect()[0]
            print(f"   ✅ SUCCESS: {first_row}")
            outcomes.append({'test': name, 'status': 'SUCCESS', 'result': str(first_row)})
        except Exception as exc:
            message = str(exc)
            # Only the first 100 characters are printed; the full message
            # is kept in the returned record.
            print(f"   ❌ FAILED: {message[:100]}...")

            if 'FactoryException' in message:
                print("   🎯 FOUND IT! This operation triggers FactoryException")
                print("   💡 This means GeoAPI JARs are still not properly loaded")

            outcomes.append({'test': name, 'status': 'FAILED', 'error': message})

    return outcomes

# Run progressive tests
if not sedona_registered:
    print("⏭️ Skipping tests because Sedona registration failed")
else:
    print("🔬 Running progressive Sedona tests...")
    test_results = test_sedona_progressively(spark)

    # Summary: count the records marked SUCCESS against the total.
    total = len(test_results)
    successful = sum(1 for outcome in test_results if outcome['status'] == 'SUCCESS')

    print(f"\n📊 Test Summary: {successful}/{total} tests passed")

    if successful != total:
        print("⚠️ Some tests failed. Check the errors above.")
        print("💡 If you see FactoryException, run: ./ultimate-geoapi-fix.sh")
    else:
        print("🎉 All tests passed! FactoryException is resolved!")

In [None]:
# Step 5: If Sedona registered successfully, show an example to continue with;
# otherwise print troubleshooting steps.
if sedona_registered:
    print("🎯 Sedona is working! You can now continue with your spatial analysis.")
    print("\n📝 Example spatial operations you can try:")
    # The lines below print a copy-pasteable snippet; nothing is executed.
    print("\n# Create sample spatial data")
    print("sample_data = spark.sql(\"\"\"")
    print("    SELECT ")
    print("        ST_Point(RAND() * 10 - 5, RAND() * 10 - 5) as geometry,")
    print("        CAST(RAND() * 100 AS INT) as value")
    print("    FROM range(100)")
    # Prints the closing `""")` of the snippet. The original line had a
    # stray trailing `")` that made the whole cell a SyntaxError.
    print("\"\"\")")
    print("\nsample_data.show(5)")
else:
    print("❌ Cannot continue - fix Sedona registration first")
    print("\n🔧 Troubleshooting steps:")
    print("1. Run: ./ultimate-geoapi-fix.sh")
    print("2. Restart this notebook kernel")
    print("3. Re-run all cells from the beginning")