In [6]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Start (or reuse) the Spark session used by the rest of this script
spark = (
    SparkSession.builder
    .appName("SchemaMismatchResolution")
    .getOrCreate()
)

# Schema the incoming data is expected to have; every column is nullable
expected_schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("salary", DoubleType(), True)
)

# Read the actual data, letting Spark infer column types from the file
# so they can be compared against expected_schema afterwards.
# Replace 'data.csv' with your actual data source
df = spark.read.options(header=True, inferSchema=True).csv("data.csv")

# Function to compare schemas
def compare_schemas(expected, actual):
    """Report the per-field differences between two schemas.

    Both arguments must expose a ``fields`` iterable whose items carry
    ``name`` and ``dataType`` attributes. Only names and data types are
    compared.

    Returns a dict keyed by field name. Each value is a dict with the keys
    ``"expected"`` and ``"actual"`` holding the respective data types; the
    side on which the field is absent is ``None``. Fields from ``expected``
    come first (in schema order), followed by fields found only in
    ``actual``. An empty dict means the schemas agree.
    """
    want = {f.name: f.dataType for f in expected.fields}
    have = {f.name: f.dataType for f in actual.fields}

    report = {}

    # Pass 1: every expected field is either missing or possibly mistyped.
    for name, want_type in want.items():
        if name not in have:
            report[name] = {"expected": want_type, "actual": None}
            continue
        have_type = have[name]
        if want_type != have_type:
            report[name] = {"expected": want_type, "actual": have_type}

    # Pass 2: fields present in the data that the expected schema lacks.
    report.update(
        (name, {"expected": None, "actual": have_type})
        for name, have_type in have.items()
        if name not in want
    )
    return report

# Compare the expected schema against what was actually loaded
mismatches = compare_schemas(expected_schema, df.schema)

# Display mismatches
if mismatches:
    print("Schema mismatches detected:")
    for field, type_info in mismatches.items():
        print(f"Field: {field}, Expected: {type_info['expected']}, Actual: {type_info['actual']}")
else:
    print("No schema mismatches detected.")

# Resolve mismatches so that df ends up with the expected columns and types.
# Each entry falls into exactly one of three cases, decided by which side
# of the comparison is None. Explicit `is not None` checks are used rather
# than truthiness because a data type object may itself be falsy (e.g. an
# empty StructType has length 0).
for field, type_info in mismatches.items():
    expected_type = type_info['expected']
    actual_type = type_info['actual']

    if expected_type is not None and actual_type is not None:
        # Column exists with the wrong type: cast it in place.
        try:
            df = df.withColumn(field, col(field).cast(expected_type))
        except Exception as e:
            # Best effort: report the failure and keep the column as loaded.
            print(f"Error casting field '{field}': {e}")
        else:
            print(f"Casted field '{field}' to {expected_type}")
    elif expected_type is not None:
        # Column is missing from the data: there is nothing to cast, so
        # add it as an all-null column of the expected type. (Referencing
        # col(field) here would fail because the column does not exist.)
        df = df.withColumn(field, lit(None).cast(expected_type))
        print(f"Added missing field '{field}' with type {expected_type}")
    elif actual_type is not None:
        # Column is not part of the expected schema: drop it.
        df = df.drop(field)
        print(f"Dropped unexpected field '{field}'")

# Proceed with further processing
df.show()

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/content/data.csv.