### Complex Schema Validation with Avro
**Description**: Implement a solution in Python to validate records against a complex nested Avro schema.

Eg., Complex schema ( nested_schema.avsc ):

**Steps**:
1. Load schema
2. Example data to validate
3. Validate against schema
4. Read back to check

In [1]:
import json
import pandas as pd
from jsonschema import validate, ValidationError

# Step 1: Define the schema (JSON format for simplicity)
avro_schema = {
    "type": "object",
    "properties": {
        "customer_id": {"type": "integer"},
        "customer_name": {"type": "string"},
        "email": {"type": "string"},
        "date_joined": {"type": "string", "pattern": r"^\d{4}-\d{2}-\d{2}$"},  # YYYY-MM-DD format
        "amount_spent": {"type": "number"}
    },
    "required": ["customer_id", "customer_name", "email", "date_joined", "amount_spent"]
}

# Step 2: Create a sample dataset
data = [
    {"customer_id": 1, "customer_name": "John Doe", "email": "john@example.com", "date_joined": "2023-01-01", "amount_spent": 150.75},
    {"customer_id": 2, "customer_name": "Jane Doe", "email": "jane@example.com", "date_joined": "2023-02-01", "amount_spent": 200.50},
    {"customer_id": 3, "customer_name": "Sam Smith", "email": "sam@example.com", "date_joined": "2023-03-01", "amount_spent": 75.25},
    {"customer_id": 4, "customer_name": "Alice Brown", "email": "alice@example.com", "date_joined": "2023-04-01", "amount_spent": 300.00},
    # Invalid data (wrong date format)
    {"customer_id": 5, "customer_name": "Bob White", "email": "bob@example.com", "date_joined": "2023-31-01", "amount_spent": 120.50}
]

# Step 3: Function to validate data against the schema
def validate_data(data, schema):
    """
    Validates data against the provided schema.
    :param data: List of records (dictionaries)
    :param schema: JSON schema to validate against
    :return: List of errors or empty list if data is valid
    """
    errors = []
    for record in data:
        try:
            # Validate the record against the schema
            validate(instance=record, schema=schema)
        except ValidationError as e:
            errors.append(f"Validation error in record {record}: {e.message}")
    return errors

# Step 4: Validate the data
validation_errors = validate_data(data, avro_schema)

# Step 5: Output the results of the validation
if validation_errors:
    print("Validation Errors:")
    for error in validation_errors:
        print(error)
else:
    print("All data records are valid according to the schema.")

# Step 6: Handle the data in DataFrame for further processing (optional)
df = pd.DataFrame(data)

# Check for missing values or incorrect types in the DataFrame
missing_values = df.isnull().sum()
print("\nMissing Values in DataFrame:")
print(missing_values)

# Check if the data types are correct
print("\nData Types:")
print(df.dtypes)

# Optional: Perform transformation, e.g., converting 'date_joined' to datetime format
df['date_joined'] = pd.to_datetime(df['date_joined'], errors='coerce')

# Output the DataFrame
print("\nDataFrame after transformations:")
print(df)


All data records are valid according to the schema.

Missing Values in DataFrame:
customer_id      0
customer_name    0
email            0
date_joined      0
amount_spent     0
dtype: int64

Data Types:
customer_id        int64
customer_name     object
email             object
date_joined       object
amount_spent     float64
dtype: object

DataFrame after transformations:
   customer_id customer_name              email date_joined  amount_spent
0            1      John Doe   john@example.com  2023-01-01        150.75
1            2      Jane Doe   jane@example.com  2023-02-01        200.50
2            3     Sam Smith    sam@example.com  2023-03-01         75.25
3            4   Alice Brown  alice@example.com  2023-04-01        300.00
4            5     Bob White    bob@example.com         NaT        120.50
