In [None]:
import pandas as pd
import os

# Column name -> pandas dtype name that each input CSV is checked against.
expected_schema = dict(
    id='int64',
    name='object',
    age='int64',
    email='object',
)

# Directory that is scanned for input CSV files.
data_folder = 'data/input_files/'

def check_schema(file_path, schema=None):
    """Compare the column dtypes of a CSV file against an expected schema.

    Args:
        file_path: Path of the CSV file; it is loaded with ``pd.read_csv``.
        schema: Optional mapping of column name to expected dtype name
            (for example ``'int64'``). Defaults to the module-level
            ``expected_schema``.

    Returns:
        A ``(mismatches, extra_columns, missing_columns)`` tuple.
        ``mismatches`` maps every column that is present in the file with a
        different dtype to ``{'expected': ..., 'actual': ...}``.
        ``extra_columns`` is the set of columns found in the file but not in
        the schema, and ``missing_columns`` is the set of schema columns that
        the file does not contain.
    """
    if schema is None:
        schema = expected_schema

    df = pd.read_csv(file_path)

    # Normalize dtypes to their string names so they compare against the schema.
    actual_schema = {col: str(dtype) for col, dtype in df.dtypes.items()}

    # Only columns that exist in the file can have a type mismatch. Absent
    # columns are reported once, through missing_columns, rather than also
    # showing up here with 'actual': None.
    mismatches = {
        col: {'expected': expected_dtype, 'actual': actual_schema[col]}
        for col, expected_dtype in schema.items()
        if col in actual_schema and actual_schema[col] != expected_dtype
    }

    extra_columns = set(actual_schema) - set(schema)
    missing_columns = set(schema) - set(actual_schema)

    return mismatches, extra_columns, missing_columns

# Validate every CSV file in the input folder and print one report per file.
# sorted() keeps the report order stable, because os.listdir returns the
# directory entries in arbitrary order.
for file_name in sorted(os.listdir(data_folder)):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(data_folder, file_name)
    try:
        mismatches, extra, missing = check_schema(file_path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # A single empty or malformed file must not abort the validation of
        # the remaining files; report it and move on.
        print(f"\nCould not parse file: {file_name} ({exc})")
        continue

    if not (mismatches or extra or missing):
        print(f"{file_name} passed schema validation.")
        continue

    print(f"\nSchema issues in file: {file_name}")
    if mismatches:
        print("Type mismatches:", mismatches)
    if extra:
        print("Extra columns:", extra)
    if missing:
        print("Missing columns:", missing)