In [1]:
import json

In [21]:
# Relative path to the raw JSON input that the following cells open and parse.
file_path = "Sample_data.json"

In [23]:
# Read the raw dataset. The encoding is pinned to UTF-8 so that parsing does not
# depend on the platform's locale-specific default text encoding.
with open(file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [25]:
# Pretty-print the parsed data with a 4-space indent so it can be inspected.
print(json.dumps(data, indent=4))

[
    {
        "id": 1,
        "name": "alice johnson",
        "age": 30,
        "email": "alice.johnson@example.com",
        "address": {
            "street": "123 Oak St",
            "city": "New York",
            "state": "NY",
            "zip": "10001"
        },
        "phone_numbers": [
            {
                "type": "home",
                "number": "212-555-1234"
            },
            {
                "type": "mobile",
                "number": "917-555-4321"
            }
        ],
        "employment": {
            "job_title": "Software Engineer",
            "company": "Tech Solutions",
            "years_employed": 5
        }
    },
    {
        "id": 2,
        "name": "Bob smith",
        "age": "thirty-five",
        "email": "bob.smith@example.com",
        "address": {
            "street": "456 Elm St",
            "city": "Los Angeles",
            "state": "CA",
            "zip": "90001"
        },
        "phone_numbers": [
            

In [27]:
# Function to fix data types in a record
def fix_data_types(record):
    """Return a copy of ``record`` whose 'age' is coerced from string to int.

    Only a string-valued 'age' is touched: it is converted with ``int()``,
    and replaced by ``None`` when the text is not a valid integer literal.
    Any other 'age' value (including a missing key) is left as it is.

    The input dict is not modified; a shallow copy is returned, so nested
    containers are still shared with the original record.

    Args:
        record: Mapping describing one record.

    Returns:
        A new dict with the same keys and a normalized 'age'.
    """
    # Work on a copy so the caller's raw record stays intact and re-running
    # the cleaning step always starts from the same input.
    fixed = dict(record)

    age = fixed.get('age')
    if isinstance(age, str):
        try:
            fixed['age'] = int(age)
        except ValueError:
            # Unparseable text such as "thirty-five": mark the age as unknown.
            fixed['age'] = None

    return fixed

# Run the type fix over every loaded record, then display the result.
cleaned_data = list(map(fix_data_types, data))
print(json.dumps(cleaned_data, indent=4))

[
    {
        "id": 1,
        "name": "alice johnson",
        "age": 30,
        "email": "alice.johnson@example.com",
        "address": {
            "street": "123 Oak St",
            "city": "New York",
            "state": "NY",
            "zip": "10001"
        },
        "phone_numbers": [
            {
                "type": "home",
                "number": "212-555-1234"
            },
            {
                "type": "mobile",
                "number": "917-555-4321"
            }
        ],
        "employment": {
            "job_title": "Software Engineer",
            "company": "Tech Solutions",
            "years_employed": 5
        }
    },
    {
        "id": 2,
        "name": "Bob smith",
        "age": null,
        "email": "bob.smith@example.com",
        "address": {
            "street": "456 Elm St",
            "city": "Los Angeles",
            "state": "CA",
            "zip": "90001"
        },
        "phone_numbers": [
            {
       

In [29]:
# Function to fill missing values with defaults
def fill_missing_values(record, default_age=0, default_email="no_email@example.com"):
    """Return a copy of ``record`` with placeholder values for missing fields.

    The input dict is not modified; a shallow copy is returned.

    Args:
        record: Mapping describing one record.
        default_age: Value stored when 'age' is absent or ``None``.
        default_email: Value stored when 'email' is absent or falsy
            (``None`` or an empty string).

    Returns:
        A new dict in which both 'age' and 'email' are populated.
    """
    # Copy first so the caller's record is left untouched.
    filled = dict(record)

    # Only None counts as "missing" here, so a legitimate age of 0 is kept.
    if filled.get('age') is None:
        filled['age'] = default_age

    # Any falsy email (missing key, None, "") is replaced by the placeholder.
    if not filled.get('email'):
        filled['email'] = default_email

    return filled

# Fill the gaps in every record, then display the result.
cleaned_data = list(map(fill_missing_values, cleaned_data))
print(json.dumps(cleaned_data, indent=4))

[
    {
        "id": 1,
        "name": "alice johnson",
        "age": 30,
        "email": "alice.johnson@example.com",
        "address": {
            "street": "123 Oak St",
            "city": "New York",
            "state": "NY",
            "zip": "10001"
        },
        "phone_numbers": [
            {
                "type": "home",
                "number": "212-555-1234"
            },
            {
                "type": "mobile",
                "number": "917-555-4321"
            }
        ],
        "employment": {
            "job_title": "Software Engineer",
            "company": "Tech Solutions",
            "years_employed": 5
        }
    },
    {
        "id": 2,
        "name": "Bob smith",
        "age": 0,
        "email": "bob.smith@example.com",
        "address": {
            "street": "456 Elm St",
            "city": "Los Angeles",
            "state": "CA",
            "zip": "90001"
        },
        "phone_numbers": [
            {
          

In [31]:
# Schema: the only top-level keys a record is allowed to keep.
valid_keys = {
    "id",
    "name",
    "age",
    "email",
    "address",
    "phone_numbers",
    "employment",
    "social_media",
}

# Function to remove extra fields not defined in schema
def remove_extra_fields(record):
    """Build a new dict holding only the entries of ``record`` whose key is in ``valid_keys``.

    The input record is not modified; entries keep their original order.
    """
    kept = {}
    for field, content in record.items():
        if field in valid_keys:
            kept[field] = content
    return kept

# Strip non-schema fields from every record, then display the result.
cleaned_data = list(map(remove_extra_fields, cleaned_data))
print(json.dumps(cleaned_data, indent=4))

[
    {
        "id": 1,
        "name": "alice johnson",
        "age": 30,
        "email": "alice.johnson@example.com",
        "address": {
            "street": "123 Oak St",
            "city": "New York",
            "state": "NY",
            "zip": "10001"
        },
        "phone_numbers": [
            {
                "type": "home",
                "number": "212-555-1234"
            },
            {
                "type": "mobile",
                "number": "917-555-4321"
            }
        ],
        "employment": {
            "job_title": "Software Engineer",
            "company": "Tech Solutions",
            "years_employed": 5
        }
    },
    {
        "id": 2,
        "name": "Bob smith",
        "age": 0,
        "email": "bob.smith@example.com",
        "address": {
            "street": "456 Elm St",
            "city": "Los Angeles",
            "state": "CA",
            "zip": "90001"
        },
        "phone_numbers": [
            {
          

In [33]:
# Function to normalize name fields (capitalize)
def normalize_name(record):
    """Return a copy of ``record`` with its 'name' converted to title case.

    A 'name' that is missing or is not a string (for example ``None``) is
    left unchanged instead of raising ``AttributeError``. The input dict is
    not modified; a shallow copy is returned.

    Note: ``str.title()`` treats apostrophes as word boundaries, so a value
    such as "they're" becomes "They'Re".

    Args:
        record: Mapping describing one record.

    Returns:
        A new dict with the same keys and a title-cased 'name' where applicable.
    """
    # Copy first so the caller's record is left untouched.
    normalized = dict(record)

    name = normalized.get('name')
    if isinstance(name, str):
        normalized['name'] = name.title()  # Capitalize each word in the name

    return normalized

# Title-case the name of every record, then display the result.
cleaned_data = list(map(normalize_name, cleaned_data))
print(json.dumps(cleaned_data, indent=4))

[
    {
        "id": 1,
        "name": "Alice Johnson",
        "age": 30,
        "email": "alice.johnson@example.com",
        "address": {
            "street": "123 Oak St",
            "city": "New York",
            "state": "NY",
            "zip": "10001"
        },
        "phone_numbers": [
            {
                "type": "home",
                "number": "212-555-1234"
            },
            {
                "type": "mobile",
                "number": "917-555-4321"
            }
        ],
        "employment": {
            "job_title": "Software Engineer",
            "company": "Tech Solutions",
            "years_employed": 5
        }
    },
    {
        "id": 2,
        "name": "Bob Smith",
        "age": 0,
        "email": "bob.smith@example.com",
        "address": {
            "street": "456 Elm St",
            "city": "Los Angeles",
            "state": "CA",
            "zip": "90001"
        },
        "phone_numbers": [
            {
          

In [35]:
import re

# Function to validate email format
def validate_email(email):
    """Return True when ``email`` looks like ``local@domain.tld``.

    This is a pragmatic format check, not full RFC validation. The whole
    string must match, so trailing or leading junk is rejected, and the
    top-level domain must consist of at least two ASCII letters.

    Args:
        email: Candidate value; anything that is not a string is invalid.

    Returns:
        bool: True if the value matches the expected shape, else False.
    """
    # Non-string input (None, numbers, ...) cannot be an address; reject it
    # rather than letting the regex engine raise TypeError.
    if not isinstance(email, str):
        return False

    # The TLD class is [A-Za-z]; a '|' inside a character class is a literal
    # pipe, not alternation, so it must not appear there.
    email_regex = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'

    # fullmatch anchors both ends, so "a@b.com, junk" no longer passes.
    return re.fullmatch(email_regex, email) is not None

# Function to validate age (e.g., ensure age is between 0 and 120)
def validate_age(age, min_age=0, max_age=120):
    """Return True when ``age`` is a number within ``[min_age, max_age]``.

    Values that are not ``int`` or ``float`` (``None``, strings, ...) are
    reported as invalid instead of raising ``TypeError`` on comparison.
    ``bool`` is rejected as well: it is a subclass of ``int`` in Python but
    is not a meaningful age.

    Args:
        age: Candidate value.
        min_age: Inclusive lower bound.
        max_age: Inclusive upper bound.

    Returns:
        bool: True if ``age`` is numeric and inside the bounds, else False.
    """
    if isinstance(age, bool) or not isinstance(age, (int, float)):
        return False
    return min_age <= age <= max_age

# Function to validate the entire record
def validate_record(record):
    """Print a message for each invalid field ('email', 'age') of ``record``.

    A missing 'email' is checked as an empty string and a missing 'age' as 0.
    The values are read once with ``dict.get`` and reused in the messages, so
    a record that lacks 'id' or 'email' is reported instead of raising
    ``KeyError`` while the message is being built.

    Args:
        record: Mapping describing one record.

    Returns:
        None. Problems are reported on standard output only.
    """
    # A missing id is rendered as "None" in the messages below.
    record_id = record.get("id")

    # Validate email
    email = record.get("email", "")
    if not validate_email(email):
        print(f"Invalid email for ID {record_id}: {email}")

    # Validate age
    age = record.get("age", 0)
    if not validate_age(age):
        print(f"Invalid age for ID {record_id}: {age}")

# Apply validation to the dataset.
# validate_record only prints messages for invalid fields, so no output from
# this loop means every record passed both checks.
for record in cleaned_data:
    validate_record(record)

In [37]:
# Function to remove duplicates based on 'id'
def remove_duplicates(dataset, key):
    """Return the records of ``dataset`` with later duplicates of ``key`` removed.

    The first record seen for each distinct ``record[key]`` value is kept and
    the original order is preserved. Records that do not contain ``key`` at
    all are always kept: with no identifier to compare, they cannot be called
    duplicates of one another.

    Args:
        dataset: Iterable of dict records.
        key: Field whose value identifies a record; values must be hashable.

    Returns:
        list: The retained records, in their original order.
    """
    # Sentinel that distinguishes "key absent" from an explicit None value.
    missing = object()

    seen = set()
    unique_records = []
    for record in dataset:
        record_key = record.get(key, missing)
        if record_key is missing:
            # No identifier: keep the record rather than silently dropping it.
            unique_records.append(record)
            continue
        if record_key not in seen:
            seen.add(record_key)
            unique_records.append(record)
    return unique_records

# Drop repeated ids (first occurrence wins), then display the result.
cleaned_data = remove_duplicates(dataset=cleaned_data, key='id')
print("Data after removing duplicates:", json.dumps(cleaned_data, indent=4))

Data after removing duplicates: [
    {
        "id": 1,
        "name": "Alice Johnson",
        "age": 30,
        "email": "alice.johnson@example.com",
        "address": {
            "street": "123 Oak St",
            "city": "New York",
            "state": "NY",
            "zip": "10001"
        },
        "phone_numbers": [
            {
                "type": "home",
                "number": "212-555-1234"
            },
            {
                "type": "mobile",
                "number": "917-555-4321"
            }
        ],
        "employment": {
            "job_title": "Software Engineer",
            "company": "Tech Solutions",
            "years_employed": 5
        }
    },
    {
        "id": 2,
        "name": "Bob Smith",
        "age": 0,
        "email": "bob.smith@example.com",
        "address": {
            "street": "456 Elm St",
            "city": "Los Angeles",
            "state": "CA",
            "zip": "90001"
        },
        "phone_numb

In [39]:
# Export the cleaned data to a JSON file.
# The path is relative, so the file is created in the current working
# directory; mode 'w' overwrites any existing file of the same name.
with open('cleaned_data.json', 'w') as f:
    json.dump(cleaned_data, f, indent=4)  # 4-space indent keeps the file readable

print("Cleaned data has been saved to 'cleaned_data.json'.")

Cleaned data has been saved to 'cleaned_data.json'.
