### Task 1: Detecting Missing Values during Data Ingestion
**Description**: You have a CSV file with missing values in some columns. Write a Python script to detect and report missing values during the ingestion process.

**Steps**:
1. Load data
2. Check for missing values
3. Report missing values

In [6]:
import os
import json
import pandas as pd

# ----------------------------
# Helper Functions
# ----------------------------

def load_csv_with_checks(file_path):
    """
    Read a CSV file into a DataFrame, rejecting missing or empty files.

    Args:
        file_path: Path to the CSV file.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing regular file.
        ValueError: If the parsed DataFrame is empty.
        Exception: Any error raised by ``pd.read_csv`` is printed and
            re-raised unchanged.
    """
    if os.path.isfile(file_path):
        try:
            frame = pd.read_csv(file_path)
            if not frame.empty:
                return frame
            # Raised inside the try on purpose so it is printed like any
            # other load failure before propagating.
            raise ValueError("CSV file is empty")
        except Exception as err:
            print(f"Failed to load CSV file: {err}")
            raise

    raise FileNotFoundError(f"File not found: {file_path}")

def load_json_with_checks(file_path):
    """
    Load and parse a JSON file, rejecting missing or empty input.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The parsed JSON document.

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing regular file.
        ValueError: If the parsed document is falsy (for example ``[]``,
            ``{}`` or ``null``). Malformed JSON also surfaces as a
            ValueError, because ``json.JSONDecodeError`` subclasses it.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    try:
        # Read bytes rather than text: json then detects UTF-8/16/32 (with or
        # without a BOM) itself instead of relying on the platform's default
        # text encoding, which is not UTF-8 everywhere.
        with open(file_path, 'rb') as f:
            data = json.load(f)
        if not data:
            raise ValueError("JSON file is empty")
        return data
    except Exception as e:
        # Report the failure here, then let the caller decide how to proceed.
        print(f"Failed to load JSON file: {e}")
        raise

def check_field_type(value, expected_type):
    """
    Report whether ``value`` is acceptable for ``expected_type``.

    Args:
        value: The value taken from a record.
        expected_type: A type (or tuple of types) as accepted by
            ``isinstance``.

    Returns:
        True when ``value`` is an instance of ``expected_type``, or when a
        float is expected and ``value`` is an int. A bool is never accepted
        where int or float is expected.
    """
    # bool is a subclass of int, so isinstance(True, int) is True. Without
    # this guard a JSON true/false would silently pass as a number.
    if isinstance(value, bool) and (expected_type is int or expected_type is float):
        return False
    if isinstance(value, expected_type):
        return True
    # Whole numbers are acceptable where a float is expected.
    return expected_type is float and isinstance(value, int)

def validate_schema_completeness(schema, sample_record):
    """
    Warn about fields that the sample record has but the schema does not.

    Prints a single warning line listing those fields; prints nothing when
    every field of ``sample_record`` is covered by ``schema``.
    """
    record_fields = set(sample_record.keys())
    schema_fields = set(schema.keys())
    uncovered = record_fields.difference(schema_fields)
    if not uncovered:
        return
    print(f"Warning: Schema missing fields: {uncovered}")

# ----------------------------
# Task 1: Detect Missing Values
# ----------------------------

def detect_missing_values(file_path):
    """
    Print a missing-value report for a CSV file.

    The report gives the total number of null cells and, per column, the
    count for every column that has at least one. If the file cannot be
    loaded, the reason is printed and the function returns without raising.
    """
    try:
        frame = load_csv_with_checks(file_path)
    except Exception as err:
        print(f"Cannot proceed with missing values detection: {err}")
        return

    nulls_per_column = frame.isnull().sum()
    nulls_overall = nulls_per_column.sum()

    if nulls_overall != 0:
        print(f"Total missing values in dataset: {nulls_overall}")
        print("Missing values per column:")
        # Only columns that actually contain nulls are listed.
        print(nulls_per_column[nulls_per_column > 0])
        return

    print("No missing values detected in the dataset.")

# ----------------------------
# Task 2: Validate Data Types in JSON
# ----------------------------

def validate_data_types(json_file, schema):
    """
    Validate every record in a JSON file against an expected schema.

    Args:
        json_file: Path to a JSON file whose root element is a list of
            records (JSON objects).
        schema: Mapping of field name to the expected type (or tuple of
            types) for that field.

    Returns:
        None. Findings are printed to stdout: one line per problem, or a
        confirmation that all records match. If the file cannot be loaded or
        its root is not a list, a message is printed and validation stops.
    """
    try:
        data = load_json_with_checks(json_file)
    except Exception as e:
        print(f"Cannot proceed with data type validation: {e}")
        return

    if not isinstance(data, list):
        print("Error: JSON root element should be a list of records.")
        return

    # Check schema completeness using the first object-shaped record as the
    # sample; a list element that is not an object has no fields to compare.
    sample = next((record for record in data if isinstance(record, dict)), None)
    if sample is not None:
        validate_schema_completeness(schema, sample)

    errors = []
    for i, record in enumerate(data):
        # A non-object element (string, number, list, null) cannot be checked
        # field by field; report it instead of crashing or, for strings,
        # silently doing a substring test.
        if not isinstance(record, dict):
            errors.append(
                f"Record {i}: expected an object, got {type(record).__name__}"
            )
            continue

        for field, expected_type in schema.items():
            if field not in record:
                errors.append(f"Record {i}: Missing field '{field}'")
            elif not check_field_type(record[field], expected_type):
                # A tuple of types has no __name__; fall back to its repr.
                expected_name = getattr(expected_type, "__name__", str(expected_type))
                errors.append(
                    f"Record {i}: Field '{field}' expected {expected_name}, "
                    f"got {type(record[field]).__name__}"
                )

    if errors:
        print("Data type validation errors found:")
        for error in errors:
            print(error)
    else:
        print("All records match the expected schema.")

# ----------------------------
# Task 3: Remove Duplicate Records in CSV
# ----------------------------

def remove_duplicates(file_path):
    """
    Report duplicate rows in a CSV file and write a de-duplicated copy.

    The cleaned data is saved in the current working directory under the
    input's base name prefixed with ``cleaned_``. If the file cannot be
    loaded, the reason is printed and the function returns without raising.
    """
    try:
        frame = load_csv_with_checks(file_path)
    except Exception as err:
        print(f"Cannot proceed with duplicate removal: {err}")
        return

    # duplicated() flags every repeat of a row after its first occurrence.
    repeated_rows = frame[frame.duplicated()]
    repeat_count = len(repeated_rows)
    print(f"Number of duplicate records found: {repeat_count}")
    if repeat_count > 0:
        print("Duplicate records:")
        print(repeated_rows)

    unique_rows = frame.drop_duplicates()
    print(f"Number of records after removing duplicates: {len(unique_rows)}")

    output_name = "".join(("cleaned_", os.path.basename(file_path)))
    unique_rows.to_csv(output_name, index=False)
    print(f"Cleaned data saved to {output_name}")

# ----------------------------
# Example usage
# ----------------------------

if __name__ == "__main__":
    # Inputs for the three demonstrations below.
    sample_csv = 'your_data.csv'
    sample_json = 'data.json'
    expected_schema = {
        "id": int,
        "name": str,
        "age": int,
        "email": str,
        "is_active": bool,
        "balance": float
    }

    # Task 1: report missing values in the CSV.
    print("=== Detect Missing Values ===")
    detect_missing_values(sample_csv)

    # Task 2: check the JSON records against the schema.
    print("\n=== Validate Data Types ===")
    validate_data_types(sample_json, expected_schema)

    # Task 3: drop duplicate rows from the CSV and save the result.
    print("\n=== Remove Duplicates ===")
    remove_duplicates(sample_csv)


=== Detect Missing Values ===
Total missing values in dataset: 3
Missing values per column:
name     1
age      1
email    1
dtype: int64

=== Validate Data Types ===
Data type validation errors found:
Record 1: Field 'age' expected int, got str

=== Remove Duplicates ===
Number of duplicate records found: 0
Number of records after removing duplicates: 5
Cleaned data saved to cleaned_your_data.csv


### Task 2: Validate Data Types during Extraction
**Description**: You have a JSON file in which each field is expected to have a specific data type. Write a script that validates whether the data types match the expected schema.

**Steps**:
1. Define expected schema
2. Validate data types

### Task 3: Remove Duplicate Records in Data
**Description**: You have a dataset with duplicate entries. Write a Python script to find and remove duplicate records using Pandas.

**Steps**:
1. Find duplicate records
2. Remove duplicates
3. Report results