## <span style="color:brown"><strong> Data Validation </strong></span>

In [None]:
# Import necessary modules
from datetime import datetime
from collections import Counter
import pandas as pd
# from perfect import task
from extract_api_data import extract_data_from_api

In [None]:
# Fetch the raw records through extract_data_from_api (imported from extract_api_data).
# NOTE(review): presumably this returns a pandas DataFrame, since the validation
# helpers in this notebook take a DataFrame argument named df — confirm against the helper.
df = extract_data_from_api()

#### UTILITIES

In [None]:
# Check valid Schema
def fn_check_valid_schema(df):
    '''
    Check whether the DataFrame contains exactly the expected columns for the Cambridge Police dataset.

    Column order is ignored. The comparison is made on the multiset of column
    labels, so a duplicated column counts as a mismatch as well.

    Parameters:
        df: DataFrame whose column labels are compared against the expected schema.

    Raises:
        ValueError: if an expected column is missing, or an unexpected or
            duplicated column is present. The message lists both groups.
    '''
    schema_cols = ['date_time', 'id', 'type', 'subtype', 'location', 'last_updated', 'description']

    expected = Counter(schema_cols)
    actual = Counter(df.columns)

    if actual != expected:
        # Counter subtraction keeps only positive counts, so each difference
        # holds exactly the labels that are short (or surplus) on that side.
        missing = list(expected - actual)
        unexpected = list(actual - expected)
        raise ValueError(
            'DataFrame Schema does not match with the expected schema.'
            f' Missing columns: {missing}; unexpected columns: {unexpected}.'
        )
    
# Check Numeric DataType
def fn_check_numeric_id(df):
    '''
    Coerce the 'id' column to a numeric dtype, modifying the DataFrame in place.

    Any 'id' value that cannot be parsed as a number becomes NaN; the intent is
    that such rows are dropped later, during the data transformations.

    Returns the same DataFrame object that was passed in.
    '''
    coerced_ids = pd.to_numeric(df['id'], errors='coerce')
    df['id'] = coerced_ids
    return df

# verify Date Format
def fn_verify_datetime(df):
    '''
    Verify 'date_time' values follow ISO 8601 format (https://www.iso.org/iso-8601-date-and-time-format.html).

    Each value is parsed with datetime.fromisoformat, so "valid" means whatever
    that parser accepts on the running Python version (the accepted subset of
    ISO 8601 was widened in Python 3.11). Nothing is returned or modified.

    Parameters:
        df: DataFrame with a 'date_time' column of strings.

    Raises:
        ValueError: if any 'date_time' value is not a string (this includes
            missing values) or cannot be parsed. The offending value is named
            in the message.
        KeyError: if the DataFrame has no 'date_time' column.
    '''
    # Only one column is inspected, so walk it directly instead of building a
    # Series for every row with a row-wise apply.
    for value in df['date_time']:
        try:
            datetime.fromisoformat(value)
        except (TypeError, ValueError) as err:
            # fromisoformat raises TypeError for non-string input (None, NaN,
            # numbers); normalise to the documented ValueError.
            raise ValueError(
                f"Invalid ISO 8601 value in 'date_time': {value!r}"
            ) from err

# Check missing values
def fn_check_missing_values(df):
    '''
    Ensure the columns that must always carry data contain no nulls.

    Every police log entry is expected to have a datetime, an ID, an incident
    type and a location. Columns are examined in that order and a ValueError
    naming the first column found with missing values is raised.
    '''
    for required_col in ('date_time', 'id', 'type', 'location'):
        has_gaps = df[required_col].isnull().any()
        if has_gaps:
            raise ValueError(f"Missing values are present in the '{required_col}' attribute.")

In [None]:
### VALIDATION CHECK
def fn_validate_data(df):
    '''
    Run the data quality checks on the DataFrame and return it.

    Checks, in the order they are executed:
    - schema is valid
    - IDs are numeric (non-numeric IDs are coerced to NaN)
    - no missing values in columns that require data
    - datetime follows ISO 8601 format

    Returns the DataFrame handed back by fn_check_numeric_id, i.e. with the
    'id' column converted to numeric. Errors raised by the individual checks
    propagate to the caller unchanged.
    '''
    fn_check_valid_schema(df)

    df = fn_check_numeric_id(df)

    # Look for missing values before verifying the datetime format: a missing
    # 'date_time' is not a string, so datetime.fromisoformat would reject it
    # with a TypeError instead of the clearer missing-values ValueError.
    # NOTE(review): IDs coerced to NaN by fn_check_numeric_id are reported here
    # as missing 'id' values, which seems at odds with removing them
    # downstream — confirm which behavior is intended.
    fn_check_missing_values(df)

    fn_verify_datetime(df)

    return df