In [1]:
'''
Author: Ngawang Gurung
Date: 2024-06-05
'''

'\nAuthor: Ngawang Gurung\nDate: 2024-06-05\n'

### Import Libraries

In [2]:
import pandas as pd
import numpy as np

### Creating a Synthetic DataFrame

In [3]:
# Deliberately messy sample data: a repeated and a negative account number,
# a non-numeric and an empty account number, a numeric and a missing user,
# and dates written in more than one form.
df = pd.DataFrame(
    dict(
        account_number=[1, 1, -3, 'four', ''],
        user=['Alice', 2, 'Charlie', np.nan, 'Ethan'],
        DOB=['2020-01-01', '2021-12-31', 'June 5, 2024', '2021-12-31', '!'],
    )
)

df.head()

Unnamed: 0,account_number,user,DOB
0,1,Alice,2020-01-01
1,1,2,2021-12-31
2,-3,Charlie,"June 5, 2024"
3,four,,2021-12-31
4,,Ethan,!


### Data Integrity 

#### Null check for Entity Integrity

In [4]:
def null_check(df, column = None):
    """Report missing values in one column or across the whole dataframe.

    The dataframe is first passed through ``replace_disguised_missing`` so
    that placeholder strings count as missing; the caller's dataframe is not
    modified because that helper returns a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to inspect.
    column : hashable, optional
        Label of the column to inspect. When omitted (``None``) every column
        is inspected. Falsy labels such as ``0`` are valid column labels.

    Returns
    -------
    pandas.DataFrame
        Rows of the normalised copy that are null in ``column`` or, when no
        column is given, rows that contain at least one null value.
    """
    df = replace_disguised_missing(df)

    # Compare against None rather than relying on truthiness, so that a
    # column labelled 0 (or '') is not mistaken for "no column given".
    if column is not None:
        null_mask = df[column].isna()
    else:
        null_mask = df.isna().any(axis=1)

    null_records = df[null_mask]
    # Count rows, not cells: the message below speaks of "records" and the
    # number must agree with the frame that is returned.
    null_count = len(null_records)

    if null_count != 0:
        if column is not None:
            print(f"\n {null_count} null values found in column {column}\n")
        else:
            print(f"\n{null_count} null records found in the dataframe\n")

    return null_records

def replace_disguised_missing(df):
    """Return a new dataframe in which placeholder strings are NaN.

    The empty string, a single space and the literal ``'N/A'`` are treated
    as missing. The dataframe passed in is left unchanged.
    """
    placeholder_values = ('', ' ', 'N/A')
    nan_for_placeholder = {value: np.nan for value in placeholder_values}
    return df.replace(nan_for_placeholder)

In [5]:
# Show the rows whose 'account_number' is missing (blank placeholders included).
null_check(df, 'account_number')


 1 null values found in column account_number



Unnamed: 0,account_number,user,DOB
4,,Ethan,!


In [6]:
# No column given: show every row that has a missing value in any column.
null_check(df)


2 null records found in the dataframe



Unnamed: 0,account_number,user,DOB
3,four,,2021-12-31
4,,Ethan,!


#### Duplication Check for Entity Integrity

In [7]:
def duplicate_check(df, column = None):
    """Report duplicated values in one column or duplicated whole rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to inspect.
    column : hashable, optional
        Label of the column whose values must be unique. When omitted
        (``None``) rows are compared across all columns. Falsy labels such
        as ``0`` are valid column labels.

    Returns
    -------
    pandas.DataFrame
        Every row taking part in a duplication (first occurrences included).
        The printed count, by contrast, excludes first occurrences.
    """
    # Compare against None rather than relying on truthiness, so that a
    # column labelled 0 (or '') is not mistaken for "no column given".
    subset = None if column is None else [column]

    # Default keep='first' counts only the repeats; keep=False selects all
    # members of each duplicated group for display.
    duplicate_count = df.duplicated(subset=subset).sum()
    duplicate_records = df[df.duplicated(subset=subset, keep=False)]

    if duplicate_count != 0:
        if column is not None:
            print(f"\n{duplicate_count} duplicate values found in column '{column}'\n")
        else:
            print(f"\n{duplicate_count} duplicate records found in the dataframe\n") 

    return duplicate_records

In [8]:
# Show the rows that share an 'account_number' with another row.
duplicate_check(df, 'account_number')


1 duplicate values found in column 'account_number'



Unnamed: 0,account_number,user,DOB
0,1,Alice,2020-01-01
1,1,2,2021-12-31


In [9]:
# No column given: look for rows that are identical across every column.
duplicate_check(df)

Unnamed: 0,account_number,user,DOB


#### Negative Check

In [10]:
def negative_check(df, column):
    """Return the rows of ``df`` whose ``column`` value is a negative number.

    Values are converted with ``pd.to_numeric(..., errors='coerce')``, so
    entries that cannot be read as numbers become NaN and are never
    reported as negative. A summary line is printed when any row matches.
    """
    as_numbers = pd.to_numeric(df[column], errors='coerce')
    is_negative = as_numbers.lt(0)
    negative_records = df.loc[is_negative]

    if len(negative_records) > 0:
        print(f"\n{len(negative_records)} Negative values found in column {column}")

    return negative_records

# Show the rows whose 'account_number' reads as a negative number.
negative_check(df, 'account_number')


1 Negative values found in column account_number


Unnamed: 0,account_number,user,DOB
2,-3,Charlie,"June 5, 2024"


#### Domain Integrity

In [11]:
def type_check(df, column, data_type):
    """Return the rows of ``df`` whose ``column`` value is not of ``data_type``.

    This is a read-only check: ``df`` is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to inspect.
    column : hashable
        Label of the column to validate.
    data_type : {'int', 'str', 'datetime'}
        Expected type. ``'int'`` and ``'str'`` are tested per value with
        ``isinstance``; ``'datetime'`` is tested by parsing the column with
        ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame or None
        The offending rows (an empty dataframe when there are none), or
        ``None`` when ``data_type`` is not one of the supported names.
    """
    python_types = {'int': int, 'str': str}
    invalid_records = pd.DataFrame()

    if data_type in python_types:
        expected = python_types[data_type]
        invalid_records = df[df[column].apply(lambda x: not isinstance(x, expected))]
    elif data_type == 'datetime':
        try:
            # Parse into a throwaway result. Assigning it back to df[column]
            # would silently convert the caller's data as a side effect of
            # what is meant to be a validation step.
            pd.to_datetime(df[column], errors='raise')
        except ValueError:
            # At least one value failed to parse: re-parse leniently and
            # report every row that ends up without a timestamp.
            invalid_records = df[pd.to_datetime(df[column], errors='coerce').isna()]
    else:
        print("\nInvalid data type provided. Enter either 'int', 'str', or 'datetime'.")
        return None

    invalid_counts = invalid_records.shape[0]
    if invalid_counts:
        print(f"\n{invalid_counts} invalid '{data_type}' values found in column '{column}'\n")

    return invalid_records
    

### Run the Domain Integrity Checks on *df*

In [12]:
# Domain Constraints
# Domain constraints: the data type each column is expected to hold.
expected_types = dict(account_number='int', user='str', DOB='datetime')

# Follow the dataframe's own column order and skip columns that have no
# declared constraint.
for column in df.columns:
    if column not in expected_types:
        continue
    invalid_rows = type_check(df, column, expected_types[column])
    print(f"{invalid_rows}\n")


2 invalid 'int' values found in column 'account_number'

  account_number   user         DOB
3           four    NaN  2021-12-31
4                 Ethan           !


2 invalid 'str' values found in column 'user'

  account_number user         DOB
1              1    2  2021-12-31
3           four  NaN  2021-12-31


2 invalid 'datetime' values found in column 'DOB'

  account_number     user           DOB
2             -3  Charlie  June 5, 2024
4                   Ethan             !

