In [4]:
'''
Author: Ngawang Gurung
Date: 2024-06-05
'''

'\nAuthor: Ngawang Gurung\nDate: 2024-06-05\n'

### Import Libraries

In [5]:
import pandas as pd
import numpy as np

### Creating Synthetic Dataframe

In [6]:
# Small, deliberately dirty sample frame used to exercise the integrity checks
# in this notebook. Each column mixes well-formed values with problem values.
df = pd.DataFrame( {
    # a repeated value (1, 1), a negative number (-3), a word ('four') and an empty string ('')
    'account_number': [1, 1, -3, 'four', '' ],
    # names mixed with an integer (2) and a true missing value (np.nan)
    'user': ['Alice', 2, 'Charlie', np.nan, 'Ethan'],
    # 'YYYY-MM-DD' strings, one long-form date ('June 5, 2024') and a non-date ('!')
    'DOB': ['2020-01-01', '2021-12-31', 'June 5, 2024', '2021-12-31', '!']
})

df.head()

Unnamed: 0,account_number,user,DOB
0,1,Alice,2020-01-01
1,1,2,2021-12-31
2,-3,Charlie,"June 5, 2024"
3,four,,2021-12-31
4,,Ethan,!


### Data Integrity 

#### Null check for Entity Integrity

In [7]:
def replace_disguised_missing(df, missing_tokens=('N/A',)):
    """Return a copy of ``df`` with disguised missing values turned into NaN.

    Two kinds of placeholder are treated as missing:

    * strings that are empty or consist only of whitespace, of any length
      (``''``, ``' '``, ``'   '``, ``'\\t'`` ...);
    * strings equal to one of ``missing_tokens`` (exact, case-sensitive match).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.
    missing_tokens : str or iterable of str, optional
        Extra placeholder strings to treat as missing. Defaults to ``('N/A',)``.
        Pass an empty tuple or ``None`` to replace blank strings only.

    Returns
    -------
    pandas.DataFrame
        A new frame with the placeholders replaced by ``np.nan``.
    """
    # A lone string would otherwise be split into characters by list() below.
    if isinstance(missing_tokens, str):
        missing_tokens = [missing_tokens]
    tokens = list(missing_tokens) if missing_tokens is not None else []

    # Regex pass: catches blank strings of any length, not just '' and ' '.
    # Non-string cells are left untouched by pandas' regex replacement.
    cleaned = df.replace(r'^\s*$', np.nan, regex=True)

    # Exact-match pass for the named placeholder tokens.
    if tokens:
        cleaned = cleaned.replace(tokens, np.nan)
    return cleaned

def null_check(df, column = None):
    """Count missing values per column, including disguised ones.

    The frame is first passed through ``replace_disguised_missing`` so that
    placeholder strings are counted as missing as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    column : hashable, optional
        Label of a single column to report on. When ``None`` (the default)
        every column is reported.

    Returns
    -------
    pandas.DataFrame
        A one-row frame indexed by ``'null_count'`` with one column per
        inspected column, holding the number of missing values.

    Raises
    ------
    KeyError
        If ``column`` is given but is not a column of ``df``.
    """
    cleaned = replace_disguised_missing(df)
    # Compare against None rather than relying on truthiness, so that falsy
    # labels such as 0 or '' still select a single column.
    if column is not None:
        cleaned = cleaned[[column]]
    return cleaned.isna().sum().to_frame('null_count').T

def show_missing_records(df):
    """Return the rows that have a missing value in at least one column.

    Disguised missing values are normalised with ``replace_disguised_missing``
    first, so the returned rows show them as NaN.
    """
    cleaned = replace_disguised_missing(df)
    row_has_gap = cleaned.isna().any(axis=1)
    return cleaned[row_has_gap]

In [12]:
# Missing-value count (disguised placeholders included) for account_number only.
null_check(df, 'account_number')

Unnamed: 0,account_number
null_count,1


In [8]:
# Missing-value count (disguised placeholders included) for every column.
null_check(df)

Unnamed: 0,account_number,user,DOB
null_count,1,1,0


In [9]:
# Rows of the sample frame with at least one missing value in any column.
show_missing_records(df)

Unnamed: 0,account_number,user,DOB
3,four,,2021-12-31
4,,Ethan,!


#### Duplication Check for Entity Integrity

In [74]:
def duplicate_check(df, column):
    """Return every row whose value in ``column`` occurs more than once.

    When repeats exist, a message is printed with the number of repeated
    records, counting each occurrence after the first one. The returned frame
    contains all members of each duplicate group, first occurrence included.
    """
    key = [column]

    repeat_total = df.duplicated(subset=key).sum()
    if repeat_total != 0:
        print(f"\nNo. of duplicated records in column {column} is {repeat_total}")

    # keep=False marks every member of a duplicate group, not only the repeats.
    in_duplicate_group = df.duplicated(subset=key, keep=False)
    return df[in_duplicate_group]

def find_duplicate_row_count(df):
    """Return how many rows are exact repeats of an earlier row (all columns compared)."""
    is_repeat = df.duplicated()
    return is_repeat.sum()

In [75]:
# All rows of the sample frame whose account_number appears more than once.
duplicate_check(df, 'account_number')


No. of duplicated records in column account_number is 1


Unnamed: 0,account_number,user,DOB
0,1,Alice,2020-01-01
1,1,2,2021-12-31


In [76]:
# Number of rows that repeat an earlier row across all columns.
find_duplicate_row_count(df)

0

#### Negative Check

In [80]:
def negative_check(df, column):
    """Return the rows of ``df`` whose value in ``column`` is a negative number.

    Values are converted with ``pd.to_numeric(errors='coerce')``, so anything
    that cannot be read as a number becomes NaN and is never reported.
    A message with the number of hits is printed when at least one is found.
    """
    as_numbers = pd.to_numeric(df[column], errors='coerce')
    below_zero = df[as_numbers.lt(0)]

    if not below_zero.empty:
        hits = below_zero.shape[0]
        print(f"\n{hits} Negative values found in column {column}")
    return below_zero

# Rows of the sample frame whose account_number reads as a number below zero.
negative_check(df, 'account_number')


1 Negative values found in column account_number


Unnamed: 0,account_number,user,DOB
2,-3,Charlie,"June 5, 2024"


#### Domain Integrity

In [77]:
def type_check(df, column, data_type):
    """Return the rows of ``df`` whose value in ``column`` is not of ``data_type``.

    The check is read-only: ``df`` is never modified. A message with the number
    of offending rows is printed for every supported ``data_type``, including
    when that number is zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    column : hashable
        Label of the column to validate.
    data_type : {'int', 'str', 'datetime'}
        Expected kind of value:

        * ``'int'`` - Python or NumPy integers. Booleans are rejected.
        * ``'str'`` - Python ``str`` instances.
        * ``'datetime'`` - values that ``pd.to_datetime(errors='coerce')``
          can parse. Missing values count as invalid, as they do for the
          other two kinds. pandas may settle on a single date format for the
          whole column, so a date written in a different style from the rest
          can be reported as invalid.

    Returns
    -------
    pandas.DataFrame or None
        The offending rows, with the same columns as ``df`` (empty when every
        value is valid). ``None`` when ``data_type`` is not supported.
    """
    if data_type == 'int':
        def is_integer(value):
            # bool is a subclass of int, so it has to be excluded explicitly.
            return isinstance(value, (int, np.integer)) and not isinstance(value, bool)

        valid = df[column].apply(is_integer)
        label = 'Non-integer'
    elif data_type == 'str':
        valid = df[column].apply(lambda value: isinstance(value, str))
        label = 'Non-string'
    elif data_type == 'datetime':
        # Parse into a temporary Series only; assigning the parsed values back
        # to df[column] would silently change the caller's frame.
        valid = pd.to_datetime(df[column], errors='coerce').notna()
        label = 'Invalid date'
    else:
        print("Invalid data type provided. Enter either 'int', 'str' or 'datetime'.")
        return None

    # astype(bool) keeps the mask usable when the column is empty and apply()
    # returns an object-dtype Series.
    invalid_records = df[~valid.astype(bool)]
    invalid_counts = invalid_records.shape[0]
    print(f"\n{invalid_counts} {label} values found in column {column}")
    return invalid_records

### Check in *df*

In [79]:
# Domain constraints: the kind of value each column is expected to hold.
expected_types = {
    'account_number': 'int',
    'user': 'str',
    'DOB': 'datetime',
}

# Walk the frame's columns in order and report on those with a declared type.
for column in df.columns:
    if column not in expected_types:
        continue
    print(f"{type_check(df, column, expected_types[column])}\n")


2 Non-integer values found in column account_number
  account_number   user         DOB
3           four    NaN  2021-12-31
4                 Ethan           !


2 Non-string values found in column user
  account_number user         DOB
1              1    2  2021-12-31
3           four  NaN  2021-12-31


2 Invalid date values found in column DOB
  account_number     user           DOB
2             -3  Charlie  June 5, 2024
4                   Ethan             !

