### DataFrame

In [1]:
import pandas as pd
import numpy as np

# Deliberately messy sample data: every column mixes valid values with wrong
# types, blanks, or malformed entries so each integrity check has work to do.
df = pd.DataFrame(dict(
    account_number=[1, 1, -3.0, 'four', ''],
    user=['Alice', 2, 'Charlie', np.nan, 'Ethan'],
    DOB=['2020-01-01', '2021-12-31', 'June 5, 2024', '2021-12-31', '!'],
    email=['alice@mail.com', '', 'charlie@mail.com', 'david@example.com', 'notamaile@com'],
    phone_number=[101212121, '', 12121122, 7777777, 712232132],
    amount=[100.00, 750.01, 121.00, 145.00, 180.00],
))

# Type each column is expected to hold; the keys mirror the columns of `df`.
expected_types = dict(
    account_number='int',
    user='str',
    DOB='date',
    email='email',
    phone_number='phone',
    amount='float',
)

In [2]:
# Display the raw sample frame (five rows) before any integrity checks are run.
df

Unnamed: 0,account_number,user,DOB,email,phone_number,amount
0,1,Alice,2020-01-01,alice@mail.com,101212121.0,100.0
1,1,2,2021-12-31,,,750.01
2,-3.0,Charlie,"June 5, 2024",charlie@mail.com,12121122.0,121.0
3,four,,2021-12-31,david@example.com,7777777.0,145.0
4,,Ethan,!,notamaile@com,712232132.0,180.0


### Data Integrity Summary 

- Each column is validated only against the type declared for it in the `expected_types` dictionary.

In [2]:
import pandas as pd
import re

class DataIntegritySummary:
    """Per-column data-integrity report for a pandas DataFrame.

    The instance keeps its own copy of the frame, in which the disguised
    missing markers '', ' ' and 'N/A' are replaced by the sentinel string
    'missing'. Each ``*_check`` method inspects one column of that copy and
    returns a ``(count, flag)`` pair (``negative_check`` returns only a
    count); ``integrity_check`` assembles them into a one-row DataFrame.
    """

    def __init__(self, df):
        """Copy ``df`` so the caller's frame is not modified, then normalise missing markers."""
        self.df = df.copy()
        self.replace_disguised_missing()

    def replace_disguised_missing(self):
        """Replace '', ' ' and 'N/A' cells of ``self.df`` with the sentinel 'missing'."""
        self.df = self.df.replace(to_replace=['', ' ', 'N/A'], value='missing')

    @staticmethod
    def _is_int(value):
        """Return True for ``int`` values that are not booleans.

        ``bool`` is a subclass of ``int``, so a bare ``isinstance(value, int)``
        would report True/False as valid integers.
        """
        return isinstance(value, int) and not isinstance(value, bool)

    def valid_email(self, email):
        """Return True if the string ``email`` fully matches the e-mail pattern."""
        regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
        return bool(re.fullmatch(regex, email))

    def valid_phone(self, phone):
        """Return True if ``str(phone)`` is 7 to 12 characters drawn from digits and '-'."""
        regex = r'^[0-9-]{7,12}'
        return bool(re.fullmatch(regex, str(phone)))

    def null_check(self, column):
        """Count NaN cells plus 'missing' sentinels; return ``(null_count, has_null)``."""
        null_count = self.df[column].isna().sum() + self.df[column].apply(lambda x: 1 if x == 'missing' else 0).sum()
        is_null = 1 if null_count else 0
        return null_count, is_null

    def distinct_check(self, column):
        """Return ``(distinct_count, is_distinct)``; the flag is 1 when every row holds a different value."""
        distinct_count = self.df[column].nunique()
        is_distinct = 1 if distinct_count == self.df.shape[0] else 0
        return distinct_count, is_distinct

    def negative_check(self, column):
        """Return the number of int/float cells that are below zero."""
        return self.df[column].apply(lambda x: isinstance(x, (int, float)) and x < 0).sum()

    def integer_check(self, column):
        """Return ``(invalid_int_count, is_int)``; booleans are not accepted as integers."""
        invalid_int_count = self.df[column].apply(lambda x: not self._is_int(x)).sum()
        is_int = 1 if not invalid_int_count else 0
        return invalid_int_count, is_int

    def string_check(self, column):
        """Return ``(invalid_str_count, is_str)`` where invalid means "not a ``str``"."""
        invalid_str_count = self.df[column].apply(lambda x: not isinstance(x, str)).sum()
        is_str = 1 if not invalid_str_count else 0
        return invalid_str_count, is_str

    def float_check(self, column):
        """Return ``(invalid_float_count, is_float)`` where invalid means "not a ``float``"."""
        invalid_float_count = self.df[column].apply(lambda x: not isinstance(x, float)).sum()
        is_float = 1 if not invalid_float_count else 0
        return invalid_float_count, is_float

    def date_check(self, column):
        """Return ``(invalid_date_count, is_date)``; cells that do not parse as %Y-%m-%d are invalid."""
        invalid_date_count = pd.to_datetime(self.df[column], format='%Y-%m-%d', errors='coerce').isna().sum()
        is_date = 1 if invalid_date_count == 0 else 0
        return invalid_date_count, is_date

    def email_check(self, column):
        """Return ``(invalid_email_count, is_email)``; non-string cells are always invalid."""
        invalid_email_count = self.df[column].apply(
            lambda x: not (isinstance(x, str) and self.valid_email(x))
        ).sum()
        is_email = 1 if not invalid_email_count else 0
        return invalid_email_count, is_email

    def phone_check(self, column):
        """Return ``(invalid_phone_count, is_phone)``.

        Only non-negative, non-boolean integers are candidates. The sign test
        is needed because ``valid_phone`` allows '-' and would otherwise accept
        a negative number such as -1234567.
        """
        invalid_phone_count = self.df[column].apply(
            lambda x: not (self._is_int(x) and x >= 0 and self.valid_phone(x))
        ).sum()
        is_phone = 1 if not invalid_phone_count else 0
        return invalid_phone_count, is_phone

    def integrity_check(self, column, expected_type):
        """Build the one-row integrity report for ``column``.

        Parameters
        ----------
        column : label of the column in ``self.df`` to inspect.
        expected_type : one of 'int', 'str', 'float', 'date', 'email', 'phone'.
            Only the matching type check is run; the other type-specific
            counts and flags are reported as 0.

        Returns
        -------
        pandas.DataFrame with a single row indexed by ``column``.
        """
        null_result = self.null_check(column)
        distinct_result = self.distinct_check(column)
        integer_result = self.integer_check(column) if expected_type == 'int' else (0, 0)
        string_result = self.string_check(column) if expected_type == 'str' else (0, 0)
        float_result = self.float_check(column) if expected_type == 'float' else (0, 0)
        date_result = self.date_check(column) if expected_type == 'date' else (0, 0)
        email_result = self.email_check(column) if expected_type == 'email' else (0, 0)
        phone_result = self.phone_check(column) if expected_type == 'phone' else (0, 0)

        results = {
            'null_count': null_result[0],
            'distinct_count': distinct_result[0],
            'negative_count': self.negative_check(column),
            'invalid_int_count': integer_result[0],
            'invalid_str_count': string_result[0],
            'invalid_float_count': float_result[0],
            'invalid_date_count': date_result[0],
            'invalid_email_count': email_result[0],
            'invalid_phone_count': phone_result[0],
            'has_null': null_result[1],
            'is_distinct': distinct_result[1],
            'is_int': integer_result[1],
            'is_str': string_result[1],
            'is_float': float_result[1],
            'is_date': date_result[1],
            'is_email': email_result[1],
            'is_phone': phone_result[1]
        }
        return pd.DataFrame(results, index=[column])

# Frame-level facts first: overall size and number of fully duplicated rows.
print(f"The shape of df is {df.shape}")
print(f"The number of duplicated rows is {df.duplicated().sum()}")

# One single-row report per column, checked against its declared type,
# then stacked into one summary table that the cell displays.
integrity_checker = DataIntegritySummary(df)
summary_df = pd.concat(
    [
        integrity_checker.integrity_check(name, expected_types[name])
        for name in df.columns
    ]
)
summary_df

The shape of df is (5, 6)
The number of duplicated rows is 0


Unnamed: 0,null_count,distinct_count,negative_count,invalid_int_count,invalid_str_count,invalid_float_count,invalid_date_count,invalid_email_count,invalid_phone_count,has_null,is_distinct,is_int,is_str,is_float,is_date,is_email,is_phone
account_number,1,4,1,3,0,0,0,0,0,1,0,0,0,0,0,0,0
user,1,4,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0
DOB,0,4,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0
email,1,5,0,0,0,0,0,2,0,1,1,0,0,0,0,0,0
phone_number,1,5,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0
amount,0,5,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0


### Data Integrity Summary - More concise code

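- Each column is validated only against the type declared for it in the `expected_types` dictionary; the per-type counts are collapsed into a single `invalid_data_points` column.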

In [3]:
import pandas as pd
import re

class DataIntegritySummary:
    """Per-column data-integrity report driven by an expected type per column.

    Works on a private copy of the frame in which the disguised missing
    markers '', ' ' and 'N/A' are replaced by the sentinel string 'missing'.
    """

    def __init__(self, df):
        """Copy ``df`` so the caller's frame is not modified, then normalise missing markers."""
        self.df = df.copy()
        self.replace_disguised_missing()

    @staticmethod
    def _is_int(value):
        """Return True for ``int`` values that are not booleans.

        ``bool`` is a subclass of ``int``, so a bare ``isinstance(value, int)``
        would report True/False as valid integers.
        """
        return isinstance(value, int) and not isinstance(value, bool)

    def integrity_check(self, column, expected_type):
        """Build the one-row integrity report for ``column``.

        Parameters
        ----------
        column : label of the column in ``self.df`` to inspect.
        expected_type : one of 'int', 'str', 'float', 'date', 'email', 'phone'.
            Only the matching validity check is run; any other value yields
            ``invalid_data_points == 0`` and every ``is_*`` type flag 0.

        Returns
        -------
        pandas.DataFrame with a single row indexed by ``column``.
        """
        series = self.df[column]

        # NaN cells and 'missing' sentinels both count as nulls.
        null_count = series.isna().sum() + series.apply(lambda x: 1 if x == 'missing' else 0).sum()
        distinct_count = series.nunique()
        negative_count = series.apply(lambda x: isinstance(x, (int, float)) and x < 0).sum()

        # Booleans are excluded explicitly: isinstance(True, int) is True.
        invalid_int_count = series.apply(lambda x: not self._is_int(x)).sum() if expected_type == 'int' else 0
        invalid_str_count = series.apply(lambda x: not isinstance(x, str)).sum() if expected_type == 'str' else 0
        invalid_float_count = series.apply(lambda x: not isinstance(x, float)).sum() if expected_type == 'float' else 0
        invalid_date_count = pd.to_datetime(series, format='%Y-%m-%d', errors='coerce').isna().sum() if expected_type == 'date' else 0
        invalid_email_count = series.apply(
            lambda x: not (isinstance(x, str) and self.valid_email(x))
        ).sum() if expected_type == 'email' else 0
        # valid_phone allows '-', so a negative integer such as -1234567 would
        # match; require a non-negative, non-boolean integer first.
        invalid_phone_count = series.apply(
            lambda x: not (self._is_int(x) and x >= 0 and self.valid_phone(x))
        ).sum() if expected_type == 'phone' else 0

        invalid_counts = {
            'int': invalid_int_count,
            'str': invalid_str_count,
            'float': invalid_float_count,
            'date': invalid_date_count,
            'email': invalid_email_count,
            'phone': invalid_phone_count
        }

        invalid_data_points = invalid_counts.get(expected_type, 0)

        results = {
            'data_type': expected_type,    # self.df.dtypes[column]
            'null_count': null_count,
            'distinct_count': distinct_count,
            'negative_count': negative_count,
            'invalid_data_points': invalid_data_points,
            'has_null': 1 if null_count else 0,
            'is_distinct': 1 if distinct_count == self.df.shape[0] else 0,
            'is_int': 1 if not invalid_int_count and expected_type == 'int' else 0,
            'is_str': 1 if not invalid_str_count and expected_type == 'str' else 0,
            'is_float': 1 if not invalid_float_count and expected_type == 'float' else 0,
            'is_date': 1 if not invalid_date_count and expected_type == 'date' else 0,
            'is_email': 1 if not invalid_email_count and expected_type == 'email' else 0,
            'is_phone': 1 if not invalid_phone_count and expected_type == 'phone' else 0
        }
        return pd.DataFrame(results, index=[column])

    def valid_email(self, email):
        """Return True if the string ``email`` fully matches the e-mail pattern."""
        regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
        return bool(re.fullmatch(regex, email))

    def valid_phone(self, phone):
        """Return True if ``str(phone)`` is 7 to 12 characters drawn from digits and '-'."""
        regex = r'^[0-9-]{7,12}'
        return bool(re.fullmatch(regex, str(phone)))

    def replace_disguised_missing(self):
        """Replace '', ' ' and 'N/A' cells of ``self.df`` with the sentinel 'missing'."""
        self.df = self.df.replace(to_replace=['', ' ', 'N/A'], value='missing')

# Frame-level facts: overall size and number of fully duplicated rows.
print(f"The shape of df is {df.shape}")
print(f"The number of duplicated rows is {df.duplicated().sum()}")

# Re-run the report with the concise checker; each column is validated
# against the type listed for it in `expected_types`.
integrity_checker = DataIntegritySummary(df)
summary_df = pd.concat(
    [
        integrity_checker.integrity_check(name, expected_types[name])
        for name in df.columns
    ]
)
summary_df


The shape of df is (5, 6)
The number of duplicated rows is 0


Unnamed: 0,data_type,null_count,distinct_count,negative_count,invalid_data_points,has_null,is_distinct,is_int,is_str,is_float,is_date,is_email,is_phone
account_number,int,1,4,1,3,1,0,0,0,0,0,0,0
user,str,1,4,0,2,1,0,0,0,0,0,0,0
DOB,date,0,4,0,2,0,0,0,0,0,0,0,0
email,email,1,5,0,2,1,1,0,0,0,0,0,0
phone_number,phone,1,5,0,1,1,1,0,0,0,0,0,0
amount,float,0,5,0,0,0,1,0,0,1,0,0,0


### DataFrame *from* SQL 


In [4]:

from mysql_connection import table_df

# Load the transaction table through the project helper; from here on `df`
# refers to this frame rather than the hand-made sample.
df = table_df('customer', 'rw_transaction_data')

# Expected type for each column of the loaded table, as consumed by
# DataIntegritySummary.integrity_check.
expected_types = dict(
    txn_id='int',
    last_modified_date='date',
    last_modified_date_bs='date',
    created_date='date',
    amount='float',
    status='int',
    module_id='int',
    product_id='int',
    product_type_id='int',
    payer_account_id='int',
    receiver_account_id='int',
    reward_point='float',
    cash_back_amount='float',
    revenue_amount='float',
    transactor_module_id='int',
    time='date',
)

In [5]:
# Frame-level facts for the SQL-backed frame.
print(f"The shape of df is {df.shape}")
print(f"The number of duplicated rows is {df.duplicated().sum()}")

# One report row per table column, validated against its declared type.
integrity_checker = DataIntegritySummary(df)
summary_df = pd.concat(
    [
        integrity_checker.integrity_check(name, expected_types[name])
        for name in df.columns
    ]
)
summary_df

The shape of df is (11012, 16)
The number of duplicated rows is 0


Unnamed: 0,data_type,null_count,distinct_count,negative_count,invalid_data_points,has_null,is_distinct,is_int,is_str,is_float,is_date,is_email,is_phone
txn_id,int,0,11012,0,0,0,1,1,0,0,0,0,0
last_modified_date,date,0,7,0,0,0,0,0,0,0,1,0,0
last_modified_date_bs,date,0,7,0,0,0,0,0,0,0,1,0,0
created_date,date,0,7,0,0,0,0,0,0,0,1,0,0
amount,float,0,1360,0,0,0,0,0,0,1,0,0,0
status,int,0,1,0,0,0,0,1,0,0,0,0,0
module_id,int,0,2,0,0,0,0,1,0,0,0,0,0
product_id,int,0,145,0,0,0,0,1,0,0,0,0,0
product_type_id,int,0,23,0,0,0,0,1,0,0,0,0,0
payer_account_id,int,0,7,0,0,0,0,1,0,0,0,0,0


### Data Integrity Summary - More concise code

- No `expected_types` dictionary is used: every type check is run against every column, so each row reports a count for all types.

In [1]:
import pandas as pd
import numpy as np
import re

class DataIntegritySummary:
    """Per-column data-integrity report that runs every type check on every column.

    Works on a private copy of the frame in which the disguised missing
    markers '', ' ' and 'N/A' are replaced by the sentinel string 'missing'.
    """

    def __init__(self, df):
        """Copy ``df`` so the caller's frame is not modified, then normalise missing markers."""
        self.df = df.copy()
        self.replace_disguised_missing()

    @staticmethod
    def _is_int(value):
        """Return True for ``int`` values that are not booleans.

        ``bool`` is a subclass of ``int``, so a bare ``isinstance(value, int)``
        would report True/False as valid integers.
        """
        return isinstance(value, int) and not isinstance(value, bool)

    def integrity_check(self, column):
        """Build the one-row integrity report for ``column``.

        Every validity check (int, str, float, date, email, phone) is applied
        to the column regardless of what it is meant to hold, so each
        ``invalid_*_count`` says how many cells fail that particular check.

        Parameters
        ----------
        column : label of the column in ``self.df`` to inspect.

        Returns
        -------
        pandas.DataFrame with a single row indexed by ``column``.
        """
        series = self.df[column]

        # NaN cells and 'missing' sentinels both count as nulls.
        null_count = series.isna().sum() + series.apply(lambda x: 1 if x == 'missing' else 0).sum()
        distinct_count = series.nunique()
        negative_count = series.apply(lambda x: isinstance(x, (int, float)) and x < 0).sum()

        # Booleans are excluded explicitly: isinstance(True, int) is True.
        invalid_int_count = series.apply(lambda x: not self._is_int(x)).sum()
        invalid_str_count = series.apply(lambda x: not isinstance(x, str)).sum()
        invalid_float_count = series.apply(lambda x: not isinstance(x, float)).sum()
        invalid_date_count = pd.to_datetime(series, format='%Y-%m-%d', errors='coerce').isna().sum()
        invalid_email_count = series.apply(
            lambda x: not (isinstance(x, str) and self.valid_email(x))
        ).sum()
        # valid_phone allows '-', so a negative integer such as -1234567 would
        # match; require a non-negative, non-boolean integer first.
        invalid_phone_count = series.apply(
            lambda x: not (self._is_int(x) and x >= 0 and self.valid_phone(x))
        ).sum()

        results = {
            'null_count': null_count,
            'distinct_count': distinct_count,
            'negative_count': negative_count,
            'invalid_int_count': invalid_int_count,
            'invalid_str_count': invalid_str_count,
            'invalid_float_count': invalid_float_count,
            'invalid_date_count': invalid_date_count,
            'invalid_email_count': invalid_email_count,
            'invalid_phone_count': invalid_phone_count,
            'has_null': 1 if null_count else 0,
            'is_distinct': 1 if distinct_count == self.df.shape[0] else 0,
            'is_int': 1 if not invalid_int_count else 0,
            'is_str': 1 if not invalid_str_count else 0,
            'is_float': 1 if not invalid_float_count else 0,
            'is_date': 1 if not invalid_date_count else 0,
            'is_email': 1 if not invalid_email_count else 0,
            'is_phone': 1 if not invalid_phone_count else 0
        }
        return pd.DataFrame(results, index=[column])

    def valid_email(self, email):
        """Return True if the string ``email`` fully matches the e-mail pattern."""
        regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
        return bool(re.fullmatch(regex, email))

    def valid_phone(self, phone):
        """Return True if ``str(phone)`` is 7 to 12 characters drawn from digits and '-'."""
        regex = r'^[0-9-]{7,12}'
        return bool(re.fullmatch(regex, str(phone)))

    def replace_disguised_missing(self):
        """Replace '', ' ' and 'N/A' cells of ``self.df`` with the sentinel 'missing'."""
        self.df = self.df.replace(to_replace=['', ' ', 'N/A'], value='missing')

# Same deliberately messy sample as at the top of the notebook, rebuilt here
# so this cell can be run on its own.
df = pd.DataFrame(dict(
    account_number=[1, 1, -3.0, 'four', ''],
    user=['Alice', 2, 'Charlie', np.nan, 'Ethan'],
    DOB=['2020-01-01', '2021-12-31', 'June 5, 2024', '2021-12-31', '!'],
    email=['alice@mail.com', '', 'charlie@mail.com', 'david@example.com', 'notamaile@com'],
    phone_number=[101212121, '', 12121122, 7777777, 712232132],
    amount=[100.00, 750.01, 121.00, 145.00, 180.00],
))

# Frame-level facts: overall size and number of fully duplicated rows.
print(f"The shape of df is {df.shape}")
print(f"The number of duplicated rows is {df.duplicated().sum()}")

# No expected types here: every check runs on every column.
integrity_checker = DataIntegritySummary(df)
summary_df = pd.concat(
    [integrity_checker.integrity_check(name) for name in df.columns]
)
summary_df

The shape of df is (5, 6)
The number of duplicated rows is 0


Unnamed: 0,null_count,distinct_count,negative_count,invalid_int_count,invalid_str_count,invalid_float_count,invalid_date_count,invalid_email_count,invalid_phone_count,has_null,is_distinct,is_int,is_str,is_float,is_date,is_email,is_phone
account_number,1,4,1,3,3,4,5,5,5,1,0,0,0,0,0,0,0
user,1,4,0,4,2,4,5,5,5,1,0,0,0,0,0,0,0
DOB,0,4,0,5,0,5,2,5,5,0,0,0,1,0,0,0,0
email,1,5,0,5,0,5,5,2,5,1,1,0,1,0,0,0,0
phone_number,1,5,0,1,4,5,5,5,1,1,1,0,0,0,0,0,0
amount,0,5,0,5,5,0,5,5,5,0,1,0,0,1,0,0,0
