In [5]:
import pandas as pd
import numpy as np
import re

# Sample frame with mixed types, empty strings, NaN and malformed
# date/email values.
df = pd.DataFrame(
    dict(
        account_number=[1, 1, -3.0, 'four', ''],
        user=['Alice', 2, 'Charlie', np.nan, 'Ethan'],
        DOB=['2020-01-01', '2021-12-31', 'June 5, 2024', '2021-12-31', '!'],
        email=['alice@mail.com', '', 'charlie@mail.com', 'david@example.com', 'notamaile@com'],
        phone_number=[101212121, '', 12121122, 7777777, 712232132],
    )
)

class DataIntegritySummary:
    """Per-column data-quality summary for a pandas DataFrame.

    The constructor works on a private copy of the frame in which the
    disguised missing markers ``''``, ``' '`` and ``'N/A'`` have been replaced
    by the sentinel string ``'missing'``. Each ``*_check`` method inspects one
    column of that copy; ``integrity_check_df`` bundles every check into a
    one-row DataFrame indexed by the column name.
    """

    def __init__(self, df):
        """Copy ``df`` (the caller's frame is never modified) and normalise
        its disguised missing values."""
        self.df = df.copy()
        self.replace_disguised_missing()

    def replace_disguised_missing(self):
        """Replace ``''``, ``' '`` and ``'N/A'`` cells with ``'missing'``."""
        self.df = self.df.replace(to_replace=['', ' ', 'N/A'], value='missing')

    @staticmethod
    def _is_integer(value):
        """Return True for real integers only.

        ``bool`` is a subclass of ``int`` in Python, so a plain
        ``isinstance(value, int)`` would accept ``True``/``False``.
        """
        return isinstance(value, int) and not isinstance(value, bool)

    def null_check(self, column):
        """Count missing cells in ``column``.

        A cell is missing when it is NaN/None or holds the ``'missing'``
        sentinel written by ``replace_disguised_missing``.

        Returns:
            tuple: ``(null_count, has_null)``; ``has_null`` is 1 when at least
            one cell is missing, otherwise 0.
        """
        values = self.df[column]
        null_count = values.isna().sum() + values.apply(lambda x: 1 if x == 'missing' else 0).sum()
        is_null = 1 if null_count else 0
        return null_count, is_null

    def duplicate_check(self, column):
        """Count rows whose ``column`` value repeats an earlier row.

        Returns:
            tuple: ``(duplicate_count, has_duplicate)`` with a 0/1 flag.
        """
        duplicate_count = self.df.duplicated(subset=[column]).sum()
        is_duplicate = 1 if duplicate_count else 0
        return duplicate_count, is_duplicate

    def negative_check(self, column):
        """Return the number of int/float cells in ``column`` that are < 0."""
        return self.df[column].apply(lambda x: 1 if isinstance(x, (int, float)) and x < 0 else 0).sum()

    def integer_check(self, column):
        """Count cells in ``column`` that are not integers.

        Booleans are counted as invalid even though ``bool`` subclasses
        ``int``.

        Returns:
            tuple: ``(invalid_int_count, is_int)``; ``is_int`` is 1 only when
            every cell is an integer.
        """
        invalid_int_count = self.df[column].apply(lambda x: not self._is_integer(x)).sum()
        is_int = 1 if invalid_int_count == 0 else 0
        return invalid_int_count, is_int

    def string_check(self, column):
        """Count cells in ``column`` that are not ``str`` instances.

        The ``'missing'`` sentinel is itself a string and therefore passes.

        Returns:
            tuple: ``(invalid_str_count, is_str)`` with a 0/1 flag.
        """
        invalid_str_count = self.df[column].apply(lambda x: not isinstance(x, str)).sum()
        is_str = 1 if invalid_str_count == 0 else 0
        return invalid_str_count, is_str

    def float_check(self, column):
        """Count cells in ``column`` that are not ``float`` instances.

        NaN is a float instance, so it is not counted as invalid here.

        Returns:
            tuple: ``(invalid_float_count, is_float)`` with a 0/1 flag.
        """
        invalid_float_count = self.df[column].apply(lambda x: not isinstance(x, float)).sum()
        is_float = 1 if invalid_float_count == 0 else 0
        return invalid_float_count, is_float

    def date_check(self, column):
        """Count cells in ``column`` that do not parse as ``%Y-%m-%d`` dates.

        Parsing failures are coerced to NaT and counted.

        Returns:
            tuple: ``(invalid_date_count, is_date)`` with a 0/1 flag.
        """
        invalid_date_count = pd.to_datetime(self.df[column], format='%Y-%m-%d', errors='coerce').isna().sum()
        is_date = 1 if invalid_date_count == 0 else 0
        return invalid_date_count, is_date

    def email_check(self, column):
        """Count string cells in ``column`` that look like e-mail addresses.

        Unlike the ``invalid_*`` counters, this is a count of VALID cells.

        Returns:
            tuple: ``(email_count, is_email)``; ``is_email`` is 1 only when
            every row of the frame holds a valid address.
        """
        email_count = self.df[column].apply(lambda x: self.valid_email(x) if isinstance(x, str) else False).sum()
        is_email = 1 if email_count == self.df.shape[0] else 0
        return email_count, is_email

    def valid_email(self, email):
        """Return True when the string ``email`` fully matches the pattern
        ``local@domain.tld``."""
        regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
        return bool(re.fullmatch(regex, email))

    def phone_check(self, column):
        """Count integer cells in ``column`` that look like phone numbers.

        Only integer cells are considered; this is a count of VALID cells.

        Returns:
            tuple: ``(phone_count, is_phone)``; ``is_phone`` is 1 only when
            every row of the frame holds a valid phone number.
        """
        phone_count = self.df[column].apply(lambda x: self.valid_phone(x) if self._is_integer(x) else False).sum()
        is_phone = 1 if phone_count == self.df.shape[0] else 0
        return phone_count, is_phone

    def valid_phone(self, phone):
        """Return True when ``str(phone)`` is 7-12 digits/hyphens long and
        starts with a digit.

        The leading digit is required so that a negative integer such as
        ``-1234567`` is not accepted as a phone number.
        """
        regex = r'[0-9][0-9-]{6,11}'
        return bool(re.fullmatch(regex, str(phone)))

    def integrity_check_df(self, column):
        """Run every check on ``column``.

        Returns:
            pandas.DataFrame: a single row indexed by ``column`` holding all
            counts followed by all 0/1 flags.
        """
        null_result = self.null_check(column)
        duplicate_result = self.duplicate_check(column)
        integer_result = self.integer_check(column)
        string_result = self.string_check(column)
        float_result = self.float_check(column)
        date_result = self.date_check(column)
        email_result = self.email_check(column)
        phone_result = self.phone_check(column)

        results = {
            'null_count': null_result[0],
            'duplicate_count': duplicate_result[0],
            'negative_count': self.negative_check(column),
            'invalid_int_count': integer_result[0],
            'invalid_str_count': string_result[0],
            'invalid_float_count': float_result[0],
            'invalid_date_count': date_result[0],
            'email_count': email_result[0],
            'phone_count': phone_result[0],
            'has_null': null_result[1],
            'has_duplicate': duplicate_result[1],
            'is_int': integer_result[1],
            'is_str': string_result[1],
            'is_float': float_result[1],
            'is_date': date_result[1],
            'is_email': email_result[1],
            'is_phone': phone_result[1]
        }
        return pd.DataFrame(results, index=[column])


# One summary row per column, stacked into a single table.
integrity_checker = DataIntegritySummary(df)

summary_df = pd.concat(map(integrity_checker.integrity_check_df, df.columns))
summary_df


Unnamed: 0,null_count,duplicate_count,negative_count,invalid_int_count,invalid_str_count,invalid_float_count,invalid_date_count,email_count,phone_count,has_null,has_duplicate,is_int,is_str,is_float,is_date,is_email,is_phone
account_number,1,1,1,3,3,4,5,0,0,1,1,0,0,0,0,0,0
user,1,0,0,4,2,4,5,0,0,1,0,0,0,0,0,0,0
DOB,0,1,0,5,0,5,2,0,0,0,1,0,1,0,0,0,0
email,1,0,0,5,0,5,5,3,0,1,0,0,1,0,0,0,0
phone_number,1,0,0,1,4,5,5,0,4,1,0,0,0,0,0,0,0


### Playground

In [6]:
import pandas as pd
import numpy as np
import re

class DataIntegritySummary:
    """Per-column data-quality summary for a pandas DataFrame.

    The constructor works on a private copy of the frame in which the
    disguised missing markers ``''``, ``' '`` and ``'N/A'`` have been replaced
    by the sentinel string ``'missing'``. ``integrity_check`` then computes
    every count and 0/1 flag for a single column in one pass.
    """

    def __init__(self, df):
        """Copy ``df`` (the caller's frame is never modified) and normalise
        its disguised missing values."""
        self.df = df.copy()
        self.replace_disguised_missing()

    @staticmethod
    def _is_integer(value):
        """Return True for real integers only.

        ``bool`` is a subclass of ``int`` in Python, so a plain
        ``isinstance(value, int)`` would accept ``True``/``False``.
        """
        return isinstance(value, int) and not isinstance(value, bool)

    def integrity_check(self, column):
        """Run every integrity check on ``column``.

        ``email_count`` and ``phone_count`` count VALID cells; the
        ``invalid_*`` entries count offending cells. Booleans are not
        treated as integers.

        Returns:
            pandas.DataFrame: a single row indexed by ``column`` holding all
            counts followed by all 0/1 flags.
        """
        values = self.df[column]
        row_total = self.df.shape[0]

        # NaN/None plus the 'missing' sentinel written by the constructor.
        null_count = values.isna().sum() + values.apply(lambda x: 1 if x == 'missing' else 0).sum()
        duplicate_count = self.df.duplicated(subset=[column]).sum()
        negative_count = values.apply(lambda x: 1 if isinstance(x, (int, float)) and x < 0 else 0).sum()
        invalid_int_count = values.apply(lambda x: not self._is_integer(x)).sum()
        invalid_str_count = values.apply(lambda x: not isinstance(x, str)).sum()
        invalid_float_count = values.apply(lambda x: not isinstance(x, float)).sum()
        # Cells that fail to parse are coerced to NaT and counted as invalid.
        invalid_date_count = pd.to_datetime(values, format='%Y-%m-%d', errors='coerce').isna().sum()
        email_count = values.apply(lambda x: self.valid_email(x) if isinstance(x, str) else False).sum()
        phone_count = values.apply(lambda x: self.valid_phone(x) if self._is_integer(x) else False).sum()

        results = {
            'null_count': null_count,
            'duplicate_count': duplicate_count,
            'negative_count': negative_count,
            'invalid_int_count': invalid_int_count,
            'invalid_str_count': invalid_str_count,
            'invalid_float_count': invalid_float_count,
            'invalid_date_count': invalid_date_count,
            'email_count': email_count,
            'phone_count': phone_count,
            'has_null': 1 if null_count else 0,
            'has_duplicate': 1 if duplicate_count else 0,
            'is_int': 1 if invalid_int_count == 0 else 0,
            'is_str': 1 if invalid_str_count == 0 else 0,
            'is_float': 1 if invalid_float_count == 0 else 0,
            'is_date': 1 if invalid_date_count == 0 else 0,
            'is_email': 1 if email_count == row_total else 0,
            'is_phone': 1 if phone_count == row_total else 0
        }
        return pd.DataFrame(results, index=[column])

    def valid_email(self, email):
        """Return True when the string ``email`` fully matches the pattern
        ``local@domain.tld``."""
        regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
        return bool(re.fullmatch(regex, email))

    def valid_phone(self, phone):
        """Return True when ``str(phone)`` is 7-12 digits/hyphens long and
        starts with a digit.

        The leading digit is required so that a negative integer such as
        ``-1234567`` is not accepted as a phone number.
        """
        regex = r'[0-9][0-9-]{6,11}'
        return bool(re.fullmatch(regex, str(phone)))

    def replace_disguised_missing(self):
        """Replace ``''``, ``' '`` and ``'N/A'`` cells with ``'missing'``."""
        self.df = self.df.replace(to_replace=['', ' ', 'N/A'], value='missing')

# Sample frame with mixed types, empty strings, NaN and malformed
# date/email values.
df = pd.DataFrame(
    dict(
        account_number=[1, 1, -3.0, 'four', ''],
        user=['Alice', 2, 'Charlie', np.nan, 'Ethan'],
        DOB=['2020-01-01', '2021-12-31', 'June 5, 2024', '2021-12-31', '!'],
        email=['alice@mail.com', '', 'charlie@mail.com', 'david@example.com', 'notamaile@com'],
        phone_number=[101212121, '', 12121122, 7777777, 712232132],
    )
)

# One summary row per column, stacked into a single table.
integrity_checker = DataIntegritySummary(df)
summary_df = pd.concat(map(integrity_checker.integrity_check, df.columns))
summary_df


Unnamed: 0,null_count,duplicate_count,negative_count,invalid_int_count,invalid_str_count,invalid_float_count,invalid_date_count,email_count,phone_count,has_null,has_duplicate,is_int,is_str,is_float,is_date,is_email,is_phone
account_number,1,1,1,3,3,4,5,0,0,1,1,0,0,0,0,0,0
user,1,0,0,4,2,4,5,0,0,1,0,0,0,0,0,0,0
DOB,0,1,0,5,0,5,2,0,0,0,1,0,1,0,0,0,0
email,1,0,0,5,0,5,5,3,0,1,0,0,1,0,0,0,0
phone_number,1,0,0,1,4,5,5,0,4,1,0,0,0,0,0,0,0
