# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the Data
def load_data(file_path):
    """
    Read a CSV source into a pandas DataFrame.

    Args:
        file_path: Path (or any object accepted by ``pd.read_csv``) of the
            CSV data to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(file_path)

# Step 2: Measure Completeness (Missing Data)
def check_completeness(df):
    """
    Report how much of each column is missing.

    Args:
        df: DataFrame to inspect.

    Returns:
        A Series indexed by column name whose values are the percentage
        (0-100) of null entries in that column.
    """
    null_counts = df.isnull().sum()
    row_count = len(df)
    return null_counts.div(row_count).mul(100)

# Step 3: Check for Duplicates (Uniqueness)
def check_duplicates(df):
    """
    Count rows that repeat an earlier row.

    Args:
        df: DataFrame to inspect.

    Returns:
        The number of rows flagged by ``df.duplicated()`` (every occurrence
        after the first of an identical row), as a plain ``int``.
    """
    is_repeat = df.duplicated()
    return int(is_repeat.sum())

# Step 4: Check for Validity (Example: Valid Email Format)
def check_validity(df, column_name, regex_pattern):
    """
    Return the rows whose value in a column does not match a regex pattern.

    The pattern is applied with ``Series.str.match``, i.e. it is anchored at
    the start of each value. Missing values (NaN/None) cannot match and are
    therefore reported as invalid.

    Args:
        df: DataFrame to inspect.
        column_name: Name of the string column to validate.
        regex_pattern: Regular expression each value must match.

    Returns:
        A DataFrame containing only the rows whose value is missing or does
        not match ``regex_pattern``.
    """
    # na=False keeps the mask strictly boolean; without it, missing values
    # yield NaN in the mask and the `~` negation below fails.
    is_valid = df[column_name].str.match(regex_pattern, na=False).astype(bool)
    return df[~is_valid]

# Step 5: Check for Validity (Example: Age Validation)
def check_valid_range(df, column_name, min_value, max_value):
    """
    Return the rows whose value in a column lies outside a closed range.

    Args:
        df: DataFrame to inspect.
        column_name: Name of the column to test.
        min_value: Smallest acceptable value (inclusive).
        max_value: Largest acceptable value (inclusive).

    Returns:
        A DataFrame with the rows where the value is strictly below
        ``min_value`` or strictly above ``max_value``.
    """
    values = df[column_name]
    too_low = values < min_value
    too_high = values > max_value
    return df.loc[too_low | too_high]

# Step 6: Check for Timeliness (Example: Outdated Dates)
def check_timeliness(df, date_column):
    """
    Return the rows whose date in a column is earlier than the current time.

    The column is parsed with ``pd.to_datetime`` into a separate Series, so
    the DataFrame passed in is left unmodified. The returned rows carry the
    parsed datetime values in ``date_column``.

    Args:
        df: DataFrame to inspect (not modified).
        date_column: Name of the column holding dates or date-like strings.

    Returns:
        A new DataFrame with the rows whose parsed date is before
        ``pd.to_datetime('today')``.
    """
    # Parse into a local Series instead of assigning back to `df`, so the
    # caller's data keeps its original values and dtype.
    parsed_dates = pd.to_datetime(df[date_column])
    current_date = pd.to_datetime('today')
    is_outdated = parsed_dates < current_date

    outdated_data = df.loc[is_outdated].copy()
    outdated_data[date_column] = parsed_dates[is_outdated]
    return outdated_data

# Step 7: Visualizing Missing Data
def visualize_missing_data(df):
    """
    Draw a heatmap of the null mask of a DataFrame and display it.

    Args:
        df: DataFrame whose ``isnull()`` mask is plotted.
    """
    null_mask = df.isnull()
    sns.heatmap(null_mask, cbar=False, cmap='viridis')
    plt.show()

# Step 8: Generate a Data Quality Report
def generate_data_quality_report(
    df,
    *,
    email_column='email',
    age_column='age',
    date_column='date',
    email_regex=r"[^@]+@[^@]+\.[^@]+",
    age_range=(0, 100),
):
    """
    Generate a data quality report with multiple checks.

    With the default arguments the checks run against the 'email', 'age'
    and 'date' columns. Pass a different column name to target another
    column, or ``None`` to skip that check (its key is then left out of the
    report).

    Args:
        df: DataFrame to assess.
        email_column: Column validated against ``email_regex``, or ``None``.
        age_column: Column validated against ``age_range``, or ``None``.
        date_column: Column checked for dates before today, or ``None``.
        email_regex: Pattern passed to ``check_validity``.
        age_range: ``(min_value, max_value)`` pair passed to
            ``check_valid_range``.

    Returns:
        A dict with the keys 'missing_data' (result of
        ``check_completeness``), 'duplicates' (result of
        ``check_duplicates``) and, for each check that was not skipped,
        'invalid_emails', 'invalid_ages' and 'outdated_data' (row counts).
    """
    quality_report = {
        # Completeness
        'missing_data': check_completeness(df),
        # Duplicates
        'duplicates': check_duplicates(df),
    }

    # Validity check for the email column
    if email_column is not None:
        invalid_emails = check_validity(df, email_column, email_regex)
        quality_report['invalid_emails'] = invalid_emails.shape[0]

    # Valid range check for the age column
    if age_column is not None:
        min_age, max_age = age_range
        invalid_ages = check_valid_range(df, age_column, min_age, max_age)
        quality_report['invalid_ages'] = invalid_ages.shape[0]

    # Timeliness check for the date column
    if date_column is not None:
        outdated = check_timeliness(df, date_column)
        quality_report['outdated_data'] = outdated.shape[0]

    return quality_report

# Step 9: Save the Data Quality Report
def save_report(quality_report, output_file):
    """
    Save the generated data quality report to a CSV file with a single row.

    Scalar entries become one column each. Entries that are a pandas Series
    (such as per-column missing-data percentages) are expanded into one
    column per Series label, named ``<key>_<label>``. Building the frame
    directly from the dict with ``index=[0]`` would reindex such a Series to
    ``[0]`` and write NaN in place of its values.

    Args:
        quality_report: Mapping of metric name to a scalar or a Series.
        output_file: Path or writable buffer passed to ``DataFrame.to_csv``.
    """
    flat_report = {}
    for metric, value in quality_report.items():
        if isinstance(value, pd.Series):
            # Expand per-column metrics so every value survives in the CSV.
            for label, item in value.items():
                flat_report[f"{metric}_{label}"] = item
        else:
            flat_report[metric] = value

    quality_report_df = pd.DataFrame([flat_report])
    quality_report_df.to_csv(output_file, index=False)

# Main function to automate the data quality measurement
def main(file_path, output_report_file):
    """
    Run the data quality workflow end to end for one CSV file.

    Loads the data, builds the quality report, writes it to
    ``output_report_file``, shows the missing-data heatmap and prints the
    report.

    Args:
        file_path: Path of the CSV file to assess.
        output_report_file: Destination path for the CSV report.
    """
    frame = load_data(file_path)
    report = generate_data_quality_report(frame)

    # Persist first, then display.
    save_report(report, output_report_file)
    visualize_missing_data(frame)

    print("Data Quality Report:", report, sep="\n")

# Run the script
if __name__ == "__main__":
    # The dunder names are required: `_name_` is an undefined variable and
    # would raise NameError as soon as the file is executed or imported.
    # Replace with your file path and desired output report path
    input_file = 'path_to_your_data.csv'
    output_report_file = 'data_quality_report.csv'

    main(input_file, output_report_file)