## Check Accuracy & Completeness

**Objective**: Learn to assess data quality by checking for accuracy and completeness using Python.

For this, you will use a sample dataset, `students.csv`, which contains the following
columns: `ID`, `Name`, `Age`, `Grade`, `Email`.

**Steps**:
1. Check Accuracy
    - Verify Numerical Data Accuracy
    - Validate Email Format
    - Integer Accuracy Check for Age
2. Check Completeness
    - Identify Missing Values
    - Rows with Missing Data
    - Column Specific Missing Value Check

In [3]:
import pandas as pd
import re

# Load the dataset from students.csv (the path is relative to the working directory)
df = pd.read_csv('students.csv')

# Print the column names exactly as parsed, before any clean-up, so that
# unexpected headers or stray whitespace are visible
print("Columns in the DataFrame:", df.columns)

# Remove any leading or trailing spaces from column names so that later
# lookups such as 'Age' or 'Email' match exactly
df.columns = df.columns.str.strip()

# Step 1: Check Accuracy

# 1.1 Verify Numerical Data Accuracy (Age and Grade should be within reasonable ranges)
def check_numerical_accuracy(df, column, min_value, max_value):
    """Return the rows whose value in *column* is not a number in range.

    A row is reported when its value is below ``min_value``, above
    ``max_value``, or present but not interpretable as a number (for example
    the text ``"abc"``). Missing values are not reported here.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to check.
        min_value: Smallest accepted value (inclusive).
        max_value: Largest accepted value (inclusive).

    Returns:
        A DataFrame containing only the offending rows, with the original
        index and columns.

    Raises:
        KeyError: If *column* is not a column of *df*.
    """
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")

    raw = df[column]
    # A single stray text cell makes read_csv load the whole column as
    # object dtype, and comparing strings with numbers raises TypeError.
    # Coerce first so the range check works on any column.
    numeric = pd.to_numeric(raw, errors='coerce')

    # Values that were present but could not be parsed became NaN above.
    not_a_number = numeric.isna() & raw.notna()
    out_of_range = (numeric < min_value) | (numeric > max_value)
    return df[out_of_range | not_a_number]

# 1.2 Validate Email Format using regex
def validate_email_format(df, column):
    """Return the rows whose value in *column* is not a well-formed email.

    Every value is converted to text and must match, in its entirety, the
    shape ``local@domain.tld`` where ``local`` and ``domain`` consist of word
    characters, dots and hyphens. Values that are not email-shaped, including
    missing ones, are reported.

    Args:
        df: DataFrame to inspect.
        column: Name of the column holding email addresses.

    Returns:
        A DataFrame containing only the rows with an invalid email, with the
        original index and columns.

    Raises:
        KeyError: If *column* is not a column of *df*.
    """
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")

    # fullmatch instead of match with a trailing '$': '$' also matches just
    # before a final newline, so "user@example.com\n" would pass as valid.
    email_pattern = r"[\w\.-]+@[\w\.-]+\.\w+"
    is_valid = df[column].astype(str).str.fullmatch(email_pattern, na=False)
    return df[~is_valid]

# 1.3 Integer Accuracy Check for Age (Check if Age is a whole number)
def check_integer_accuracy(df, column):
    """Return the rows whose value in *column* is not a whole number.

    The check looks at the numeric value, not at the Python type used to
    store it: ``20`` and ``20.0`` are both accepted, while ``20.5``, text
    that is not a number, infinities and missing values are reported.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to check.

    Returns:
        A DataFrame containing only the offending rows, with the original
        index and columns.

    Raises:
        KeyError: If *column* is not a column of *df*.
    """
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")

    # A single missing cell makes read_csv load the column as float64, so an
    # isinstance(x, int) test would report every row, even 20.0. Compare the
    # values instead of their storage type.
    numeric = pd.to_numeric(df[column], errors='coerce')
    # NaN and infinities yield NaN from the modulo and therefore fail the test.
    is_whole = numeric.notna() & (numeric % 1 == 0)
    return df[~is_whole]

# Step 2: Check Completeness

# 2.1 Count the missing values in every column of the DataFrame
def check_missing_values(df):
    """Return a Series mapping each column name to its number of missing cells."""
    missing_mask = df.isna()
    per_column_counts = missing_mask.sum()
    return per_column_counts

# 2.2 Select the rows that have a missing value in at least one column
def rows_with_missing_data(df):
    """Return the rows of *df* that contain one or more missing cells."""
    missing_mask = df.isna()
    row_is_incomplete = missing_mask.any(axis=1)
    return df.loc[row_is_incomplete]

# 2.3 Select the rows where one particular column is missing
def check_column_specific_missing(df, column):
    """Return the rows of *df* whose value in *column* is missing.

    Raises:
        KeyError: If *column* is not a column of *df*.
    """
    column_exists = column in df.columns
    if not column_exists:
        raise KeyError(f"Column '{column}' not found in DataFrame.")
    value_is_missing = df[column].isna()
    return df.loc[value_is_missing]

# Example Usage
# Runs every check against the loaded DataFrame `df` and prints the findings.

# Range checks: Age must lie between 0 and 120, Grade between 0 and 100
age_violations = check_numerical_accuracy(df, 'Age', 0, 120)
grade_violations = check_numerical_accuracy(df, 'Grade', 0, 100)

# Validate email format
invalid_emails = validate_email_format(df, 'Email')

# Check for integer accuracy in Age
invalid_ages = check_integer_accuracy(df, 'Age')

# Count of missing values per column
missing_values = check_missing_values(df)

# Rows with at least one missing value
rows_with_missing = rows_with_missing_data(df)

# Column-specific missing data (example: check missing emails)
missing_emails = check_column_specific_missing(df, 'Email')

# Display results. Every label ends with a newline so that the header row of
# the printed frame starts at column 0 and lines up with its data rows.
print(f"Age Violations:\n{age_violations}")
print(f"Grade Violations:\n{grade_violations}")
print(f"Invalid Emails:\n{invalid_emails}")
print(f"Invalid Ages (non-integer):\n{invalid_ages}")
print(f"Missing Values per Column:\n{missing_values}")
print(f"Rows with Missing Data:\n{rows_with_missing}")
print(f"Rows with Missing Emails:\n{missing_emails}")


Columns in the DataFrame: Index(['Age Violations: Empty DataFrame'], dtype='object')


KeyError: "Column 'Age' not found in DataFrame."