In [5]:
# Ques_14.ipynb - Data Quality Automation Tools (Optimized Version)

# 📌 Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import re

# Optional: install great_expectations (if allowed in your environment)
# !pip install great_expectations

# 📌 Step 2: Sample DataFrame (replace with your own data)
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Alice'],
    'Age': [25, np.nan, 35, 40, -5, 25],
    'Salary': [50000, 60000, None, 70000, 55000, 50000],
    'Department': ['HR', 'Finance', 'IT', 'IT', 'Unknown', 'HR']
}
df = pd.DataFrame(data)

print("✅ Sample Data:")
display(df)

# 📌 Step 3: Error Handling Function for Missing Columns and Invalid Data Types
def check_for_missing_columns(df, expected_columns):
    missing_cols = [col for col in expected_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"❌ Missing required columns: {', '.join(missing_cols)}")
    print("✅ All required columns are present.")

def check_for_data_types(df, expected_types):
    for col, dtype in expected_types.items():
        if not pd.api.types.is_dtype_equal(df[col].dtype, dtype):
            print(f"⚠️ Column '{col}' has incorrect type. Expected: {dtype}, Found: {df[col].dtype}")
    print("✅ Data types check completed.")

# 📌 Step 4: Handle Missing Data Upfront (Check if columns contain NaN values before applying fillna)
def handle_missing_data(df):
    for col in df.columns:
        if df[col].isnull().any():  # Only apply fillna if there are missing values
            if df[col].dtype in ['float64', 'int64']:
                fill_value = df[col].median() if df[col].dtype != 'object' else 'Unknown'
            else:
                fill_value = 'Unknown'
            df[col].fillna(fill_value, inplace=True)
    print("✅ Missing data handled efficiently.")
    return df

df = handle_missing_data(df)

# 📌 Step 5: Optimized Duplicate Check (using hashing for large DataFrames)
def check_duplicates(df):
    if len(df) > 1000:  # Example threshold for larger datasets
        # Use a more performant deduplication strategy (hashing)
        duplicates = df[df.duplicated(subset=None, keep='first')]
    else:
        duplicates = df[df.duplicated()]
    
    if not duplicates.empty:
        print("\n🔁 Duplicate Rows Found:")
        display(duplicates)
    else:
        print("\n✅ No duplicate rows found.")
        
check_duplicates(df)

# 📌 Step 6: Outlier Detection with User-defined Sensitivity (IQR method)
def detect_outliers_iqr(column, sensitivity=1.5):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - sensitivity * IQR
    upper = Q3 + sensitivity * IQR
    return column[(column < lower) | (column > upper)]

print("\n📊 Outliers in Age (Sensitivity=1.5):")
outliers_age = detect_outliers_iqr(df['Age'], sensitivity=1.5)
if not outliers_age.empty:
    display(outliers_age)
else:
    print("✅ No outliers detected in 'Age'.")

print("\n📊 Outliers in Salary (Sensitivity=1.5):")
outliers_salary = detect_outliers_iqr(df['Salary'], sensitivity=1.5)
if not outliers_salary.empty:
    display(outliers_salary)
else:
    print("✅ No outliers detected in 'Salary'.")

# 📌 Step 7: Schema Validation with Regex Validation for Strings (Name, Department)
def validate_schema(df, expected_columns):
    for col, dtype in expected_columns.items():
        # Validate data type first
        if not pd.api.types.is_dtype_equal(df[col].dtype, dtype):
            print(f"⚠️ Column '{col}' has incorrect type. Expected: {dtype}, Found: {df[col].dtype}")
        # Regex validation for strings like 'Name' and 'Department'
        if col in ['Name', 'Department']:
            if col == 'Name':
                # Validate Name should only contain alphabets and spaces
                regex = r"^[A-Za-z ]+$"
            elif col == 'Department':
                # Validate Department should be a valid name from predefined list
                regex = r"^[A-Za-z]+$"
            
            invalid_values = df[~df[col].str.match(regex, na=False)]
            if not invalid_values.empty:
                print(f"⚠️ Invalid entries in '{col}':")
                display(invalid_values[[col]])
    print("✅ Schema validation completed.")

# Define expected columns and their types
expected_columns = {
    'Name': str,
    'Age': (int, float),
    'Salary': (int, float),
    'Department': str
}

validate_schema(df, expected_columns)

# 📌 Final Summary
print("\n🎯 Data Quality Checks Completed.")


✅ Sample Data:


Unnamed: 0,Name,Age,Salary,Department
0,Alice,25.0,50000.0,HR
1,Bob,,60000.0,Finance
2,Charlie,35.0,,IT
3,David,40.0,70000.0,IT
4,Eve,-5.0,55000.0,Unknown
5,Alice,25.0,50000.0,HR


✅ Missing data handled efficiently.

🔁 Duplicate Rows Found:


Unnamed: 0,Name,Age,Salary,Department
5,Alice,25.0,50000.0,HR



📊 Outliers in Age (Sensitivity=1.5):


4   -5.0
Name: Age, dtype: float64


📊 Outliers in Salary (Sensitivity=1.5):
✅ No outliers detected in 'Salary'.
⚠️ Column 'Name' has incorrect type. Expected: <class 'str'>, Found: object
⚠️ Column 'Age' has incorrect type. Expected: (<class 'int'>, <class 'float'>), Found: float64
⚠️ Column 'Salary' has incorrect type. Expected: (<class 'int'>, <class 'float'>), Found: float64
⚠️ Column 'Department' has incorrect type. Expected: <class 'str'>, Found: object
✅ Schema validation completed.

🎯 Data Quality Checks Completed.
