In [4]:
# Ques_14.ipynb - Data Quality Automation Tools (Improved Version)

# 📌 Step 1: Import necessary libraries
import pandas as pd
import numpy as np

# Optional: install great_expectations (if allowed in your environment)
# !pip install great_expectations

# 📌 Step 2: Sample DataFrame (replace with your own data)
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Alice'],
    'Age': [25, np.nan, 35, 40, -5, 25],
    'Salary': [50000, 60000, None, 70000, 55000, 50000],
    'Department': ['HR', 'Finance', 'IT', 'IT', 'Unknown', 'HR']
}
df = pd.DataFrame(data)

print("✅ Sample Data:")
display(df)

# 📌 Step 3: Error Handling Function for Missing Columns and Invalid Data Types
def check_for_missing_columns(df, expected_columns):
    missing_cols = [col for col in expected_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"❌ Missing required columns: {', '.join(missing_cols)}")
    print("✅ All required columns are present.")

def check_for_data_types(df, expected_types):
    for col, dtype in expected_types.items():
        if not pd.api.types.is_dtype_equal(df[col].dtype, dtype):
            raise TypeError(f"⚠️ Column '{col}' does not have the expected type: {dtype}")
    print("✅ All columns have correct data types.")

# 📌 Step 4: Handle Missing Data Upfront (Avoid Redundant dropna Calls)
df.fillna({
    'Age': df['Age'].median(),  # Fill missing 'Age' with median
    'Salary': df['Salary'].mean(),  # Fill missing 'Salary' with mean
    'Department': 'Unknown'  # Fill missing 'Department' with 'Unknown'
}, inplace=True)

print("\n🔧 After Handling Missing Data:")
display(df)

# 📌 Step 5: Detect Duplicates
duplicates = df[df.duplicated()]
if not duplicates.empty:
    print("\n🔁 Duplicate Rows Found:")
    display(duplicates)
else:
    print("\n✅ No duplicate rows found.")

# 📌 Step 6: Detect Outliers in Numerical Columns (using IQR)
def detect_outliers_iqr(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return column[(column < lower) | (column > upper)]

print("\n📊 Outliers in Age:")
outliers_age = detect_outliers_iqr(df['Age'])
if not outliers_age.empty:
    display(outliers_age)
else:
    print("✅ No outliers detected in 'Age'.")

print("\n📊 Outliers in Salary:")
outliers_salary = detect_outliers_iqr(df['Salary'])
if not outliers_salary.empty:
    display(outliers_salary)
else:
    print("✅ No outliers detected in 'Salary'.")

# 📌 Step 7: Schema Validation (more robust)
expected_columns = {
    'Name': str,
    'Age': (int, float),
    'Salary': (int, float),
    'Department': str
}

print("\n🧾 Schema Validation:")
try:
    check_for_missing_columns(df, expected_columns.keys())
    check_for_data_types(df, expected_columns)
except (ValueError, TypeError) as e:
    print(f"❌ Error: {e}")

# 📌 Step 8: Categorical Consistency Check
expected_departments = ['HR', 'Finance', 'IT', 'Marketing']
invalid_departments = df[~df['Department'].isin(expected_departments)]

if not invalid_departments.empty:
    print("\n🚨 Invalid Department Entries:")
    display(invalid_departments[['Name', 'Department']])
else:
    print("✅ All departments are valid.")

# 📌 Final Summary
print("\n🎯 Data Quality Checks Completed.")


✅ Sample Data:


Unnamed: 0,Name,Age,Salary,Department
0,Alice,25.0,50000.0,HR
1,Bob,,60000.0,Finance
2,Charlie,35.0,,IT
3,David,40.0,70000.0,IT
4,Eve,-5.0,55000.0,Unknown
5,Alice,25.0,50000.0,HR



🔧 After Handling Missing Data:


Unnamed: 0,Name,Age,Salary,Department
0,Alice,25.0,50000.0,HR
1,Bob,25.0,60000.0,Finance
2,Charlie,35.0,57000.0,IT
3,David,40.0,70000.0,IT
4,Eve,-5.0,55000.0,Unknown
5,Alice,25.0,50000.0,HR



🔁 Duplicate Rows Found:


Unnamed: 0,Name,Age,Salary,Department
5,Alice,25.0,50000.0,HR



📊 Outliers in Age:


4   -5.0
Name: Age, dtype: float64


📊 Outliers in Salary:
✅ No outliers detected in 'Salary'.

🧾 Schema Validation:
✅ All required columns are present.
❌ Error: ⚠️ Column 'Name' does not have the expected type: <class 'str'>

🚨 Invalid Department Entries:


Unnamed: 0,Name,Department
4,Eve,Unknown



🎯 Data Quality Checks Completed.
