In [1]:
# Ques_14.ipynb - Data Quality Automation Tools

# 📌 Step 1: Import necessary libraries
import pandas as pd
import numpy as np

# Optional: great_expectations is not required by the checks in this cell;
# install it only if you extend the notebook with it (and your environment allows it).
# %pip install great_expectations

# 📌 Step 2: Sample DataFrame (replace with your own data)
# The sample deliberately contains one problem per check below: a missing Age,
# a missing Salary, a negative Age, an 'Unknown' department, and a last row
# that repeats the first row exactly.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Alice'],
    Age=[25, np.nan, 35, 40, -5, 25],
    Salary=[50000, 60000, None, 70000, 55000, 50000],
    Department=['HR', 'Finance', 'IT', 'IT', 'Unknown', 'HR'],
)
df = pd.DataFrame(data)

print("✅ Sample Data:")
display(df)

# 📌 Step 3: Detect Missing Values
# Count the null cells in each column.
print("\n🔍 Missing Values:")
print(df.isna().sum())

# 📌 Step 4: Detect Duplicates
# duplicated() marks every repeat of an earlier row; the first occurrence is not marked.
print("\n🔁 Duplicate Rows:")
print(df.loc[df.duplicated()])

# 📌 Step 5: Detect Outliers in Numerical Columns (using IQR)
def detect_outliers_iqr(column: pd.Series, factor: float = 1.5) -> pd.Series:
    """Return the values of ``column`` that lie outside its IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``column`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    column : pd.Series
        Numeric values to screen. NaN entries are never reported, because a
        comparison against NaN is False.
    factor : float, default 1.5
        Fence width in multiples of the IQR. The default reproduces the
        multiplier this function previously hard-coded.

    Returns
    -------
    pd.Series
        The entries of ``column`` strictly below the lower fence or strictly
        above the upper fence, with their original index labels. Empty when
        nothing falls outside the fences.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        # A negative width would put the fences inside the quartiles.
        raise ValueError(f"factor must be non-negative, got {factor!r}")

    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    margin = factor * (q3 - q1)
    lower, upper = q1 - margin, q3 + margin
    return column[(column < lower) | (column > upper)]

# Run the same IQR screen on each numeric column; nulls are dropped first so
# only observed values are screened.
for heading, values in (
    ("\n📊 Outliers in Age:", df['Age']),
    ("\n📊 Outliers in Salary:", df['Salary']),
):
    print(heading)
    print(detect_outliers_iqr(values.dropna()))

# 📌 Step 6: Schema Validation
# Maps each required column to the Python type (or tuple of types) that every
# non-null value in that column must be an instance of.
expected_columns = {
    'Name': str,
    'Age': (int, float),
    'Salary': (int, float),
    'Department': str
}

print("\n🧾 Schema Validation:")
for col, dtype in expected_columns.items():
    if col not in df.columns:
        print(f"❌ Missing column: {col}")
        continue

    # Distinct Python types observed among the non-null values of the column.
    actual_dtype = df[col].dropna().map(type).unique()

    if len(actual_dtype) == 0:
        # Nothing to inspect: report it instead of claiming a match or mismatch.
        print(f"⚠️ Column '{col}' has no non-null values; type cannot be verified.")
    elif all(issubclass(t, dtype) for t in actual_dtype):
        # EVERY observed type must be acceptable. Requiring only one (any)
        # lets a mixed-type column slip through as valid.
        print(f"✅ Column '{col}' matches expected type.")
    else:
        print(f"⚠️ Column '{col}' has incorrect data type(s): {actual_dtype}")

# 📌 Step 7: Categorical Consistency Check
# Keep the full rows whose Department is not one of the allowed labels.
expected_departments = ['HR', 'Finance', 'IT', 'Marketing']
invalid_departments = df.loc[~df['Department'].isin(expected_departments)]

print("\n🚨 Invalid Department Entries:")
# Only the identifying columns are shown; invalid_departments keeps every column.
print(invalid_departments.loc[:, ['Name', 'Department']])

# 📌 Final Summary
print("\n🎯 Data Quality Checks Completed.")


✅ Sample Data:


Unnamed: 0,Name,Age,Salary,Department
0,Alice,25.0,50000.0,HR
1,Bob,,60000.0,Finance
2,Charlie,35.0,,IT
3,David,40.0,70000.0,IT
4,Eve,-5.0,55000.0,Unknown
5,Alice,25.0,50000.0,HR



🔍 Missing Values:
Name          0
Age           1
Salary        1
Department    0
dtype: int64

🔁 Duplicate Rows:
    Name   Age   Salary Department
5  Alice  25.0  50000.0         HR

📊 Outliers in Age:
4   -5.0
Name: Age, dtype: float64

📊 Outliers in Salary:
Series([], Name: Salary, dtype: float64)

🧾 Schema Validation:
✅ Column 'Name' matches expected type.
✅ Column 'Age' matches expected type.
✅ Column 'Salary' matches expected type.
✅ Column 'Department' matches expected type.

🚨 Invalid Department Entries:
  Name Department
4  Eve    Unknown

🎯 Data Quality Checks Completed.
