### Task 1: Data Profiling to Understand Data Quality
**Description**: Use basic statistical methods to profile a dataset and identify potential quality issues.

**Steps**:
1. Load the dataset using pandas in Python.
2. Understand the data by checking its basic statistics.
3. Identify null values.
4. Check unique values for categorical columns.
5. Review outliers using box plots.

In [None]:
# Write your code below (corrected version).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the dataset using pandas in Python.
# The records below are an in-memory stand-in; swap in your real data source.
data = {
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Gender': [
        'Male', 'Female', 'Male', 'Female', 'Male',
        'Female', 'Male', 'Female', 'Male', 'Male',
    ],
    'Age': [25, 30, np.nan, 40, 22, 35, 28, 32, 45, 27],
    'Income': [50000, 60000, 75000, 80000, 55000, np.nan, 65000, 70000, 90000, 52000],
    'City': [
        'Bangalore', 'Mumbai', 'Delhi', 'Bangalore', 'Chennai',
        'Mumbai', 'Delhi', 'Bangalore', 'Mumbai', 'Chennai',
    ],
    'Score': [85, 78, 92, 68, 80, 75, 88, 95, 70, 82],
    'Category': ['A', 'B', 'A', 'C', 'B', 'A', 'B', 'C', 'A', 'B'],
}
df = pd.DataFrame(data)

# Printed after every report section so the sections stay visually separated.
section_break = "\n" + "="*30 + "\n"

print("Loaded DataFrame:")
print(df)
print(section_break)

# Step 2: Understand the data by checking its basic statistics.
# describe() summarises the numeric columns (count, mean, std, quartiles, ...).
print("Basic Statistics:")
summary_stats = df.describe()
print(summary_stats)
print(section_break)

# Step 3: Identify null values.
# One count of missing entries per column.
print("Null Values:")
nulls_per_column = df.isnull().sum()
print(nulls_per_column)
print(section_break)

# Step 4: Check unique values for categorical columns.
# Columns stored with the 'object' dtype are treated as categorical here.
categorical_cols = df.select_dtypes(include='object').columns
print("Unique Values in Categorical Columns:")
for name in categorical_cols:
    column = df[name]
    print(f"Column '{name}': {column.nunique()} unique values - {column.unique()}")
print(section_break)

# Step 5: Review outliers using box plots.
# One box plot per numeric column, arranged on a grid with two rows.
numerical_cols = df.select_dtypes(include=np.number).columns
print("Box Plots for Numerical Columns:")
plt.figure(figsize=(12, 8))
grid_columns = (len(numerical_cols) + 1) // 2  # half the plots per row, rounded up
for position, name in enumerate(numerical_cols, start=1):
    plt.subplot(2, grid_columns, position)
    sns.boxplot(x=df[name])
    plt.title(f'Box Plot of {name}')
plt.tight_layout()
plt.show()


### Task 2: Implement Simple Data Validation
**Description**: Write a Python script to validate the data types and constraints of each column in a dataset.

**Steps**:
1. Define constraints for each column.
2. Validate each column based on its constraints.

In [None]:
# Write your code below (corrected version).
import pandas as pd
import numpy as np

def _dtype_error(series, col, expected_dtype):
    """Return an error message if *series* does not match *expected_dtype*, else None."""
    if expected_dtype == 'int':
        if not pd.api.types.is_integer_dtype(series):
            return f"Column '{col}' should be of integer type, but is {series.dtype}."
    elif expected_dtype == 'float':
        if not pd.api.types.is_float_dtype(series):
            return f"Column '{col}' should be of float type, but is {series.dtype}."
    elif expected_dtype == 'datetime':
        try:
            pd.to_datetime(series, errors='raise')
        except Exception:
            # Deliberately broad: any failure to parse means the column is
            # not usable as datetimes, whatever exception pandas raised.
            return f"Column '{col}' should be of datetime type, but contains invalid formats."
    elif expected_dtype in ('category', 'object'):
        # Both are accepted for any underlying storage; use 'allowed_values'
        # (or other rules) to constrain the actual content.
        pass
    elif expected_dtype != str(series.dtype):
        return f"Column '{col}' should be of type '{expected_dtype}', but is '{series.dtype}'."
    return None


def validate_data(df: pd.DataFrame, constraints: dict) -> dict:
    """
    Validates the data types and constraints of each column in a DataFrame.

    Supported rule keys per column:
        'dtype': 'int', 'float', 'datetime', 'category', 'object', or any
            other string, which is compared against ``str(series.dtype)``.
        'min' / 'max': inclusive bounds; only applied to numeric columns.
        'allowed_values': collection of permitted values. Applied to columns
            of any dtype (object, category, numeric, ...). Missing values
            are reported as not allowed unless the collection contains them.

    Args:
        df (pd.DataFrame): The DataFrame to validate.
        constraints (dict): A dictionary defining the constraints for each column.
                           The keys are column names, and the values are dictionaries
                           holding the rule keys described above.

    Returns:
        dict: A dictionary containing the validation results for each column.
              Keys are column names, and values are lists of validation errors
              found in that column. A column without errors gets the single
              entry "Column passed validation.".
    """
    validation_errors = {}
    for col, rules in constraints.items():
        errors = []
        validation_errors[col] = errors

        if col not in df.columns:
            errors.append(f"Column '{col}' not found in DataFrame.")
            continue

        series = df[col]

        # Validate data type
        expected_dtype = rules.get('dtype')
        if expected_dtype:
            try:
                dtype_message = _dtype_error(series, col, expected_dtype)
            except AttributeError:
                dtype_message = f"Invalid dtype specified for column '{col}': {expected_dtype}"
            if dtype_message is not None:
                errors.append(dtype_message)

        # Range rules only make sense for numeric data; NaN compares False
        # against any bound, so missing values are never reported here.
        is_numeric = pd.api.types.is_numeric_dtype(series)

        # Validate minimum value
        min_val = rules.get('min')
        if min_val is not None and is_numeric:
            invalid_min = series[series < min_val]
            if not invalid_min.empty:
                errors.append(f"Values in column '{col}' are below the minimum allowed value ({min_val}): {invalid_min.tolist()}")

        # Validate maximum value
        max_val = rules.get('max')
        if max_val is not None and is_numeric:
            invalid_max = series[series > max_val]
            if not invalid_max.empty:
                errors.append(f"Values in column '{col}' exceed the maximum allowed value ({max_val}): {invalid_max.tolist()}")

        # Validate allowed values. This is checked regardless of the column's
        # storage dtype: gating on the object dtype would silently skip the
        # rule for category, string-dtype and numeric columns.
        allowed_values = rules.get('allowed_values')
        if allowed_values:
            invalid_values = series[~series.isin(allowed_values)]
            if not invalid_values.empty:
                errors.append(f"Values in column '{col}' are not in the allowed set ({allowed_values}): {invalid_values.unique().tolist()}")

        # Add more validation rules as needed (e.g., regex patterns, unique values)

        if not errors:
            errors.append("Column passed validation.")

    return validation_errors

# Step 1: Define constraints for each column.
# Each entry maps a column name to the rules validate_data() should enforce.
data_constraints = {
    'ID': {
        'dtype': 'int',
        'min': 1,
    },
    'Gender': {
        'dtype': 'category',
        'allowed_values': ['Male', 'Female'],
    },
    'Age': {
        'dtype': 'int',
        'min': 18,
        'max': 100,
    },
    'Income': {
        'dtype': 'float',
        'min': 0,
    },
    'City': {
        'dtype': 'object',
    },
    'Score': {
        'dtype': 'int',
        'min': 0,
        'max': 100,
    },
    'Category': {
        'dtype': 'category',
        'allowed_values': ['A', 'B', 'C'],
    },
}

# Step 2: Validate each column based on its constraints.
# Sample records that deliberately break several of the rules above
# (replace with your actual data loading).
data = {
    'ID': [1, 2, 0, 4, 5],
    'Gender': ['Male', 'Female', 'Other', 'Female', 'Male'],
    'Age': [25, 15, 30, 110, 22],
    'Income': [50000.0, -100, 75000.0, 80000.0, np.nan],
    'City': ['Bangalore', 'Mumbai', 'Delhi', 'Bangalore', 'Chennai'],
    'Score': [85, 78, 105, 68, 80],
    'Category': ['A', 'B', 'D', 'C', 'B'],
}
df = pd.DataFrame(data)

validation_results = validate_data(df, data_constraints)

# Report: one heading per column followed by a bullet per message.
print("Data Validation Results:")
for column in validation_results:
    print(f"\nColumn '{column}':")
    for message in validation_results[column]:
        print(f"- {message}")

### Task 3: Detect Missing Data Patterns
**Description**: Analyze and visualize missing data patterns in a dataset.

**Steps**:
1. Visualize missing data using a heatmap.
2. Identify patterns in missing data.

In [None]:
# Write your code below (corrected version).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def visualize_missing_heatmap(df: pd.DataFrame):
    """
    Shows which cells of a DataFrame are missing as a heatmap.

    The plotted matrix is the boolean null mask of ``df``, so the two
    colours distinguish missing from present cells. The figure is displayed
    immediately; nothing is returned.

    Args:
        df (pd.DataFrame): The input DataFrame.
    """
    plt.figure(figsize=(10, 8))
    null_mask = df.isnull()
    # No colour bar: the mask only has two values, so a scale adds nothing.
    heatmap_axes = sns.heatmap(null_mask, cmap='viridis', cbar=False)
    heatmap_axes.set_title('Missing Data Heatmap')
    plt.show()
def identify_missing_patterns(df: pd.DataFrame):
    """
    Identifies potential patterns in missing data and prints three reports.

    1. Per-column missing count and percentage (columns without missing
       values are omitted).
    2. For every distinct combination of columns that are missing together
       in a row, the number of rows showing exactly that combination,
       most frequent first.
    3. For every ordered pair of columns, the percentage of rows missing in
       the second column among the rows where the first is missing (pairs
       with 0% are omitted).

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        None. All results are written to standard output.
    """
    from collections import Counter  # local import keeps this block self-contained

    # Computed once and reused by all three reports.
    missing_matrix = df.isnull()

    null_counts = missing_matrix.sum().sort_values(ascending=False)
    total_rows = len(df)
    missing_percentage = (null_counts / total_rows) * 100

    print("Missing Data Summary:")
    missing_info = pd.DataFrame({'Missing Count': null_counts, 'Missing Percentage': missing_percentage})
    print(missing_info[missing_info['Missing Count'] > 0])
    print("\n" + "="*30 + "\n")

    # Investigate co-occurrence of missing values.
    # Count identical row masks first, then translate each *distinct* mask to
    # its column names once instead of building a Series for every row.
    # Counter keeps first-seen order, so ties below stay in row order.
    columns = missing_matrix.columns
    mask_counts = Counter(map(tuple, missing_matrix.to_numpy().tolist()))
    missing_combinations = {
        tuple(columns[np.array(mask, dtype=bool)].sort_values().tolist()): count
        for mask, count in mask_counts.items()
        if any(mask)
    }

    if missing_combinations:
        print("Co-occurrence of Missing Values:")
        for cols, count in sorted(missing_combinations.items(), key=lambda item: item[1], reverse=True):
            print(f"Columns '{cols}': {count} missing rows")
        print("\n" + "="*30 + "\n")
    else:
        print("No co-occurrence of missing values found.\n" + "="*30 + "\n")

    # Further analysis: Check if missingness in one column affects another.
    # A pair can only yield a non-zero percentage when both columns contain
    # missing values, so every other column is skipped up front.
    cols_with_missing = [col for col in df.columns if missing_matrix[col].any()]
    for col1 in cols_with_missing:
        col1_missing = missing_matrix[col1]
        col1_missing_count = col1_missing.sum()  # > 0 thanks to the filter above
        for col2 in cols_with_missing:
            if col1 == col2:
                continue
            both_missing_count = (col1_missing & missing_matrix[col2]).sum()
            percentage_missing_in_col2_when_col1_missing = (both_missing_count / col1_missing_count) * 100
            if percentage_missing_in_col2_when_col1_missing > 0:
                print(f"Percentage of missing values in '{col2}' when '{col1}' is missing: {percentage_missing_in_col2_when_col1_missing:.2f}%")
    print("\n" + "="*30 + "\n")

# Sample DataFrame with missing data
# np.nan marks the gaps; several rows have more than one gap so that
# co-occurring missing values show up in the analysis.
data_missing = {
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Age': [25, np.nan, 30, np.nan, 22, 35, 28, 32, 45, 27],
    'Income': [50000, 60000, np.nan, 80000, 55000, np.nan, 65000, 70000, 90000, 52000],
    'Education': [
        'Bachelor', 'Master', 'PhD', 'Bachelor', np.nan,
        'Master', 'Bachelor', 'PhD', np.nan, 'Bachelor',
    ],
    'Occupation': [
        'Engineer', np.nan, 'Scientist', 'Analyst', 'Engineer',
        np.nan, 'Scientist', 'Analyst', 'Manager', 'Engineer',
    ],
    'City': [
        'Bangalore', 'Mumbai', 'Delhi', 'Bangalore', 'Chennai',
        'Mumbai', 'Delhi', np.nan, 'Mumbai', 'Chennai',
    ],
    'Score': [85, 78, 92, 68, 80, 75, np.nan, 95, 70, 82],
}
df_missing = pd.DataFrame(data_missing)

# Each step below receives its own copy of the frame, leaving df_missing
# itself untouched.

# Step 1: Visualize missing data
heatmap_input = df_missing.copy()
visualize_missing_heatmap(heatmap_input)

# Step 2: Identify patterns in missing data
pattern_input = df_missing.copy()
identify_missing_patterns(pattern_input)

### Task 4: Integrate Automated Data Quality Checks
**Description**: Integrate automated data quality checks using the Great Expectations library for a dataset.

**Steps**:
1. Install and initialize Great Expectations.
2. Set up Great Expectations for the dataset (e.g., create a data context and an expectation suite).
3. Add further checks and validate.