In [1]:
# Data Drift Impact on Model
# Question: Use a simple linear regression model to demonstrate how data drift affects model predictions.

# 1. Train a model on the original data:
# 2. Evaluate on the drifted data:
# 3. Compare errors:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 1. Original data (training)
np.random.seed(42)
X_train = np.random.rand(100, 1) * 10
y_train = 2 * X_train.squeeze() + 1 + np.random.randn(100) * 2

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# 2. Drifted data (shifted feature distribution)
X_drifted = np.random.rand(100, 1) * 10 + 5  # Shifted distribution
y_drifted = 2 * X_drifted.squeeze() + 1 + np.random.randn(100) * 2

# Evaluate on original and drifted data
y_pred_train = model.predict(X_train)
y_pred_drifted = model.predict(X_drifted)

# 3. Compare errors
train_error = mean_squared_error(y_train, y_pred_train)
drifted_error = mean_squared_error(y_drifted, y_pred_drifted)

print(f"Training MSE: {train_error:.2f}")
print(f"Drifted data MSE: {drifted_error:.2f}")



Training MSE: 3.23
Drifted data MSE: 3.65


In [2]:
# Monitoring Data Distribution Changes
# Question: Use Python to monitor distribution changes in features to detect potential data drift.

# 1. Calculate feature statistics (mean and standard deviation) for both original and drifted data:
# 2. Compare statistics:
# 3. Set thresholds to detect significant drift:

import numpy as np
from scipy.stats import ks_2samp

# 1. Original and drifted data
np.random.seed(42)
original_data = np.random.normal(0, 1, 1000)
drifted_data = np.random.normal(1, 1.2, 1000)  # Mean and std deviation changed

# 2. Calculate statistics
original_mean, original_std = np.mean(original_data), np.std(original_data)
drifted_mean, drifted_std = np.mean(drifted_data), np.std(drifted_data)

# 3. Compare distributions and detect drift
ks_stat, p_value = ks_2samp(original_data, drifted_data)

# Set thresholds
MEAN_THRESHOLD = 0.5
STD_THRESHOLD = 0.3
KS_P_THRESHOLD = 0.05

print(f"Original mean: {original_mean:.2f}, Drifted mean: {drifted_mean:.2f}")
print(f"Original std: {original_std:.2f}, Drifted std: {drifted_std:.2f}")
print(f"KS-test p-value: {p_value:.4f}")

# Drift detection
mean_drift = abs(original_mean - drifted_mean) > MEAN_THRESHOLD
std_drift = abs(original_std - drifted_std) > STD_THRESHOLD
ks_drift = p_value < KS_P_THRESHOLD

if mean_drift or std_drift or ks_drift:
    print("Warning: Significant data drift detected!")
    if mean_drift:
        print(f"- Mean changed by {abs(original_mean - drifted_mean):.2f}")
    if std_drift:
        print(f"- Std deviation changed by {abs(original_std - drifted_std):.2f}")
    if ks_drift:
        print("- Distribution shape changed (KS-test)")
else:
    print("No significant drift detected")


Original mean: 0.02, Drifted mean: 1.09
Original std: 0.98, Drifted std: 1.20
KS-test p-value: 0.0000
- Mean changed by 1.07
- Distribution shape changed (KS-test)


In [3]:
# Automating Data Quality Checks with Python
# Question: Automate a basic data validation process using Python to ensure the dataset's
# structural integrity.

# 1. Define validation checks:
# 2. Apply validation:

import pandas as pd

def validate_data(df):
    # 1. Define validation checks
    checks = {
        'missing_values': df.isnull().sum().sum(),
        'duplicate_rows': df.duplicated().sum(),
        'negative_values': (df.select_dtypes(include=['number']) < 0).sum().sum(),
        'zero_values': (df.select_dtypes(include=['number']) == 0).sum().sum(),
        'data_types': df.dtypes
    }
    
    # 2. Apply validation and report results
    print("Data Quality Report:")
    print(f"Total rows: {len(df)}")
    print(f"Missing values: {checks['missing_values']}")
    print(f"Duplicate rows: {checks['duplicate_rows']}")
    print(f"Negative values in numeric columns: {checks['negative_values']}")
    print(f"Zero values in numeric columns: {checks['zero_values']}")
    print("\nData Types:")
    print(checks['data_types'])
    
    # Return validation status
    if checks['missing_values'] > 0 or checks['duplicate_rows'] > 0:
        return False
    return True

# Example usage
data = {
    'id': [1, 2, 3, 4, 4],
    'value': [10, -5, 0, 15, None],
    'category': ['A', 'B', 'A', 'C', 'C']
}

df = pd.DataFrame(data)
is_valid = validate_data(df)

print(f"\nData validation status: {'Passed' if is_valid else 'Failed'}")


Data Quality Report:
Total rows: 5
Missing values: 1
Duplicate rows: 0
Negative values in numeric columns: 1
Zero values in numeric columns: 1

Data Types:
id            int64
value       float64
category     object
dtype: object

Data validation status: Failed


In [None]:
# Introducing Great Expectations for Data Validation
# Question: Use Great Expectations to set up data validation checks for a dataset.

# 1. Install Great Expectations:
# 2. Create a new expectations suite:
# 3. Load data and generate expectations:



ConfigNotFoundError: Error: No great_expectations directory was found here!
    - Please check that you are in the correct directory or have specified the correct directory.
    - If you have never run Great Expectations in this project, please run `great_expectations init` to get started.


In [None]:
# Automating Constraint Checks with Python
# Question: Automate primary key and foreign key constraint checks using Python to ensure dataset compliance.
# 1. Assuming datasets exist with primary and foreign key relationships in pandas dataframes employees_df and departments_df :

import pandas as pd
from typing import Dict, Tuple, Optional

class ConstraintValidator:
    def __init__(self, employees_df: pd.DataFrame, departments_df: pd.DataFrame):
        """
        Initialize validator with employee and department DataFrames
        
        Args:
            employees_df: DataFrame containing employee records
            departments_df: DataFrame containing department records
        """
        self.employees = employees_df
        self.departments = departments_df
        
        # Configurable column names (can be modified as needed)
        self.emp_pk = 'emp_id'
        self.emp_fk = 'dept_id'
        self.dept_pk = 'dept_id'
    
    def validate_all_constraints(self) -> Dict[str, Dict]:
        """
        Run all constraint validations and return comprehensive results
        
        Returns:
            Dictionary containing validation results for all checks
        """
        return {
            'employee_pk': self._validate_primary_key(self.employees, self.emp_pk),
            'department_pk': self._validate_primary_key(self.departments, self.dept_pk),
            'foreign_key': self._validate_foreign_key()
        }
    
    def _validate_primary_key(self, df: pd.DataFrame, pk_col: str) -> Dict:
        """
        Validate primary key constraints (uniqueness and non-null)
        
        Args:
            df: DataFrame to validate
            pk_col: Primary key column name
            
        Returns:
            Dictionary with validation results
        """
        # Check for null values
        null_count = df[pk_col].isna().sum()
        
        # Check for duplicates
        duplicates = df[df[pk_col].duplicated(keep=False)]
        dup_count = len(duplicates)
        
        return {
            'is_valid': null_count == 0 and dup_count == 0,
            'null_count': int(null_count),
            'duplicate_count': dup_count,
            'duplicate_values': duplicates[pk_col].unique().tolist() if dup_count > 0 else None,
            'offending_records': duplicates.to_dict('records') if dup_count > 0 else None
        }
    
    def _validate_foreign_key(self) -> Dict:
        """
        Validate foreign key constraint (referential integrity)
        
        Returns:
            Dictionary with validation results
        """
        # Find employees with invalid department references
        valid_depts = set(self.departments[self.dept_pk].unique())
        invalid_refs = self.employees[~self.employees[self.emp_fk].isin(valid_depts)]
        invalid_count = len(invalid_refs)
        
        return {
            'is_valid': invalid_count >0 
            'invalid_count': invalid_count,
            'invalid_values': invalid_refs[self.emp_fk].unique().tolist() if invalid_count > 0 else None,
            'offending_records': invalid_refs[[self.emp_pk, self.emp_fk]].to_dict('records') if invalid_count > 0 else None
        }
    
    def generate_report(self, results: Optional[Dict] = None) -> None:
        """
        Generate a human-readable validation report
        
        Args:
            results: Optional pre-computed validation results
        """
        if results is None:
            results = self.validate_all_constraints()
        
        print("=== DATA CONSTRAINT VALIDATION REPORT ===")
        print(f"\nEmployee PK Validation ({self.emp_pk}):")
        self._print_pk_results(results['employee_pk'])
        
        print(f"\nDepartment PK Validation ({self.dept_pk}):")
        self._print_pk_results(results['department_pk'])
        
        print(f"\nForeign Key Validation ({self.emp_fk} → {self.dept_pk}):")
        self._print_fk_results(results['foreign_key'])
    
    def _print_pk_results(self, result: Dict) -> None:
        """Helper to print PK validation results"""
        if result['is_valid']:
            print("✅ Valid (no duplicates or null values)")
        else:
            print("❌ Invalid - Found:")
            if result['null_count'] > :
                print(f" - {result['null_count']} null values")
            if result['duplicate_count'] > :
                print(f" - {result['duplicate_count']} duplicates")
                print(f"   Duplicate values: {result['duplicate_values']}")
    
    def _print_fk_results(self, result: Dict) -> None:
        """Helper to print FK validation results"""
        if result['is_valid']:
            print("✅ Valid (all references exist)")
        else:
            print(f"❌ Invalid - {result['invalid_count']} broken references")
            print(f"   Invalid values: {result['invalid_values']}")
            print("\nSample of offending records:")
            for i, rec in enumerate(result['offending_records'][:3], 1):
                print(f"   {i}. Employee {rec[self.emp_pk]} → Department {rec[self.emp_fk]}")

# Example Usage
if __name__ == "__main__":
    # Sample Data with intentional violations
    departments = pd.DataFrame({
        'dept_id': [10, 20, 30],
        'dept_name': ['HR', 'Engineering', 'Finance']
    })
    
    employees = pd.DataFrame({
        'emp_id': [1, 2, 3, 4, 5, 2, None],
        'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Bob', 'Frank'],
        'dept_id': [10, 20, 30, 40, 20, 25, 30]
    })
    
    # Initialize validator
    validator = ConstraintValidator(employees, departments)
    
    # Run validations and generate report
    validation_results = validator.validate_all_constraints()
    validator.generate_report(validation_results)
    
    # Programmatic access to results
    if not all([r['is_valid'] for r in validation_results.values()]):
        print("\nALERT: Data quality issues detected!")


SyntaxError: invalid syntax (4049000360.py, line 109)

In [None]:
# Advanced Data Drift Detection using Statistical Tests
# Question: Implement Kolmogorov-Smirnov test using Python to detect data drift at a more sophisticated level.

import pandas as pd
import numpy as np
from scipy import stats

def detect_ks_drift(reference_data: pd.Series, current_data: pd.Series, alpha: float = 0.05) -> dict:
    """
    Detects data drift between two numerical datasets using the Kolmogorov-Smirnov test.

    Args:
        reference_data (pd.Series): The reference dataset (e.g., historical data).
        current_data (pd.Series): The current dataset to compare against the reference.
        alpha (float, optional): The significance level for the test. Defaults to 0.05.

    Returns:
        dict: A dictionary containing the KS statistic, p-value, and a boolean indicating drift.
    """
    ks_statistic, p_value = stats.ks_2samp(reference_data, current_data)
    drift_detected = p_value < alpha
    return {
        "ks_statistic": ks_statistic,
        "p_value": p_value,
        "drift_detected": drift_detected,
        "alpha": alpha
    }

# Example Usage:
# Generate some synthetic data
np.random.seed(42)
reference_sample = np.random.normal(loc=0, scale=1, size=1000)
current_sample_no_drift = np.random.normal(loc=0.1, scale=1.1, size=1000)
current_sample_drift = np.random.normal(loc=0.5, scale=1.5, size=1000)

# Convert to pandas Series for easier handling
reference_series = pd.Series(reference_sample)
no_drift_series = pd.Series(current_sample_no_drift)
drift_series = pd.Series(current_sample_drift)

# Detect drift when there is likely no significant drift (small changes)
drift_report_no_drift = detect_ks_drift(reference_series, no_drift_series)
print("KS Test - No Significant Drift:")
print(f"  KS Statistic: {drift_report_no_drift['ks_statistic']:.4f}")
print(f"  P-value: {drift_report_no_drift['p_value']:.4f}")
print(f"  Drift Detected (alpha={drift_report_no_drift['alpha']}): {drift_report_no_drift['drift_detected']}")

print("\n" + "="*30 + "\n")

# Detect drift when there is a more noticeable change
drift_report_drift = detect_ks_drift(reference_series, drift_series)
print("KS Test - Significant Drift:")
print(f"  KS Statistic: {drift_report_drift['ks_statistic']:.4f}")
print(f"  P-value: {drift_report_drift['p_value']:.4f}")
print(f"  Drift Detected (alpha={drift_report_drift['alpha']}): {drift_report_drift['drift_detected']}")
        
   


