In [18]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import json
import os
from datetime import datetime
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder

# Configure logging
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it, basicConfig() silently does nothing when logging was
# configured earlier (for example when this cell is re-run in a notebook), so
# messages would go to the old handler instead of the log file below.
logging.basicConfig(
    filename='advanced_data_quality.log',
    level=logging.INFO,
    format='%(asctime)s:%(levelname)s:%(message)s',
    force=True,
)

# Load the dataset
# The path can be overridden with the DATASET_PATH environment variable so the
# script can run against real data without editing the source. The default
# keeps the original placeholder file name.
dataset_path = os.environ.get('DATASET_PATH', 'your_dataset.csv')
try:
    df = pd.read_csv(dataset_path)
except Exception as e:
    # Top-level boundary: record the failure in the log, then re-raise so the
    # run stops here instead of continuing without data.
    logging.error(f"Error loading dataset: {e}")
    raise
else:
    logging.info("Dataset loaded successfully.")

# Data Profiling
def data_profiling(data):
    """Build a JSON-serializable profile of a DataFrame.

    Args:
        data: The pandas DataFrame to profile.

    Returns:
        A dict with the keys ``shape``, ``columns``, ``dtypes``,
        ``missing_values``, ``duplicate_records`` and ``summary_statistics``.
        Missing statistics (NaN/NaT) are reported as ``None`` so the result
        serializes to standards-compliant JSON (``null`` instead of ``NaN``).
    """
    def _json_safe(value):
        # Recursively convert pandas/NumPy scalars into plain Python values.
        if isinstance(value, dict):
            return {key: _json_safe(item) for key, item in value.items()}
        if isinstance(value, np.generic):
            value = value.item()
        # describe(include='all') fills non-applicable statistics with NaN.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        # Timestamps/timedeltas (e.g. min/max of a datetime column) are not
        # serializable by json, so store their ISO 8601 text form instead.
        if isinstance(value, (datetime, pd.Timedelta)):
            return value.isoformat()
        return value

    # describe() raises ValueError on a frame that has no columns at all.
    if data.shape[1] == 0:
        summary = {}
    else:
        summary = _json_safe(data.describe(include='all').to_dict())

    profile = {
        'shape': data.shape,
        'columns': data.columns.tolist(),
        'dtypes': {column: dtype.name for column, dtype in data.dtypes.items()},
        'missing_values': _json_safe(data.isnull().sum().to_dict()),
        'duplicate_records': int(data.duplicated().sum()),
        'summary_statistics': summary,
    }
    return profile

# Profile the raw dataset (before any cleaning) and save the result as JSON.
# profiling_report is kept because the combined quality report reuses it.
profiling_report = data_profiling(df)
with open('data_profiling_report.json', 'w') as f:
    json.dump(profiling_report, f, indent=4)
logging.info("Data profiling completed and report saved.")

# Data Validation Rules
def validate_data(data):
    """Count rule violations in a DataFrame.

    Args:
        data: The pandas DataFrame to check.

    Returns:
        A dict with:
          * ``missing_values``: per-column null counts, only for columns that
            have at least one null.
          * ``duplicate_records``: number of fully duplicated rows.
          * ``out_of_range_age``: rows whose ``age`` is below 0 or above 120
            (only present when an ``age`` column exists).
          * ``invalid_genders``: rows whose ``gender`` is not ``'Male'`` or
            ``'Female'`` (only present when a ``gender`` column exists).
    """
    null_counts = data.isnull().sum()
    report = {
        # Rule 1: columns that contain missing values
        'missing_values': null_counts[null_counts > 0].to_dict(),
        # Rule 2: fully duplicated rows
        'duplicate_records': int(data.duplicated().sum()),
    }

    # Rule 3: age must lie within [0, 120]
    if 'age' in data.columns:
        ages = data['age']
        age_violations = (ages < 0) | (ages > 120)
        report['out_of_range_age'] = int(age_violations.sum())

    # Rule 4: gender must be one of the accepted labels
    if 'gender' in data.columns:
        unexpected = ~data['gender'].isin(['Male', 'Female'])
        report['invalid_genders'] = int(unexpected.sum())

    return report

# Validate the raw dataset (before cleaning) and save the issue counts as JSON.
# validation_issues is kept because the combined quality report reuses it.
validation_issues = validate_data(df)
with open('data_validation_issues.json', 'w') as f:
    json.dump(validation_issues, f, indent=4)
logging.info("Data validation completed and issues report saved.")

# Data Cleaning
def clean_data(data):
    """Return a cleaned version of ``data``.

    Steps, in order: drop duplicate rows, drop rows with any missing value,
    keep only rows with ``age`` in [0, 120] (when the column exists), and keep
    only rows whose ``gender`` is ``'Male'`` or ``'Female'`` (when the column
    exists).

    Args:
        data: The pandas DataFrame to clean.

    Returns:
        The filtered DataFrame; the original row index labels are preserved.
    """
    # Duplicates first, then any row that still has a missing value.
    cleaned = data.drop_duplicates().dropna()

    if 'age' in cleaned.columns:
        ages = cleaned['age']
        plausible_age = (ages >= 0) & (ages <= 120)
        cleaned = cleaned.loc[plausible_age]

    if 'gender' in cleaned.columns:
        accepted_gender = cleaned['gender'].isin(['Male', 'Female'])
        cleaned = cleaned.loc[accepted_gender]

    return cleaned

# Clean the raw data. clean_data() returns a new frame, so df itself stays as
# loaded; df_cleaned feeds the anomaly detection and the plots.
df_cleaned = clean_data(df)
logging.info("Data cleaning completed.")

# Anomaly Detection using Isolation Forest
def detect_anomalies(data, features, contamination=0.01, random_state=42):
    """Flag anomalous rows with an Isolation Forest.

    The input frame is left untouched: encoding and scoring happen on a copy.

    Args:
        data: The pandas DataFrame to score.
        features: List of column names used as model inputs.
        contamination: Expected share of anomalies, passed to IsolationForest.
        random_state: Seed passed to IsolationForest for reproducible results.

    Returns:
        A DataFrame holding the rows predicted as anomalies, with an extra
        ``anomaly`` column (value -1). Non-numeric feature columns appear
        label-encoded. If ``data`` has no rows, an empty frame with the
        ``anomaly`` column is returned.

    Raises:
        KeyError: If any requested feature is not a column of ``data``.
    """
    missing = [col for col in features if col not in data.columns]
    if missing:
        raise KeyError(f"Features not found in data: {missing}")

    # Work on a copy so the caller's frame is neither re-encoded nor given an
    # extra column as a side effect.
    scored = data.copy()

    # IsolationForest cannot be fitted on zero samples; nothing to flag.
    if len(scored) == 0:
        scored['anomaly'] = np.empty(0, dtype=int)
        return scored

    # Encode every non-numeric feature (object, string, category, ...) as
    # integer labels, since the model only accepts numeric input.
    for col in features:
        if not pd.api.types.is_numeric_dtype(scored[col]):
            scored[col] = LabelEncoder().fit_transform(scored[col])

    iso_forest = IsolationForest(contamination=contamination, random_state=random_state)
    # fit_predict labels inliers as 1 and anomalies as -1.
    scored['anomaly'] = iso_forest.fit_predict(scored[features])
    return scored[scored['anomaly'] == -1]

# Specify features for anomaly detection
requested_features = ['age']  # Add more numerical features as needed
# Keep only the features the cleaned data actually contains, matching the
# "if column in data.columns" guards used by validation, cleaning and plotting.
anomaly_features = [col for col in requested_features if col in df_cleaned.columns]
if anomaly_features and len(df_cleaned) > 0:
    anomalies = detect_anomalies(df_cleaned.copy(), anomaly_features)
else:
    # The model cannot be fitted without features or rows; continue with an
    # empty result so the remaining reports are still produced.
    logging.warning("Anomaly detection skipped: no usable features or no rows after cleaning.")
    anomalies = df_cleaned.iloc[0:0].assign(anomaly=pd.Series(dtype='int64'))
anomalies.to_csv('anomalies.csv', index=False)
logging.info(f"Anomaly detection completed. {anomalies.shape[0]} anomalies found and saved.")

# Generate Data Quality Report
def generate_report(profiling, validation, anomalies, output_path='data_quality_report.json'):
    """Write the combined data quality report as JSON.

    Args:
        profiling: JSON-serializable profiling result.
        validation: JSON-serializable validation issue counts.
        anomalies: DataFrame of detected anomalies; only its row count is
            stored in the report.
        output_path: Destination file. Defaults to the original fixed name
            ``data_quality_report.json``.

    Returns:
        None. The report is written to ``output_path``.
    """
    report = {
        'profiling': profiling,
        'validation_issues': validation,
        # Plain int so the value serializes whatever the shape type is.
        'anomalies_detected': int(anomalies.shape[0]),
        'report_generated_at': datetime.now().isoformat()
    }
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=4)
    logging.info("Data quality report generated and saved.")

# Combine the profiling result, validation issues and anomaly count into
# data_quality_report.json.
generate_report(profiling_report, validation_issues, anomalies)

# Visualizations
def visualize_data(data):
    """Save distribution plots for the ``age`` and ``gender`` columns.

    Writes ``age_distribution.png`` (histogram with KDE) when ``data`` has an
    ``age`` column and ``gender_count.png`` (count plot) when it has a
    ``gender`` column. Columns that are absent are skipped.

    Args:
        data: The pandas DataFrame to plot.

    Returns:
        None.
    """
    def _label_and_save(title, x_label, y_label, out_file, log_message):
        # Decorate the current figure, write it to disk, and release it.
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.tight_layout()
        plt.savefig(out_file)
        plt.close()
        logging.info(log_message)

    available = data.columns

    # Histogram of age
    if 'age' in available:
        plt.figure(figsize=(10, 6))
        sns.histplot(data['age'], bins=30, kde=True)
        _label_and_save(
            'Age Distribution', 'Age', 'Frequency',
            'age_distribution.png', "Age distribution plot saved.",
        )

    # Bar plot of gender
    if 'gender' in available:
        plt.figure(figsize=(6, 4))
        sns.countplot(x='gender', data=data)
        _label_and_save(
            'Gender Count', 'Gender', 'Count',
            'gender_count.png', "Gender count plot saved.",
        )

# Plot the cleaned data rather than the raw df, so rows removed by
# clean_data() do not appear in the charts.
visualize_data(df_cleaned)

ERROR:root:Error loading dataset: [Errno 2] No such file or directory: 'your_dataset.csv'


FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'