In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read CSV file (replace with your actual file path if needed)
df = pd.read_csv('data.csv')  # Ensure 'data.csv' is in the same folder or give full path

# Functions to calculate Data Quality Metrics
def calculate_completeness(column):
    return column.notnull().sum() / len(column)

def calculate_uniqueness(column):
    return column.nunique() / len(column)

def calculate_consistency(column):
    if column.dtype == 'object':
        return column.str.strip().str.lower().value_counts(normalize=True).max()
    return np.nan  # Consistency mostly applies to categorical/object data

def calculate_validity(column):
    # Placeholder: Return NaN unless you define specific rules
    return np.nan

# Calculate metrics and DQI
metrics = []
for col in df.columns:
    completeness = calculate_completeness(df[col])
    uniqueness = calculate_uniqueness(df[col])
    consistency = calculate_consistency(df[col])
    validity = calculate_validity(df[col])

    # DQI as average of available metrics (ignores NaNs)
    available = [v for v in [completeness, uniqueness, consistency, validity] if not pd.isna(v)]
    dqi = sum(available) / len(available) if available else np.nan

    metrics.append({
        'Column': col,
        'Completeness': completeness,
        'Uniqueness': uniqueness,
        'Consistency': consistency,
        'Validity': validity,
        'DQI': dqi
    })

# Create DataFrame for metrics
dqi_df = pd.DataFrame(metrics)

# Display DQI table
print("Data Quality Metrics:\n", dqi_df)

# Visualization of DQI
sns.set(style="whitegrid")
plt.figure(figsize=(12, 6))
sns.barplot(x='Column', y='DQI', data=dqi_df, palette='viridis')
plt.title('Data Quality Index (DQI) per Column')
plt.ylabel('DQI Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'