# In [None]:
# QUes_1.ipynb - Data Quality Scoring & Reporting

# Step 1: Import required libraries
import pandas as pd
import matplotlib.pyplot as plt

# Step 2: Load CSV Data
# Point `file_path` at the CSV to be scored, e.g. 'data/sample.csv'.
file_path = "your_file.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading rows of the loaded table
df.head(n=5)
# Step 3: Calculate Data Quality Metrics
#
# Each metric below is a per-column score between 0 and 1, indexed by the
# column names of `df`. They are averaged into a per-column DQI, and the
# per-column DQIs are averaged into one overall score.

# 1. Completeness: proportion of non-null values.
# With zero rows the mean is undefined (NaN). Score that as 0.0; otherwise the
# NaN is skipped when averaging and an empty dataset ends up with a perfect DQI.
completeness = df.notnull().mean().fillna(0.0)

# 2. Uniqueness: distinct values as a share of the non-null values.
# nunique() already ignores nulls, so dividing by len(df) would penalise
# missing data a second time (completeness has already accounted for it).
non_null_counts = df.count()
uniqueness = (
    df.nunique()
    # Mask zero counts so that 0/0 becomes NaN instead of a division by zero...
    .div(non_null_counts.where(non_null_counts > 0))
    # ...and score columns that have no values at all as 0.0.
    .fillna(0.0)
)

# 3. Consistency: domain-specific, so no real check is implemented here.
# Placeholder that treats every column as fully consistent (1.0). Note that it
# raises every column's DQI until a real rule replaces it.
consistency = pd.Series(1.0, index=df.columns)

# Combine into a DataFrame (one row per source column, one column per metric)
dqi_components = pd.DataFrame({
    'Completeness': completeness,
    'Uniqueness': uniqueness,
    'Consistency': consistency
})

# Calculate DQI per column (unweighted average of the three metrics)
dqi_components['DQI'] = dqi_components.mean(axis=1)

# Overall DQI score (average of the per-column DQIs)
overall_dqi = dqi_components['DQI'].mean()

print(f"Overall Data Quality Index (DQI): {overall_dqi:.2f}")
dqi_components
# Step 4: Visualize DQI Components

# Grouped bar chart: one group per source column, one bar per plotted metric.
# Consistency is left out of the chart.
plotted_metrics = ['Completeness', 'Uniqueness', 'DQI']
ax = dqi_components[plotted_metrics].plot.bar(figsize=(12, 6))

# Titles and axis labels
ax.set_title('Data Quality Metrics per Column')
ax.set_ylabel('Score (0 to 1)')
ax.set_xlabel('Columns')

# All scores are ratios, so pin the y-axis to the 0..1 range
ax.set_ylim(0, 1)

# Slant the column names
plt.xticks(rotation=45, ha='right')

# Horizontal guide lines only
ax.grid(axis='y')

plt.tight_layout()
plt.show()