# In [None]:
# Ques 1.ipynb - Data Quality Framework (ISO 8000-based)
# (shared by Kavya Shree, 1:29 pm, 07/05/2025)

import pandas as pd
import numpy as np
from datetime import datetime

# Load dataset (replace with your file path or dataset)
df = pd.read_csv("data.csv")  # Replace with your file path or method

# Every quality dimension is stored here as a Series indexed by column name
quality_report = {}

# 1. Completeness: share of non-null cells in each column, as a percentage
completeness = 100 * df.notna().mean()
quality_report['Completeness (%)'] = completeness.round(2)

# 2. Uniqueness: share of rows that do not repeat an earlier row.
# NOTE: despite its name, `unique_rows` holds the number of duplicated rows.
unique_rows = df.duplicated().sum()
duplicate_fraction = unique_rows / len(df)
uniqueness_percent = 100 * (1 - duplicate_fraction)

# The figure describes the whole dataset, so the same value is repeated for
# every column to line up with the per-column metrics.
uniqueness_rounded = uniqueness_percent.round(2)
quality_report['Uniqueness (%)'] = pd.Series(
    [uniqueness_rounded for _ in df.columns],
    index=df.columns,
)

# 3. Validity (% of non-null values whose type matches the first non-null entry)
validity = []
for col in df.columns:
    # tolist() converts the column to plain Python scalars (or pandas scalars
    # such as Timestamp). Taking both the reference type and the compared
    # values from this one list keeps them in the same representation; mixing
    # a NumPy scalar from .iloc[0] with the Python scalars that Series.apply
    # hands to its callback made every int/float/bool column score 0 %.
    values = df[col].dropna().tolist()
    if values:
        expected_type = type(values[0])
        valid_count = sum(isinstance(value, expected_type) for value in values)
        validity_percent = 100 * valid_count / len(values)
    else:
        # A column without any non-null value has nothing to validate.
        validity_percent = 0
    validity.append(round(validity_percent, 2))
quality_report['Validity (%)'] = pd.Series(validity, index=df.columns)

# 4. Consistency (based on categorical columns having <50% unique values)
def check_consistency(col, frame=None):
    """Score the consistency of one column as 100 (pass) or 0 (fail).

    Text-like columns (object, pandas string, or categorical dtype) pass when
    fewer than half of their non-null values are distinct. Columns of any
    other dtype always pass.

    Args:
        col: Label of the column to score.
        frame: DataFrame holding the column. When omitted, the module-level
            ``df`` is used, so existing ``check_consistency(col)`` calls keep
            working.

    Returns:
        100 if the column passes the heuristic, otherwise 0. A text-like
        column with no non-null values returns 0.
    """
    series = (df if frame is None else frame)[col]
    dtype = series.dtype

    # Pandas string dtypes are checked as well as object dtype, so text
    # columns are not skipped when they are not stored as object.
    is_text_like = (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )
    if not is_text_like:
        return 100

    non_null_count = series.count()
    if non_null_count == 0:
        # The ratio is undefined here; return 0 explicitly instead of
        # dividing 0 by 0, which produced NaN and a RuntimeWarning.
        return 0

    unique_ratio = series.nunique() / non_null_count
    return 100 if unique_ratio < 0.5 else 0
# Score every column, keyed by column label, and store the scores as a Series
consistency = dict(zip(df.columns, map(check_consistency, df.columns)))
quality_report['Consistency (%)'] = pd.Series(consistency)

# 5. Timeliness (% of parseable 'date' values that fall within the last year)
invalid_dates_message = "Invalid date format"
if 'date' in df.columns:
    try:
        # errors='coerce' turns unparseable values into NaT instead of raising.
        # The parsed column replaces the original one in df.
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        recent_cutoff = datetime.now() - pd.DateOffset(years=1)
        recent_count = (df['date'] >= recent_cutoff).sum()
    except (TypeError, ValueError, OverflowError):
        # For example, timezone-aware dates cannot be compared with the naive
        # cutoff. Only these named errors are converted to the message;
        # anything else propagates.
        timeliness = invalid_dates_message
    else:
        dated_count = df['date'].notnull().sum()
        if dated_count > 0:
            timely_percent = 100 * recent_count / dated_count
            timeliness = round(timely_percent, 2)
        else:
            # Nothing could be parsed as a date. Because of the coercion above
            # this never raises, so it has to be detected here; otherwise the
            # division below would yield NaN.
            timeliness = invalid_dates_message
else:
    timeliness = "No date column"

# The result describes the whole dataset, so it is repeated for every column.
quality_report['Timeliness (%)'] = pd.Series([timeliness] * len(df.columns), index=df.columns)

# Assemble the collected Series into one table: each key of quality_report
# becomes a column of the report
dq_report = pd.DataFrame(data=quality_report)

# Display the report
print("=== Data Quality Report (ISO 8000 Framework) ===", dq_report, sep="\n")

# Optional: Save to CSV
dq_report.to_csv(path_or_buf="data_quality_framework_report.csv")
# Ques_1.ipynb — Measuring Data Accuracy Using Trusted Source
# (shared by Kavya Shree, 1:40 pm, 07/05/2025)

import pandas as pd

# Read both price lists: the company's own data and the trusted reference data
company_df = pd.read_csv("company_prices.csv")
trusted_df = pd.read_csv("trusted_prices.csv")

# Preview the first rows of each input (optional)
company_preview = company_df.head()
trusted_preview = trusted_df.head()
print("Company Data Sample:\n", company_preview)
print("\nTrusted Data Sample:\n", trusted_preview)

# Pair the two sources on product_id; columns present in both inputs are
# told apart by the _company / _trusted suffixes
merged_df = company_df.merge(trusted_df, on="product_id", suffixes=('_company', '_trusted'))

# Flag, row by row, whether the two prices are exactly equal
merged_df["price_match"] = merged_df["price_company"].eq(merged_df["price_trusted"])

# Accuracy: share of compared rows whose prices agree, as a percentage
total_products = merged_df.shape[0]
matching_prices = merged_df["price_match"].sum()
accuracy_percent = 100 * (matching_prices / total_products)

# Rows where the company price differs from the trusted price
mismatches = merged_df.loc[~merged_df["price_match"]]

# Output results
for summary_line in (
    f"\nTotal Products Compared: {total_products}",
    f"Matching Prices: {matching_prices}",
    f"Accuracy: {accuracy_percent:.2f}%",
):
    print(summary_line)

# Show mismatched records (if any)
print("\nMismatched Records:\n", mismatches)

# Optional: Save results (row index is left out of both files)
for report_frame, report_path in (
    (merged_df, "price_accuracy_report.csv"),
    (mismatches, "price_mismatches.csv"),
):
    report_frame.to_csv(report_path, index=False)