In [None]:
import pandas as pd
import numpy as np

def data_quality_check(file_path: str, report_file: str = "data_quality_report.txt") -> str:
    """Run basic data-quality checks on a CSV file and write a text report.

    Three checks are performed on the loaded table:

    1. Missing values -- per-column count of null cells (only columns that
       have at least one are listed).
    2. Duplicate rows -- number of rows flagged by ``DataFrame.duplicated()``.
    3. Outliers -- for every numeric column, the number of values lying
       outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``.

    Args:
        file_path: Path of the CSV file to analyse.
        report_file: Path the report text is written to (overwritten if it
            already exists).

    Returns:
        The report text. If the CSV cannot be loaded, no report file is
        written and a string of the form ``"Error loading file: ..."`` is
        returned instead of raising.
    """
    # Loading failures are reported through the return value rather than an
    # exception; callers rely on always getting a string back.
    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        return f"Error loading file: {e}"

    # Collect report lines and join once at the end; an empty string in the
    # list produces the blank line that separates sections.
    lines = ["--- Data Quality Summary Report ---", ""]

    # 1. Missing values
    missing_values = data.isnull().sum()
    missing_summary = missing_values[missing_values > 0]
    lines.append("1. Missing Values:")
    if missing_summary.empty:
        lines.append("   No missing values detected.")
    else:
        # One line per column, in the same layout as the outlier section,
        # instead of dumping the Series repr (which carries a dtype footer).
        lines.extend(
            f"   {col}: {count} missing value(s)"
            for col, count in missing_summary.items()
        )
    # Both branches end the section with a blank line so section 2 never
    # runs straight on from section 1.
    lines.append("")

    # 2. Duplicate rows: count the boolean mask directly rather than
    # materialising a copy of the duplicated rows.
    num_duplicates = int(data.duplicated().sum())
    lines.append("2. Duplicate Rows:")
    lines.append(f"   Number of duplicate rows: {num_duplicates}")
    lines.append("")

    # 3. Outliers (IQR method) on numeric columns only.
    outlier_summary = {}
    for col in data.select_dtypes(include=np.number).columns:
        values = data[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # NaN compares False on both sides, so missing cells are never
        # counted as outliers.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        outlier_summary[col] = int(is_outlier.sum())

    lines.append("3. Outliers Detected:")
    lines.extend(
        f"   {col}: {count} outlier(s)" for col, count in outlier_summary.items()
    )
    if not outlier_summary:
        lines.append("   No numerical columns found for outlier detection.")

    report = "\n".join(lines) + "\n"

    # Save the report to a file
    with open(report_file, "w", encoding="utf-8") as file:
        file.write(report)

    print(f"Data Quality Check complete. Summary saved to '{report_file}'.")
    return report

# Example usage
# Script entry point: analyse the sample dataset, then echo the report text
# that data_quality_check() returned (it has already been saved to the
# default report file by that call).
if __name__ == "__main__":
    file_path = "wine.csv"  # Path of the CSV to analyse; replace with your dataset
    report = data_quality_check(file_path)
    print(report)


Data Quality Check complete. Summary saved to 'data_quality_report.txt'.
--- Data Quality Summary Report ---

1. Missing Values:
   No missing values detected.
2. Duplicate Rows:
   Number of duplicate rows: 1177

3. Outliers Detected:
   fixed acidity: 357 outlier(s)
   volatile acidity: 377 outlier(s)
   citric acid: 509 outlier(s)
   residual sugar: 118 outlier(s)
   chlorides: 286 outlier(s)
   free sulfur dioxide: 62 outlier(s)
   total sulfur dioxide: 10 outlier(s)
   density: 3 outlier(s)
   pH: 73 outlier(s)
   sulphates: 191 outlier(s)
   alcohol: 3 outlier(s)
   quality: 228 outlier(s)

