# In [None]:
# Ques 4 - Data Quality Assessment & Profiling

# Import necessary libraries
import pandas as pd
import numpy as np
from scipy import stats

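# ydata-profiling is needed for the profiling report in section 8.
# Optional: uncomment the next line to install it from within a notebook.
# !pip install ydata-profiling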
from ydata_profiling import ProfileReport

# Load the dataset
# Point dataset_path at your actual CSV file before running; the resulting
# DataFrame `df` is what the numbered sections below inspect.
dataset_path = "your_dataset.csv"
df = pd.read_csv(dataset_path)

# 1. Initial Exploration
# NOTE(review): display() is neither defined nor imported in this file;
# presumably the IPython/Jupyter builtin, so a notebook session is assumed.
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)
first_rows = df.head()
display(first_rows)

print("\nData Info:")
df.info()  # info() prints its own report, so it is not wrapped in print()

print("\nSummary Statistics:")
# include='all' asks describe() to summarise every column, not only numeric ones.
summary_table = df.describe(include='all')
display(summary_table)

# 2. Missing Values Check
# Build the boolean null mask once and derive both reports from it, instead
# of rescanning the whole DataFrame for each one.
missing_mask = df.isnull()
print("\nMissing Values (Total):")
print(missing_mask.sum())  # per-column count of missing cells
print("\nMissing Values (%):")
# The mean of a boolean column is the fraction of True values; x100 gives %.
print(missing_mask.mean() * 100)

# 3. Duplicate Records
# duplicated() has to compare every row, so compute the mask a single time
# and reuse it for the count, the condition, and the row listing.
# With pandas' default keep='first', the first occurrence of a repeated row
# is not flagged; only its later copies are.
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print("\nDuplicate Records Count:", duplicate_count)
if duplicate_count > 0:
    print("\nDuplicate Rows:")
    # NOTE(review): display() is not defined in this file; presumably the
    # IPython/Jupyter builtin.
    display(df[duplicate_mask])

# 4. Data Types
# Header and the per-column dtype listing are emitted by a single print call,
# separated by a newline.
print("\nData Types of Each Column:", df.dtypes, sep="\n")

# 5. Convert Data Types (example: dates)
# Uncomment and replace 'date_column' if applicable
# df['date_column'] = pd.to_datetime(df['date_column'], errors='coerce')

# 6. Inconsistencies in Categorical Columns
# Print the distinct values of every object-dtype column so that variants of
# the same category (spelling, casing, stray whitespace) can be spotted by eye.
print("\nUnique Values in Categorical Columns:")
object_columns = df.select_dtypes(include='object').columns
for column_name in object_columns:
    distinct_values = df[column_name].unique()
    print(f"{column_name}: {distinct_values}")

# 7. Outlier Detection Using Z-Score
# Only numeric columns take part. nan_policy='omit' keeps missing values out
# of each column's mean/std so they do not affect the other cells' scores.
numeric_df = df.select_dtypes(include=[np.number])
signed_z = stats.zscore(numeric_df, nan_policy='omit')
z_scores = np.abs(signed_z)

# A cell is flagged when its absolute z-score exceeds 3; a row is reported
# when at least one of its numeric cells is flagged.
outliers = z_scores > 3
flagged_rows = outliers.any(axis=1)

print("\nRows with Potential Outliers:")
# NOTE(review): display() is not defined in this file; presumably the
# IPython/Jupyter builtin.
display(df[flagged_rows])

# 8. Data Profiling Report
# Build a ProfileReport (imported from ydata_profiling at the top of the file)
# over the full DataFrame and write it to an HTML file.
print("\nGenerating Data Profiling Report...")
profile = ProfileReport(
    df,
    title="Data Profiling Report",
    explorative=True,
)
profile.to_file("data_profiling_report.html")
print("Report saved as 'data_profiling_report.html'.")

# End of Notebook