# In [None]:
# Ques_1.ipynb
# --------------------------------------
# Data Quality Assessment & Profiling
# --------------------------------------

# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Optional: For full profiling report
# !pip install ydata-profiling
from ydata_profiling import ProfileReport

# Step 2: Load the Dataset
# 'your_dataset.csv' is a placeholder; point it at the real CSV before running.
df = pd.read_csv('your_dataset.csv')

# Step 3: Initial Data Exploration
# Each report is a header line followed by the pandas object on the next line.
print("First 5 rows of the dataset:", df.head(), sep="\n")

print("\nDataset shape (rows, columns):", df.shape)

print("\nData types of each column:", df.dtypes, sep="\n")

# include='all' summarizes non-numeric columns alongside the numeric ones.
print("\nSummary statistics:", df.describe(include='all'), sep="\n")

# Step 4: Missing Values Check
# Build the boolean "is missing" mask once; the counts, the percentages and
# the heatmap below are all derived from it.
missing_mask = df.isnull()

print("\nMissing values per column:")
print(missing_mask.sum())

# The mean of a boolean column is the fraction of True values.
print("\nPercentage of missing values per column:")
print(missing_mask.mean() * 100)

# Visualize missing values: one heatmap row per record, one column per field.
plt.figure(figsize=(10, 6))
sns.heatmap(missing_mask, cbar=False, cmap="viridis")
plt.title("Heatmap of Missing Values")
plt.show()

# Step 5: Duplicate Data
# duplicated() flags every repeat of an earlier row (the first occurrence is
# not flagged), so summing the mask counts the rows that would be dropped.
duplicate_mask = df.duplicated()
num_duplicates = duplicate_mask.sum()
print(f"\nNumber of duplicate rows: {num_duplicates}")

# Removing duplicates: rebind df to a de-duplicated copy, keeping first occurrences.
df = df.drop_duplicates()
print("Duplicates removed.")

# Step 6: Categorical Consistency Check
# Resolve the object-dtype columns once; both loops below work on the same set.
categorical_cols = df.select_dtypes(include='object').columns

print("\nUnique values in categorical columns:")
for col in categorical_cols:
    print(f"{col}: {df[col].unique()}")

# Optional: Standardizing categorical values
# An object column is not guaranteed to hold only strings. The `.str` accessor
# replaces every non-string element with NaN (and raises AttributeError when
# the whole column is inferred as a non-string type), which would silently
# destroy data during a step that is only meant to tidy text. Normalize
# element-wise instead: lower-case and strip real strings, and pass every
# other value (numbers, missing values, other objects) through unchanged.
for col in categorical_cols:
    df[col] = df[col].map(
        lambda value: value.lower().strip() if isinstance(value, str) else value
    ).astype(object)  # keep the column object-typed regardless of its contents

# Step 7: Outlier Detection (Numerical Columns)
# For every numeric column: draw a boxplot, then count the rows that fall
# outside the 1.5 * IQR fences around the first and third quartiles.
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

for col in numerical_cols:
    values = df[col]

    # Visual check.
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=values)
    plt.title(f'Boxplot for {col}')
    plt.show()

    # Numeric check: quartiles, interquartile range and the two fences.
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Comparisons against NaN are False, so missing values are never counted.
    too_low = values < lower_bound
    too_high = values > upper_bound
    outliers = df[too_low | too_high]

    print(f"Column '{col}' has {len(outliers)} potential outliers.")

# Step 8: Generate Data Profiling Report (Optional)
# Build the report from the cleaned DataFrame and write it as a standalone
# HTML file in the current working directory.
report_file = "data_profiling_report.html"
profile = ProfileReport(
    df,
    title="Data Quality Profiling Report",
    explorative=True,
)
profile.to_file(report_file)

print("\nData profiling report generated: data_profiling_report.html")