In [None]:
# Question: Advanced Data Profiling and Outlier Detection
# Description: Perform detailed data profiling including outlier detection for numeric columns.

# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore, iqr

# Read the raw data into a DataFrame
df = pd.read_csv(
    'your_dataset.csv',  # Replace with actual path
)

# Restrict the analysis to numeric columns; the steps below operate on these
numeric_df = df.select_dtypes(include=[np.number])

# --- Data Profiling ---
# describe() is transposed so each numeric column becomes one row of statistics
print("📊 Descriptive Statistics:\n")
display(numeric_df.describe().transpose())

# --- Outlier Detection using Z-score ---
# A value is flagged when it lies more than 3 standard deviations away from
# the mean of its column.
print("\n🔍 Outlier Detection (Z-score method):")
# nan_policy='omit' makes each column's mean/std come from its non-missing
# values. With the default ('propagate'), a single NaN turns every z-score in
# that column into NaN, and because `NaN > 3` is False the column would
# silently report zero outliers.
z_scores = np.abs(zscore(numeric_df, nan_policy='omit'))
# NaN z-scores (missing cells, zero-variance columns) compare False here, so
# they are never counted as outliers.
z_outliers = (z_scores > 3)
outlier_counts_z = pd.Series(np.sum(z_outliers, axis=0), index=numeric_df.columns)
# Only show columns that contain at least one flagged value
display(outlier_counts_z[outlier_counts_z > 0])

# --- Outlier Detection using IQR ---
# A value is flagged when it falls below Q1 - 1.5*IQR or above Q3 + 1.5*IQR
# of its column (Tukey's fences).
print("\n🔍 Outlier Detection (IQR method):")
outlier_counts_iqr = {}

for column in numeric_df.columns:
    values = numeric_df[column]
    # Series.quantile skips missing values. np.percentile would return NaN for
    # any column containing a NaN, making both bounds NaN and the outlier
    # count silently zero.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr_value = q3 - q1
    lower_bound = q1 - 1.5 * iqr_value
    upper_bound = q3 + 1.5 * iqr_value
    # Count directly from the boolean mask; missing cells compare False and
    # are therefore not counted as outliers.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outlier_counts_iqr[column] = int(is_outlier.sum())

# Explicit dtype keeps the result well-typed even when there are no numeric
# columns (empty dict).
display(pd.Series(outlier_counts_iqr, dtype="int64").sort_values(ascending=False))

# --- Visualization: Boxplots & Histograms ---
# One figure per numeric column: distribution on the left, boxplot on the right.
for column in numeric_df.columns:
    series = numeric_df[column]
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: histogram with a KDE overlay
    sns.histplot(series, kde=True, bins=30, color='skyblue', ax=hist_ax)
    hist_ax.set_title(f'Histogram of {column}')

    # Right panel: horizontal boxplot of the same values
    sns.boxplot(x=series, color='orange', ax=box_ax)
    box_ax.set_title(f'Boxplot of {column}')

    fig.tight_layout()
    plt.show()
