In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide display / plotting configuration.
pd.set_option('display.max_columns', None)  # never truncate columns when a frame is rendered
sns.set_style("whitegrid")

# Location of the input CSV. Set the KC_BL_DATASET_PATH environment variable to
# run the notebook on another machine; when it is unset (or empty) the original
# local download path is used, so existing behaviour is unchanged.
DATA_PATH = (
    os.environ.get("KC_BL_DATASET_PATH")
    or r"C:\Users\user\Downloads\KC_BL_DATASET.csv"
)

df = pd.read_csv(DATA_PATH)
df.head()


In [2]:
# Structural overview of the frame: dimensions, per-column dtype / non-null
# report, and summary statistics for every column.
print("Shape:", df.shape)

print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()

print("\nSummary Statistics:")
print(df.describe(include="all"))


In [3]:
# Data-quality check: missing values per column (count and percentage) and the
# number of fully duplicated rows.
missing_counts = df.isna().sum()
print("Missing Values:\n", missing_counts)

# Mean of the boolean null mask is the missing fraction; scale to percent.
missing_pct = df.isna().mean().mul(100).round(2)
print("\nMissing %:\n", missing_pct)

duplicate_count = df.duplicated().sum()
print("\nDuplicate Rows:", duplicate_count)


In [4]:
# Numerical distributions: DataFrame.hist draws one histogram per numeric
# column on a single figure; the suptitle is applied to that figure.
df.hist(bins=30, figsize=(12, 8))
plt.suptitle("Numerical Feature Distributions", fontsize=14)
plt.show()

# Categorical distributions: one count plot per object-dtype column.
object_columns = df.select_dtypes(include=["object"]).columns
for cat_name in object_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x=cat_name, palette="Set2", ax=ax)
    ax.set_title(f"Count Plot of {cat_name}")
    # `ax` is the current axes here, so this rotates its category labels.
    plt.xticks(rotation=45)
    plt.show()


In [None]:
# Pairwise correlation between the numeric columns (non-numeric columns are
# excluded via numeric_only=True), drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 6))
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Bivariate view: for every (object-dtype column, numeric column) pair, draw a
# box plot of the numeric values grouped by category -- one figure per pair.
categorical_cols = df.select_dtypes(include=["object"]).columns
# The numeric column list is the same on every iteration, so select it once
# instead of re-running select_dtypes for each categorical column.
numeric_cols = df.select_dtypes(include=np.number).columns

for col in categorical_cols:
    for num_col in numeric_cols:
        plt.figure(figsize=(6,4))
        sns.boxplot(x=col, y=num_col, data=df, palette="Set3")
        plt.title(f"{num_col} by {col}")
        plt.xticks(rotation=45)  # category labels can be long; tilt them
        plt.show()


In [None]:
# Univariate outlier check: one horizontal box plot per numeric column.
for num_name in df.select_dtypes(include=np.number).columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[num_name], color="orange", ax=ax)
    ax.set_title(f"Boxplot of {num_name}")
    plt.show()


In [None]:
# Closing summary. Items 1-3 are computed from `df`; items 4-7 are fixed
# reminder text pointing at the plots in the earlier cells, not values derived
# from the data.
n_rows, n_cols = df.shape
cols_with_missing = df.isnull().any().sum()  # number of columns with >= 1 null
duplicate_rows = df.duplicated().sum()

# Only the lines that interpolate a value are f-strings; the rest are plain
# literals (an f-prefix without placeholders is a no-op).
print(" Final EDA Insights")
print(f"1. Dataset contains {n_rows} rows and {n_cols} columns.")
print(f"2. Missing values detected in {cols_with_missing} columns.")
print(f"3. Duplicate rows: {duplicate_rows}")
print("4. Numerical features show varied distributions (see histograms).")
print("5. Some categorical features are imbalanced (count plots).")
print("6. Correlations highlight relationships between numeric variables.")
print("7. Outliers present in certain numerical columns (boxplots).")
