In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# =========================
# 1) Load Dataset
# =========================
# Path of the input CSV. Change the file name here,
# e.g. "train.csv" (house prices) or "KaggleV2-May-2016.csv".
file_path = "dataset.csv"

# Raw data exactly as read from disk, using pandas' default CSV parsing.
df = pd.read_csv(file_path)

# Quick sanity check: confirm the load, then show the dimensions and first rows.
print("✅ Dataset Loaded Successfully!")
print("Shape:", df.shape)
display(df.head())

# =========================
# 2) Identify Missing Values
# =========================
# Number of null cells in each column of `df`.
missing_count = df.isnull().sum()
# The same figure expressed as a percentage of the total row count.
missing_percent = (missing_count / len(df)) * 100

# One row per column of `df`, ordered from most to fewest missing values.
missing_table = pd.DataFrame(
    {
        "Missing Count": missing_count,
        "Missing %": missing_percent,
    }
).sort_values(by="Missing Count", ascending=False)

# Only columns that actually contain missing values are shown in the summary.
print("\n📌 Missing Values Summary:")
display(missing_table[missing_table["Missing Count"] > 0])

# =========================
# 3) Visualize Missing Values (Bar Chart)
# =========================
# Restrict the chart to columns that have at least one missing value.
missing_table_filtered = missing_table[missing_table["Missing Count"] > 0]

if missing_table_filtered.empty:
    # A complete dataset would otherwise produce an empty, misleading set of axes.
    print("No missing values found - skipping the bar chart.")
else:
    # Explicit figure/axes objects instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.bar(missing_table_filtered.index, missing_table_filtered["Missing Count"])
    # Vertical tick labels so neighbouring column names do not overlap.
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_title("Missing Values Count per Column")
    ax.set_xlabel("Columns")
    ax.set_ylabel("Missing Count")
    plt.show()

# =========================
# 4) Drop Columns with Extremely High Missing Values
# =========================
# Cut-off in percent: columns whose "Missing %" is strictly greater than this
# value are removed. Change it to drop more or fewer columns.
threshold = 50
cols_to_drop = missing_table[missing_table["Missing %"] > threshold].index.tolist()

# The message is built from `threshold` so it always matches the cut-off in use.
print("\n🗑 Dropping columns with > {}% missing values:".format(threshold))
print(cols_to_drop)

# `drop` returns a new frame; the raw `df` is left untouched.
df_clean = df.drop(columns=cols_to_drop)

print("\nShape after dropping high-missing columns:", df_clean.shape)

# =========================
# 5) Separate Numerical & Categorical Columns
# =========================
# Split on "numeric vs. everything else" instead of exact dtype names.
# Listing only int64/float64/object would silently skip other numeric widths
# (int32, float32, nullable Int64, ...) as well as category, bool and dedicated
# string dtypes, leaving their missing values un-imputed.
num_cols = df_clean.select_dtypes(include="number").columns
# Every non-numeric column (text, category, bool, datetime, ...) is treated as
# categorical by the imputation step.
cat_cols = df_clean.select_dtypes(exclude="number").columns

print("\nNumerical Columns:", len(num_cols))
print("Categorical Columns:", len(cat_cols))

# =========================
# 6) Fill Missing Values
# =========================

# Columns that cannot be imputed because they hold no observed value at all
# (possible when the drop threshold is raised to 100 or more).
unfilled_cols = []

# (A) Numerical: median imputation
for col in num_cols:
    if df_clean[col].isnull().any():
        median_value = df_clean[col].median()
        if pd.isna(median_value):
            # All values are missing: filling with NaN would be a silent no-op.
            unfilled_cols.append(col)
            continue
        df_clean[col] = df_clean[col].fillna(median_value)

# (B) Categorical: mode imputation
for col in cat_cols:
    if df_clean[col].isnull().any():
        modes = df_clean[col].mode()
        if modes.empty:
            # All values are missing: there is no most-frequent value to index.
            unfilled_cols.append(col)
            continue
        # Ties yield several modes; the first one is used.
        mode_value = modes.iloc[0]
        df_clean[col] = df_clean[col].fillna(mode_value)

if unfilled_cols:
    print("\nWarning: columns with no observed values were left unfilled:", unfilled_cols)
else:
    print("\n✅ Missing values handled!")

# =========================
# 7) Validate Dataset After Cleaning
# =========================
# Sum the per-column null counts into one total for the whole cleaned frame;
# 0 means no missing values remain.
print("\n📌 Missing Values After Cleaning:")
print(df_clean.isnull().sum().sum(), "total missing values remaining")

# =========================
# 8) Compare Before vs After
# =========================
# Dimensions of the raw frame as loaded.
print("\n📊 Before Cleaning:")
print("Rows:", df.shape[0], "Columns:", df.shape[1])

# Dimensions of the cleaned frame, for side-by-side comparison with the raw one.
print("\n📊 After Cleaning:")
print("Rows:", df_clean.shape[0], "Columns:", df_clean.shape[1])

# =========================
# 9) Save Cleaned Dataset
# =========================
# Destination CSV for the cleaned frame.
output_file = "cleaned_dataset.csv"
# index=False keeps the row index out of the file, so no extra index column is written.
df_clean.to_csv(output_file, index=False)

print("\n✅ Cleaned dataset saved as:", output_file)


ModuleNotFoundError: No module named 'pandas'