In [None]:
import pandas as pd
import os
from sklearn.impute import SimpleImputer, KNNImputer
import numpy as np

# === Step 1: Load datasets ===
# Each source names its join key differently, so every frame is renamed to the
# shared "location" key as soon as it is read.
housing = pd.read_csv(
    "../preprocessed_data/bangalore/india_housing_prices_bangalore_cleaned.csv"
).rename(columns={"Locality": "location"})
real_estate = pd.read_csv(
    "../preprocessed_data/bangalore/real_estate_data_combined.csv"
).rename(columns={"Location": "location"})
air_quality = pd.read_csv(
    "../preprocessed_data/bangalore/air_quality_bengaluru.csv"
).rename(columns={"City": "location"})
noise_quality = pd.read_csv(
    "../preprocessed_data/bangalore/noise_quality_bengaluru.csv"
).rename(columns={"Station": "location"})

# === Step 2: Merge datasets ===
# Outer joins keep locations that appear in only some of the sources; their
# missing fields become NaN in the merged frame.
# NOTE(review): when "location" repeats in both sides of a join, every pairing
# of the matching rows is produced - confirm that row multiplication is wanted.
df = (
    housing.merge(real_estate, on=["location"], how="outer")
    .merge(air_quality, on=["location"], how="outer")
    .merge(noise_quality, on=["location"], how="outer")
    .drop_duplicates()
)
df.to_csv("./merged_data.csv", index=False)

# Split the merged frame by dtype: the numeric part is what gets imputed below,
# everything else is carried through unchanged.
numeric_cols = df.select_dtypes(include=[np.number]).columns
categorical_cols = df.select_dtypes(exclude=[np.number]).columns
numeric_df = df.loc[:, numeric_cols]
categorical_df = df.loc[:, categorical_cols]

# Make sure the destination folder for the imputed CSVs exists.
output_dir = "imputed_datasets"
os.makedirs(output_dir, exist_ok=True)

# Numeric columns without a single observed value are set aside here and kept
# out of the frame handed to the imputers.
all_null_mask = numeric_df.isna().all()
null_only_cols = numeric_df.columns[all_null_mask].tolist()
valid_numeric_df = numeric_df.drop(columns=null_only_cols)

def _impute_and_save(imputer, file_name):
    """Impute the numeric columns with *imputer* and write the result to CSV.

    Reads the script-level frames prepared above (``valid_numeric_df``,
    ``null_only_cols``, ``numeric_cols``, ``categorical_df``, ``df``) and the
    ``output_dir`` folder.

    Args:
        imputer: Object exposing ``fit_transform``; it is fitted on
            ``valid_numeric_df``.
        file_name: Name of the CSV file created inside ``output_dir``.

    Returns:
        A ``(imputed_numeric, combined)`` tuple: the imputed numeric frame in
        the original ``numeric_cols`` order, and that frame concatenated with
        the untouched non-numeric columns.
    """
    imputed = pd.DataFrame(
        imputer.fit_transform(valid_numeric_df),
        columns=valid_numeric_df.columns,
        # Reuse the merged frame's index so the concat below aligns row by row.
        index=df.index,
    )
    # Columns with no observed values were excluded from fitting; add them back
    # as all-NaN so the output keeps every numeric column.
    for col in null_only_cols:
        imputed[col] = np.nan
    imputed = imputed[numeric_cols]

    combined = pd.concat([imputed, categorical_df], axis=1)
    combined.to_csv(os.path.join(output_dir, file_name), index=False)
    print(f"Saved: {file_name}")
    return imputed, combined


# ===== Mean imputation =====
mean_imputer = SimpleImputer(strategy="mean")
mean_imputed, df_mean = _impute_and_save(mean_imputer, "imputed_mean.csv")

# ===== Median imputation =====
median_imputer = SimpleImputer(strategy="median")
median_imputed, df_median = _impute_and_save(median_imputer, "imputed_median.csv")

# ===== Mode imputation =====
mode_imputer = SimpleImputer(strategy="most_frequent")
mode_imputed, df_mode = _impute_and_save(mode_imputer, "imputed_mode.csv")

# ===== KNN imputation =====
knn_imputer = KNNImputer(n_neighbors=5)
knn_imputed, df_knn = _impute_and_save(knn_imputer, "imputed_knn.csv")

print("All imputed datasets saved in:", output_dir)

Saved: imputed_mean.csv
Saved: imputed_median.csv
Saved: imputed_mode.csv
Saved: imputed_knn.csv
All imputed datasets saved in: imputed_datasets
