# In [None]:
# data_preprocessing.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# ------------------------------
# 1. Loading the Dataset and performing initial EDA
# ------------------------------
# Load the California housing data as pandas objects (as_frame=True).
housing = fetch_california_housing(as_frame=True)
# Work on a copy so later edits never touch the frame held by `housing`.
df = housing.frame.copy()
# Target column -> median house value. scikit-learn documents `frame` as
# already containing the target, so this assignment mainly guarantees the
# column is present under this name.
df['MedHouseVal'] = housing.target

print("First five rows of the dataset:")
print(df.head())

print("\nDataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

print("\nSummary Statistics:")
print(df.describe())

print("\nMissing Values per Column:")
print(df.isnull().sum())

# ------------------------------
# 2. Handle Missing Values
# ------------------------------
# Fill missing values in every numeric column with that column's median.
# `numeric_cols` is also reused further down when scaling.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col in numeric_cols:
    if df[col].isnull().any():
        median_val = df[col].median()
        # Assign the filled column back instead of calling
        # fillna(..., inplace=True) on `df[col]`: that chained in-place form
        # operates on an intermediate object, emits a FutureWarning on recent
        # pandas and leaves `df` unchanged when Copy-on-Write is enabled.
        df[col] = df[col].fillna(median_val)
        print(f"Filled missing values in {col} with median: {median_val}")

# ------------------------------
# 3. Doing Feature Engineering
# ------------------------------
# 3a. Encoding Categorical Variables (if present)
# Columns with object dtype are treated as categorical. When at least one
# exists, each is replaced by one-hot indicator columns; drop_first=True
# leaves out the indicator of the first level of every encoded column.
object_frame = df.select_dtypes(include=['object'])
categorical_cols = object_frame.columns.tolist()
if len(categorical_cols) > 0:
    df = pd.get_dummies(
        df,
        columns=categorical_cols,
        drop_first=True,
    )
    print("\nCategorical variables encoded using one-hot encoding.")

# 3b. Scaling Numerical Features
# Standardise the columns listed in `numeric_cols` and store them in a
# separate frame (`df_scaled`); `df` itself keeps the unscaled values.
# NOTE(review): `numeric_cols` was built from all numeric columns of `df`,
# which appears to include the 'MedHouseVal' target, so the target is
# standardised as well - confirm this is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])
df_scaled = df.copy()
df_scaled[numeric_cols] = scaled_values
print("\nNumerical features have been scaled using StandardScaler.")

# 3c. Doing Feature Selection
# Selecting features with an absolute correlation > 0.1 with the target.
# `correlations` holds |corr(column, MedHouseVal)| for every column of
# df_scaled, including the target's own entry.
correlations = df_scaled.corr()['MedHouseVal'].abs()
# The target correlates perfectly with itself, so exclude it from the
# candidates; otherwise it would always be reported as a selected feature.
feature_correlations = correlations.drop('MedHouseVal')
# Columns whose correlation is NaN fail the comparison and are left out.
selected_features = feature_correlations[feature_correlations > 0.1].index.tolist()
print("\nSelected features based on correlation threshold (abs(correlation) > 0.1):")
print(selected_features)

# ------------------------------
# 4. Performing Exploratory Data Analysis (EDA)
# ------------------------------
# Draw one 30-bin histogram per column of the unscaled DataFrame `df`,
# laid out on a single 12x10 inch figure.
hist_options = {"bins": 30, "figsize": (12, 10)}
df.hist(**hist_options)
# Adjust subplot spacing before the figure is displayed.
plt.tight_layout()
plt.show()

# Plotting the correlation heatmap of the scaled features.
# The pairwise correlation matrix of df_scaled is computed first, then drawn
# on a dedicated 10x8 inch figure with each cell annotated with its value.
correlation_matrix = df_scaled.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()