In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
# NOTE(review): neither `os` nor `HOUSING_PATH` is defined in this cell, so both
# are presumably provided by an earlier cell -- confirm before running this
# cell on its own. `pd.read_csv` raises FileNotFoundError when housing.csv is
# not present under HOUSING_PATH.
df = pd.read_csv(os.path.join(HOUSING_PATH, "housing.csv"))

# Part 1: Exploratory Data Analysis (EDA)
# 1. Display the first 10 rows
print("First 10 rows of the dataset:")
print(df.head(10))

# Dataset info
# `DataFrame.info()` writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Info:")
df.info()

# Summary statistics (numeric columns only, which is the describe() default)
print("\nSummary Statistics:")
print(df.describe())

# Value counts for categorical columns
# Guarded so the cell still runs on a frame without an 'ocean_proximity' column.
if 'ocean_proximity' in df:
    print(
        "\nValue Counts for 'ocean_proximity':",
        df['ocean_proximity'].value_counts(),
        sep="\n",
    )

# Identify columns with missing values
# Per-column null counts; only the columns that actually have gaps are shown.
missing_values = df.isnull().sum()
print("\nColumns with Missing Values:")
print(missing_values.loc[missing_values > 0])

# Identify numerical vs categorical features
# Everything that is not a NumPy numeric dtype is treated as categorical here.
numerical_features = list(df.select_dtypes(include=[np.number]).columns)
categorical_features = list(df.select_dtypes(exclude=[np.number]).columns)
print(f"\nNumerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")

# Identify columns with unusual distributions or outliers
# Draws one boxplot per numerical column; the outliers are read off the plots,
# nothing beyond the header line is printed.
print("\nColumns with Unusual Distributions or Outliers:")
for col in numerical_features:
    boxplot_axes = sns.boxplot(x=df[col])
    boxplot_axes.set_title(f"Boxplot for {col}")
    plt.show()

# Part 2: Handling Missing Values
# Row-wise drop: a row is kept only if it has at least
# int(0.9 * number_of_columns) non-null values. The result is stored separately
# in `df_dropped`; `df` itself is not reduced and `df_dropped` is not used again
# in this cell.
df_dropped = df.dropna(
    axis=0,
    thresh=int(0.9 * len(df.columns)),
)

# Median imputation for 'total_bedrooms'
# The imputer needs a 2-D input, hence the double-bracket column selection.
# The column in `df` is overwritten in place with the imputed values.
imputer = SimpleImputer(strategy="median")
bedrooms_frame = df[['total_bedrooms']]
df['total_bedrooms'] = imputer.fit_transform(bedrooms_frame)

# Create a `missing_report(df)` function
def missing_report(dataframe):
    """Summarise the missing values of *dataframe*, one row per affected column.

    Args:
        dataframe: The pandas DataFrame to inspect.

    Returns:
        A DataFrame with the columns "Column", "Missing Values" (null count)
        and "Percentage" (null count as a percentage of the row count).
        Columns without any missing value are left out, so the result is empty
        when nothing is missing. The index keeps each column's position in
        *dataframe*.
    """
    row_count = len(dataframe)
    null_counts = dataframe.isnull().sum()
    null_percent = (null_counts / row_count) * 100

    summary = pd.DataFrame(
        {
            "Column": null_counts.index,
            "Missing Values": null_counts.values,
            "Percentage": null_percent.values,
        }
    )
    # Keep only the columns that really have gaps.
    has_gaps = summary["Missing Values"] > 0
    return summary[has_gaps]

# Report on `df` after the median imputation above has been applied.
print("\nMissing Report:", missing_report(df), sep="\n")

# Part 3: Encoding Categorical Variables
# One-Hot Encoding for 'ocean_proximity'
# drop_first=True omits the indicator column of the first category, so the
# remaining indicator columns are not linearly dependent on one another.
df_encoded = pd.get_dummies(
    data=df,
    columns=['ocean_proximity'],
    drop_first=True,
)
print("\nEncoded DataFrame:", df_encoded.head(), sep="\n")

# Part 4: Feature Scaling
# Columns that get rescaled; every other column of the encoded frame is copied
# through unchanged.
numerical_features_to_scale = [
    'median_income',
    'housing_median_age',
    'population',
    'median_house_value',
]

# StandardScaler variant (default settings), written into its own copy so that
# `df_encoded` keeps the unscaled values.
scaler_standard = StandardScaler()
df_scaled_standard = df_encoded.copy()
df_scaled_standard[numerical_features_to_scale] = scaler_standard.fit_transform(
    df_encoded[numerical_features_to_scale]
)

# MinMaxScaler variant (default settings), again on an independent copy.
scaler_minmax = MinMaxScaler()
df_scaled_minmax = df_encoded.copy()
df_scaled_minmax[numerical_features_to_scale] = scaler_minmax.fit_transform(
    df_encoded[numerical_features_to_scale]
)

# Plot histograms before and after scaling
# One figure per scaled feature with three side-by-side panels:
# original values, StandardScaler output, MinMaxScaler output.
for feature in numerical_features_to_scale:
    panels = (
        (df, f"Original {feature}"),
        (df_scaled_standard, f"Standard Scaled {feature}"),
        (df_scaled_minmax, f"MinMax Scaled {feature}"),
    )

    fig, axes = plt.subplots(1, 3, figsize=(12, 6))
    for axis, (frame, title) in zip(axes, panels):
        sns.histplot(frame[feature], kde=True, bins=30, ax=axis)
        axis.set_title(title)

    fig.tight_layout()
    plt.show()

# Part 5: Optional Feature Engineering
# Create new features
# Each new column is the element-wise ratio numerator / denominator of two
# existing columns. They are added to `df` only; the encoded and scaled frames
# built earlier in this cell do not receive them.
ratio_features = {
    'rooms_per_household': ('total_rooms', 'households'),
    'bedrooms_per_room': ('total_bedrooms', 'total_rooms'),
    'population_per_household': ('population', 'households'),
}
for new_column, (numerator, denominator) in ratio_features.items():
    df[new_column] = df[numerator] / df[denominator]

print("\nNew Features Added:")
print(df[list(ratio_features)].head())

FileNotFoundError: [Errno 2] No such file or directory: 'datasets\\housing\\housing.csv'