# Data Exploration: California Housing Dataset

## Overview

This notebook explores the California Housing dataset, which we'll use to implement linear and logistic regression from scratch. The dataset contains information about housing districts in California from the 1990 census.

## Dataset Characteristics

- **Source**: StatLib repository (originally from 1990 US Census)
- **Task**: Regression (predicting median house value)
- **Samples**: 20,640
- **Features**: 8 numerical features
- **Target**: Median house value (continuous, in hundreds of thousands of dollars)

## Why This Dataset?

1. **Well-understood problem**: Housing price prediction is intuitive and interpretable
2. **Tabular structure**: Perfect for linear models
3. **Real-world relevance**: Practical application with clear business value
4. **Feature diversity**: Mix of geographical, demographic, and structural features
5. **Size**: Large enough to demonstrate scalability, small enough for experimentation


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG up front so NumPy-based randomness is repeatable
np.random.seed(42)

# Plot appearance: apply the darkgrid stylesheet, then the "husl" colour palette
plt.style.use("seaborn-v0_8-darkgrid")
sns.set_palette("husl")


In [None]:
# Load the California Housing dataset (as_frame=True requests pandas objects)
housing = fetch_california_housing(as_frame=True)

# Extract features and target
X = housing.data
y = housing.target

print("Dataset shape:", X.shape)
print("\nFeature names:")
print(X.columns.tolist())
# DESCR describes the whole dataset, not just the target, and only its first
# 500 characters are shown here, so the heading says exactly that.
print("\nDataset description (first 500 characters):")
print(housing.DESCR[:500])


In [None]:
# Combine features and target into a single frame for inspection
df = X.copy()
df['MedHouseVal'] = y
print("First 5 rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info()
print("\nBasic statistics:")
print(df.describe())


In [None]:
# Check for missing values
missing_per_column = df.isnull().sum()
print("Missing values per column:")
print(missing_per_column)

# Derive the conclusion from the counts instead of asserting it unconditionally
total_missing = int(missing_per_column.sum())
if total_missing == 0:
    print("\nNo missing values found - dataset is clean!")
else:
    print(f"\nFound {total_missing} missing values - handle them before modeling.")


In [None]:
# Visualize target distribution: histogram (left panel) and box plot (right panel)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Compute the mean once; it feeds the marker line, its legend label and the summary
target_mean = y.mean()

# Histogram with the mean marked as a dashed red line
axes[0].hist(y, bins=50, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Median House Value (in $100,000s)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Median House Values')
axes[0].axvline(target_mean, color='red', linestyle='--', label=f'Mean: {target_mean:.2f}')
axes[0].legend()

# Box plot. Vertical is boxplot's default orientation, so the explicit
# `vert=True` keyword is omitted.
# NOTE(review): newer Matplotlib is moving from `vert` to `orientation` - confirm
# against the installed version's boxplot documentation.
axes[1].boxplot(y)
axes[1].set_ylabel('Median House Value (in $100,000s)')
axes[1].set_title('Box Plot of Median House Values')

plt.tight_layout()
plt.show()

# Numeric summary of the target
print("Target statistics:")
print(f"Mean: {target_mean:.2f}")
print(f"Median: {np.median(y):.2f}")
print(f"Std: {y.std():.2f}")
print(f"Min: {y.min():.2f}")
print(f"Max: {y.max():.2f}")


In [None]:
# Pairwise correlations between every column of df (features plus target)
correlation_matrix = df.corr()

# Heatmap options gathered in one place; values are annotated to two decimals
heatmap_style = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_style)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Target column of the matrix, ordered by signed value (largest positive first,
# most negative last)
target_correlations = correlation_matrix['MedHouseVal'].sort_values(ascending=False)
print("\nCorrelations with Median House Value:", target_correlations, sep="\n")


In [None]:
# One histogram per feature column of X, laid out on a 2x4 grid of panels
fig, axes = plt.subplots(2, 4, figsize=(18, 8))
axes = axes.flatten()  # 1-D view so panels can be addressed by a single index

for idx, feature in enumerate(X.columns):
    panel = axes[idx]
    panel.hist(X[feature], bins=30, edgecolor='black', alpha=0.7)
    panel.set(title=f'{feature}', xlabel='Value', ylabel='Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Scatter plots of the four features most strongly correlated with the target.
#
# target_correlations is ordered by *signed* correlation, so taking the head of
# its absolute values would miss strongly negative features. The target is
# dropped first (it would otherwise occupy one of the four slots and leave an
# empty panel), then the rest is re-ranked by absolute correlation.
feature_correlations = target_correlations.drop('MedHouseVal', errors='ignore')
top_features = (
    feature_correlations.abs()
    .sort_values(ascending=False)
    .head(4)
    .index.tolist()
)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, feature in enumerate(top_features):
    axes[idx].scatter(X[feature], y, alpha=0.3, s=10)
    axes[idx].set_xlabel(feature)
    axes[idx].set_ylabel('Median House Value')
    axes[idx].set_title(f'{feature} vs Median House Value')

    # Annotate each panel with the correlation coefficient between feature and target
    corr = np.corrcoef(X[feature], y)[0, 1]
    axes[idx].text(0.05, 0.95, f'r = {corr:.3f}',
                   transform=axes[idx].transAxes,
                   verticalalignment='top',
                   bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# Hide any panel left without a feature (only happens with fewer than four features)
for unused_ax in axes[len(top_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
import pickle

# Prepare data for modeling: hold out 20% of the rows as a test set.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Number of features: {X_train.shape[1]}")

# Bundle the split arrays with the feature names so other notebooks can load
# everything from a single file
data_dict = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'feature_names': X.columns.tolist()
}

# Written one directory above the notebook's working directory
with open('../data_preprocessed.pkl', 'wb') as out_file:
    pickle.dump(data_dict, out_file)

print("\nPreprocessed data saved to '../data_preprocessed.pkl'")
