# California Housing Analysis

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
    roc_curve,
    roc_auc_score
)

sns.set_context("notebook")

## Load Data

In [4]:
df = pd.read_csv("data/housing.csv")

df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Create binary dataset

In [6]:
# Binary y/target creation from top 20% of median_house_value
threshold = df["median_house_value"].quantile(0.80)
y = (df["median_house_value"] >= threshold).astype(int)
y = pd.Series(y, name="high_value")

# Remove target column from features
X = df.drop(columns=["median_house_value"])

# Naming classes (for humans)
target_names = np.array(["not_high_value", "high_value"])

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Mapping 0/1 -> class_name:", dict(enumerate(target_names)))

print("\nClass distribution in percentage:")
print(y.value_counts(normalize=True))

X shape: (20640, 9)
y shape: (20640,)
Mapping 0/1 -> class_name: {0: np.str_('not_high_value'), 1: np.str_('high_value')}

Class distribution in percentage:
high_value
0    0.799903
1    0.200097
Name: proportion, dtype: float64
