# Exploratory Data Analysis (EDA)

This notebook:
- Loads the **12-column raw training set** (11 predictors plus the `median_house_value` target) produced in `ida.ipynb`
- Saves geographic and correlation plots into `/images`
- Uses `analysis/preprocessing_pipeline.py` to engineer features and produce a **24-feature** processed training set
- Saves the processed dataset to `/data/train/housing_train_processed.csv`

> Note: This notebook lives in `/analysis`, so we set `ROOT = Path("..")` to refer to the project root.

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Project layout: this notebook runs from /analysis, so the project root is one level up.
ROOT = Path("..")
TRAIN = ROOT.joinpath("data", "train")
IMAGES = ROOT.joinpath("images")

# Create the figure output directory up front (no-op when it already exists).
IMAGES.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is obvious immediately.
print("CWD:", Path.cwd())
print("ROOT:", ROOT.resolve())
print("Expect pipeline at:", ROOT.joinpath("analysis", "preprocessing_pipeline.py").resolve())
print("Expect train CSV at:", TRAIN.joinpath("housing_train.csv").resolve())


CWD: /home/rolljake/cmse492_aml/ca_housing_project/analysis
ROOT: /home/rolljake/cmse492_aml/ca_housing_project
Expect pipeline at: /home/rolljake/cmse492_aml/ca_housing_project/analysis/preprocessing_pipeline.py
Expect train CSV at: /home/rolljake/cmse492_aml/ca_housing_project/data/train/housing_train.csv


In [2]:
# Read the training split; stop early with a hint when the file has not been generated yet.
train_path = TRAIN.joinpath("housing_train.csv")
assert train_path.exists(), "Missing data/train/housing_train.csv — run ida.ipynb first."

df_train = pd.read_csv(train_path)

# Quick sanity check: dimensions, then the first five rows.
print(df_train.shape)
df_train.head(n=5)


(16512, 12)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value,rooms_per_household,bedrooms_per_room
0,-122.42,37.8,52.0,3321.0,1115.0,1576.0,1034.0,2.0987,NEAR BAY,458300.0,3.211799,0.335742
1,-118.38,34.14,40.0,1965.0,354.0,666.0,357.0,6.0876,<1H OCEAN,483800.0,5.504202,0.180153
2,-121.98,38.36,33.0,1083.0,217.0,562.0,203.0,2.433,INLAND,101700.0,5.334975,0.200369
3,-117.11,33.75,17.0,4174.0,851.0,1845.0,780.0,2.2618,INLAND,96100.0,5.351282,0.203881
4,-118.15,33.77,36.0,4366.0,1211.0,1912.0,1172.0,3.5292,NEAR OCEAN,361800.0,3.725256,0.277371


## Geographic Visualization

Scatter of longitude vs latitude. Saved to `/images/geo_scatter.png`.


In [3]:
# Draw on an explicitly created Axes. Calling plt.figure() and then DataFrame.plot()
# without ax= makes pandas open a second figure, leaving the first one empty and open;
# the notebook then renders that leftover as an empty "<Figure ... with 0 Axes>".
fig, ax = plt.subplots()
df_train.plot(kind="scatter", x="longitude", y="latitude", alpha=0.2, ax=ax)
ax.set_title("Geo Scatter: Longitude vs Latitude (Train)")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")

out_path = IMAGES / "geo_scatter.png"
fig.savefig(out_path, bbox_inches="tight")
plt.close(fig)  # the figure is only needed on disk; close exactly the one we created
print("Saved:", out_path)


Saved: ../images/geo_scatter.png


<Figure size 640x480 with 0 Axes>

## Feature Correlation Analysis

Compute a correlation matrix over numeric columns and save a heatmap to `/images/correlations.png`.


In [4]:
# Pairwise Pearson correlations over the numeric columns only; non-numeric columns
# (e.g. ocean_proximity) are dropped by select_dtypes, so corr() needs no extra filtering.
num_df = df_train.select_dtypes(include=[np.number])
corr = num_df.corr()

fig, ax = plt.subplots(figsize=(8, 6))
# Pin the color scale to the full correlation range [-1, 1] so colors mean the same thing
# on every run, and attach a colorbar so the heatmap can be read quantitatively.
im = ax.imshow(corr, aspect="auto", vmin=-1, vmax=1)
fig.colorbar(im, ax=ax, label="Pearson r")

ticks = list(range(len(corr.columns)))
ax.set_xticks(ticks)
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(ticks)
ax.set_yticklabels(corr.columns)
ax.set_title("Numeric Correlations (Train)")
fig.tight_layout()

out_path = IMAGES / "correlations.png"
fig.savefig(out_path, bbox_inches="tight")
plt.close(fig)  # close exactly the figure we created
print("Saved:", out_path)


Saved: ../images/correlations.png


## Feature Engineering and Creation

Import the preprocessing pipeline from `analysis/preprocessing_pipeline.py`,
fit it on the raw training data, transform the data into **24 features**
(8 base numeric + 4 ratio features + 8 geo-cluster indicators + 4 one-hot columns), and save the processed dataset.

Pipeline steps:
- Impute missing values (median for numeric columns, most frequent value for the categorical column)
- Add ratio features
- Add 8 geo-cluster indicator columns
- Scale numeric
- One-hot encode `ocean_proximity` (drop='first')


In [5]:
import importlib.util
import sys

# Load analysis/preprocessing_pipeline.py directly from its file path under the module name "pp".
pp_path = ROOT.joinpath("analysis", "preprocessing_pipeline.py")
assert pp_path.exists(), f"Missing {pp_path} — create it first."

spec = importlib.util.spec_from_file_location(name="pp", location=str(pp_path))
pp = importlib.util.module_from_spec(spec)

# Register the module in sys.modules first, then run its top-level code.
sys.modules[spec.name] = pp
spec.loader.exec_module(pp)

# Build the (still unfitted) preprocessing pipeline and display it as the cell output.
pipeline = pp.build_pipeline()
pipeline


0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,eps,1e-09

0,1,2
,n_clusters,8
,random_state,42

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [6]:
TARGET = "median_house_value"
assert TARGET in df_train.columns, f"Target column `{TARGET}` not found in training file."

# Split predictors from the target; the pipeline only ever sees the predictors.
X = df_train.drop(columns=[TARGET])
y = df_train[TARGET].copy()

pipeline.fit(X, y)
X_proc = pipeline.transform(X)

# The one-hot encoder emits sparse output, so the ColumnTransformer may hand back a sparse
# matrix depending on overall density (sparse_threshold). Densify before building a frame.
if hasattr(X_proc, "toarray"):
    X_proc = X_proc.toarray()

# --- Build feature names manually ---
# NOTE(review): these names are written by hand and are assumed to follow the column order
# produced by the pipeline's "num" branch — confirm against preprocessing_pipeline.py.
N_GEO_CLUSTERS = 8  # must match the n_clusters used by the pipeline's geo-cluster step
num_base = list(pp.NUM_FEATS)  # base numeric inputs
num_ratios = [
    "rooms_per_household",
    "bedrooms_per_room",
    "population_per_household",
    "rooms_per_bedroom",
]  # +4 engineered ratios
num_clusters = [f"geo_cluster_{k}" for k in range(N_GEO_CLUSTERS)]  # +8 cluster indicators
num_names = num_base + num_ratios + num_clusters

# categorical feature names come from the fitted OneHotEncoder itself
ohe = pipeline.named_transformers_["cat"].named_steps["ohe"]
cat_names = ohe.get_feature_names_out(pp.CAT_FEAT).tolist()

# prefix for clarity
feat_names = [f"num__{n}" for n in num_names] + [f"cat__{n}" for n in cat_names]

# Fail with an actionable message if the hand-written names drift from the pipeline output,
# instead of relying on a comment or on pandas' generic shape error.
assert X_proc.shape[1] == len(feat_names), (
    f"Pipeline produced {X_proc.shape[1]} columns but {len(feat_names)} feature names were built "
    "— update the manual name lists to match preprocessing_pipeline.py."
)

# --- Make DataFrame ---
# Reuse X's index so the processed rows stay label-aligned with X and y.
X_proc_df = pd.DataFrame(X_proc, columns=feat_names, index=X.index)
print("Processed shape:", X_proc_df.shape)  # (rows, 24) with the current pipeline
X_proc_df.head()


Processed shape: (16512, 24)


Unnamed: 0,num__longitude,num__latitude,num__housing_median_age,num__total_rooms,num__total_bedrooms,num__population,num__households,num__median_income,num__rooms_per_household,num__bedrooms_per_room,...,num__geo_cluster_2,num__geo_cluster_3,num__geo_cluster_4,num__geo_cluster_5,num__geo_cluster_6,num__geo_cluster_7,cat__ocean_proximity_INLAND,cat__ocean_proximity_ISLAND,cat__ocean_proximity_NEAR BAY,cat__ocean_proximity_NEAR OCEAN
0,-1.423037,1.013606,1.861119,0.311912,1.368167,0.13746,1.394812,-0.936491,-0.866027,1.846624,...,-0.297188,-0.337668,-0.223265,-0.234835,-0.153685,2.278426,0.0,0.0,1.0,0.0
1,0.596394,-0.702103,0.90763,-0.30862,-0.435925,-0.693771,-0.373485,1.171942,0.02455,-0.508121,...,-0.297188,-0.337668,-0.223265,-0.234835,-0.153685,-0.438899,0.0,0.0,0.0,0.0
2,-1.203098,1.276119,0.351428,-0.71224,-0.760709,-0.788768,-0.775727,-0.759789,-0.041193,-0.202155,...,-0.297188,-0.337668,-0.223265,-0.234835,-0.153685,2.278426,1.0,0.0,0.0,0.0
3,1.231216,-0.884924,-0.919891,0.702262,0.742306,0.383175,0.731375,-0.850281,-0.034858,-0.149006,...,-0.297188,2.961489,-0.223265,-0.234835,-0.153685,-0.438899,1.0,0.0,0.0,0.0
4,0.711362,-0.875549,0.5898,0.790125,1.595753,0.444376,1.755263,-0.180365,-0.666554,0.963208,...,-0.297188,-0.337668,-0.223265,-0.234835,-0.153685,-0.438899,0.0,0.0,0.0,1.0


## Save Processed Training Set (24 features)

Append the target and write to `/data/train/housing_train_processed.csv`.


In [7]:
# Attach the target as the last column. Values are taken positionally from y,
# so the row order of X_proc_df must match the order y was extracted in.
processed = X_proc_df.assign(**{TARGET: y.values})

# Write without the index column.
out_proc = TRAIN.joinpath("housing_train_processed.csv")
processed.to_csv(out_proc, index=False)

print("Saved processed training set →", out_proc.resolve())
processed.shape


Saved processed training set → /home/rolljake/cmse492_aml/ca_housing_project/data/train/housing_train_processed.csv


(16512, 25)