# Initial Data Analysis (IDA)


In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import urllib.request
import tarfile
from sklearn.model_selection import StratifiedShuffleSplit
from shutil import copyfile

def load_housing_data():
    """Return the housing dataset as a DataFrame, fetching it on first use.

    The CSV is read from ``datasets/housing/housing.csv`` (relative to the
    current working directory). If that file is missing, the tarball
    ``datasets/housing.tgz`` is unpacked into ``datasets``; if the tarball is
    missing too, it is downloaded first.

    Returns:
        pandas.DataFrame: the parsed contents of ``housing.csv``.

    Raises:
        urllib.error.URLError: if the download is needed and fails.
        tarfile.TarError: if the archive cannot be read or a member is
            rejected by the extraction filter.
    """
    tarball_path = Path("datasets/housing.tgz")
    csv_path = Path("datasets/housing/housing.csv")
    # Only unpack when the CSV is not already on disk, so re-running the
    # notebook does not re-extract (or re-download) on every call.
    if not csv_path.is_file():
        if not tarball_path.is_file():
            Path("datasets").mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # The archive comes from the network: use the "data" extraction
            # filter where the interpreter supports it. This blocks path
            # traversal / special files and silences the DeprecationWarning
            # emitted for unfiltered extraction on Python 3.12+.
            if hasattr(tarfile, "data_filter"):
                housing_tarball.extractall(path="datasets", filter="data")
            else:
                housing_tarball.extractall(path="datasets")
    return pd.read_csv(csv_path)

In [2]:
# Load dataset (download if missing)
housing = load_housing_data()

# Project layout. ROOT is a relative path, so it is resolved against the
# notebook's working directory (expected to be the /analysis folder, making
# ROOT the project root).
# Path and copyfile come from the import cell at the top of the notebook;
# they are intentionally not re-imported here.
ROOT = Path("..")
RAW   = ROOT / "data" / "raw"
TRAIN = ROOT / "data" / "train"
TEST  = ROOT / "data" / "test"

# Ensure project folders exist (no error if they already do)
for folder in (RAW, TRAIN, TEST):
    folder.mkdir(parents=True, exist_ok=True)

# Save untouched copy to /data/raw (grading requirement).
# The source is the CSV that load_housing_data() unpacks and reads.
extracted_csv = Path("datasets/housing/housing.csv")
copyfile(extracted_csv, RAW / "housing.csv")

housing.head()

  housing_tarball.extractall(path="datasets")


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# info() prints its report directly (column dtypes and non-null counts).
housing.info()
# A notebook cell only renders its *last* bare expression, so the summary
# statistics must be displayed explicitly or they are silently discarded.
# `display` is provided by the IPython/Jupyter kernel; no import is needed.
display(housing.describe(include="all"))
# Missing values per column (rendered as the cell's output).
housing.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [4]:
# Create income category for stratified sampling: bucket median_income into
# five bands so the train/test split keeps the same income mix.
# Note: this adds an "income_cat" column to `housing` in place; it is dropped
# again from both splits below.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3., 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)

# Split: a single stratified 80/20 shuffle with a fixed seed for reproducibility.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so rows are selected with .iloc.
# (.loc would only be equivalent while `housing` keeps its default RangeIndex.)
# With n_splits=1 there is exactly one (train, test) pair to take.
train_idx, test_idx = next(splitter.split(housing, housing["income_cat"]))
train = housing.iloc[train_idx].drop(columns=["income_cat"]).reset_index(drop=True)
test  = housing.iloc[test_idx].drop(columns=["income_cat"]).reset_index(drop=True)

print("Train shape:", train.shape)
print("Test shape:", test.shape)


Train shape: (16512, 10)
Test shape: (4128, 10)


In [5]:
# Derive ratio features on both splits, then select the export columns.
# NOTE(review): the selection below holds 12 columns (11 features + the
# median_house_value target), not the 13 that the `_13` names suggest.
# Confirm the required schema before relying on the column count.
ratio_definitions = {
    # new column: (numerator column, denominator column)
    "rooms_per_household": ("total_rooms", "households"),
    "bedrooms_per_room": ("total_bedrooms", "total_rooms"),
}

# The new columns are written onto `train` and `test` themselves.
for split_frame in (train, test):
    for ratio_name, (numerator, denominator) in ratio_definitions.items():
        split_frame[ratio_name] = split_frame[numerator] / split_frame[denominator]

# Export order: original features, target, then the derived ratios.
cols_13 = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "ocean_proximity",
    "median_house_value",
    "rooms_per_household",
    "bedrooms_per_room",
]

# Independent copies restricted to the export columns.
train_13, test_13 = (split_frame[cols_13].copy() for split_frame in (train, test))

train_13.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value,rooms_per_household,bedrooms_per_room
0,-122.42,37.8,52.0,3321.0,1115.0,1576.0,1034.0,2.0987,NEAR BAY,458300.0,3.211799,0.335742
1,-118.38,34.14,40.0,1965.0,354.0,666.0,357.0,6.0876,<1H OCEAN,483800.0,5.504202,0.180153
2,-121.98,38.36,33.0,1083.0,217.0,562.0,203.0,2.433,INLAND,101700.0,5.334975,0.200369
3,-117.11,33.75,17.0,4174.0,851.0,1845.0,780.0,2.2618,INLAND,96100.0,5.351282,0.203881
4,-118.15,33.77,36.0,4366.0,1211.0,1912.0,1172.0,3.5292,NEAR OCEAN,361800.0,3.725256,0.277371


In [6]:
# Destination files for the two splits.
train_out, test_out = TRAIN / "housing_train.csv", TEST / "housing_test.csv"

# Write both splits without the DataFrame index column.
for split_frame, destination in ((train_13, train_out), (test_13, test_out)):
    split_frame.to_csv(destination, index=False)

# Report where each file was written.
for destination in (train_out, test_out):
    print("Saved:", destination)

Saved: ../data/train/housing_train.csv
Saved: ../data/test/housing_test.csv
