In [6]:
#Load Data

from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def _locate_housing_csv(extract_path):
    """Return the path to ``housing.csv`` under *extract_path*, or ``None``.

    The file is looked for directly inside *extract_path* first; if it is not
    there, sub-directories are searched, because an archive may unpack its
    contents into a nested folder.
    """
    if not extract_path.is_dir():
        return None
    direct = extract_path / "housing.csv"
    if direct.is_file():
        return direct
    # sorted() keeps the choice deterministic if several copies exist.
    nested = sorted(p for p in extract_path.rglob("housing.csv") if p.is_file())
    return nested[0] if nested else None


def load_housing_data():
    """Load the housing dataset as a DataFrame, fetching it on first use.

    The CSV is read from ``ca_housing_project/data/raw``. When it is not on
    disk yet, the ``housing.tgz`` archive is extracted there; the archive is
    downloaded first only if it is not already present locally.

    Returns:
        pandas.DataFrame: the parsed contents of ``housing.csv``.

    Raises:
        FileNotFoundError: if ``housing.csv`` is still missing after extraction.
        urllib.error.URLError: if the archive has to be downloaded and the
            request fails.
    """
    tarball_path = Path("ca_housing_project/data/raw/housing.tgz")
    extract_path = Path("ca_housing_project/data/raw")

    # Key the cache check on the CSV (the file actually read), not on the
    # tarball: a tarball left behind by an interrupted run must not prevent
    # extraction.
    csv_path = _locate_housing_csv(extract_path)
    if csv_path is None:
        extract_path.mkdir(parents=True, exist_ok=True)
        if not tarball_path.is_file():
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use it with a source you trust.
            housing_tarball.extractall(path=extract_path)
        csv_path = _locate_housing_csv(extract_path)
        if csv_path is None:
            raise FileNotFoundError(
                f"housing.csv not found under {extract_path} after extracting {tarball_path}"
            )

    return pd.read_csv(csv_path)

# Load the raw housing table once; the following cells inspect and extend this DataFrame.
housing = load_housing_data()
# housing.info()  # left disabled here; info() is called in the Data Type Analysis cell




In [None]:
# Data Type Analysis

import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# info() lists every column with its dtype and non-null count.
housing.info()

# Initial plots of raw data
housing.hist(bins=50, figsize=(12, 8))
plt.show()

# Housing price map: marker size follows population, colour follows
# median_house_value.
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True,
             s=housing["population"] / 100, label="population",
             c="median_house_value", cmap="jet", colorbar=True,
             legend=True, sharex=False, figsize=(10, 7))
plt.show()

# Correlation matrix
# numeric_only=True skips any non-numeric columns; without it pandas >= 2.0
# raises when the frame contains one.
corr_matrix = housing.corr(numeric_only=True)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(corr_matrix["median_house_value"].sort_values(ascending=False))

# Scatter matrix of the target against a few selected attributes
attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
plt.show()

# Median income vs. median house value scatter plot
housing.plot(kind="scatter", x="median_income", y="median_house_value",
             alpha=0.1, grid=True)
plt.show()


In [None]:
# Stratified Train/Test Splitting

import numpy as np
from sklearn.model_selection import train_test_split

# Fill in missing values
# fillna() returns a new Series, so the result has to be assigned back;
# calling it without assignment leaves the column unchanged.
# NOTE(review): the median is computed on the full dataset, before the split,
# so test rows influence the imputed value - confirm this is acceptable.
median = housing["total_bedrooms"].median()
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

# Add in "income category" feature (only used to stratify the split)
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

# Add in the new ratio features
housing["rooms_per_house"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_ratio"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["people_per_house"] = housing["population"] / housing["households"]

# Create 1 split set, stratified on the income category.
strat_train_set, strat_test_set = train_test_split(
    housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

# Remove the income category feature from both splits.
# Rebinding to the result of drop() avoids in-place mutation of the frames
# returned by train_test_split.
strat_train_set = strat_train_set.drop(columns="income_cat")
strat_test_set = strat_test_set.drop(columns="income_cat")

# Rename datasets
housing_test = strat_test_set
housing_train = strat_train_set

# Disabled alternative, kept for reference:
# # Renames the datasets and splits in X and Y
# housing = strat_train_set.drop("median_house_value", axis=1)
# housing_labels = strat_train_set["median_house_value"].copy()
#
# # Save a checkpoint
# housing.to_csv(Path("datasets/housing/Training_set_nolabels.csv"), index=False)
# housing_labels.to_csv(Path("datasets/housing/Training_set_labels.csv"), index=False)

# NOTE(review): these paths use "Data" while the loader uses "data/raw";
# on a case-sensitive filesystem they are different directories - confirm.
test_csv_path = Path("ca_housing_project/Data/Test/housing_test.csv")
train_csv_path = Path("ca_housing_project/Data/Train/housing_train.csv")

# to_csv() does not create missing directories, so make sure they exist first.
for csv_path in (test_csv_path, train_csv_path):
    csv_path.parent.mkdir(parents=True, exist_ok=True)

housing_test.to_csv(test_csv_path)
housing_train.to_csv(train_csv_path)