In [None]:
import os
from pathlib import Path

import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cartopy.io.shapereader as shpreader
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Random seed passed to the train/test splitter so the split is reproducible.
SEED = 51

# Base directory for datasets, read from the environment.
# Index os.environ directly: if the variable is unset this raises a KeyError
# naming it, rather than Path(None) failing with an opaque TypeError.
BOOK_FILES_DATASETS_BASE = Path(os.environ['BOOK_FILES_DATASETS_BASE'])
HOUSING_DATA_PATH = BOOK_FILES_DATASETS_BASE / 'housing/housing.csv'

In [None]:
def load_housing_data(housing_data_path=HOUSING_DATA_PATH):
    """Read the housing CSV file into a DataFrame.

    Parameters
    ----------
    housing_data_path : path-like, optional
        Location of the CSV file; defaults to ``HOUSING_DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    housing_frame = pd.read_csv(housing_data_path)
    return housing_frame

In [None]:
# Load the housing data from the default location (HOUSING_DATA_PATH).
df_housing = load_housing_data()

In [None]:
# Preview the first rows of the raw data.
df_housing.head()

In [None]:
# Column names, dtypes and non-null counts -- a quick way to spot missing values.
df_housing.info()

In [None]:
# How many rows fall under each distinct ocean_proximity value.
df_housing['ocean_proximity'].value_counts()

In [None]:
# Summary statistics of the dataset's columns.
df_housing.describe()

In [None]:
# One 50-bin histogram per column, drawn on a large figure without grid lines.
df_housing.hist(
    bins=50,
    figsize=(20, 15),
    grid=False,
)
plt.show()

In [None]:
# Bucket median_income into five categories labelled 1-5; the resulting column
# is the stratification key for the train/test split.
# NOTE(review): pd.cut bins are right-closed by default, so a value of exactly
# 0 would land outside the first bin and become NaN -- confirm none exist.
df_housing['income_cat'] = pd.cut(
    df_housing['median_income'],
    labels=[1, 2, 3, 4, 5],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
)

In [None]:
# Show how many rows fall into each income category.
df_housing['income_cat'].hist(grid=False)
plt.show()

In [None]:
# Split 80/20 into train and test sets, stratified on income_cat.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_indices, test_indices = next(
    splitter.split(df_housing, df_housing.income_cat)
)

# split() returns integer *positions*, not index labels, so select with the
# positional .iloc. The label-based .loc only gives the same rows while the
# frame still carries its default 0..n-1 index.
df_housing_train = df_housing.iloc[train_indices]
df_housing_test = df_housing.iloc[test_indices]

In [None]:
def get_income_cat_distribution(df):
    """Return the share of rows in each ``income_cat`` category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``income_cat`` column.

    Returns
    -------
    pandas.Series
        Count of each category divided by the total number of rows in ``df``.
    """
    category_counts = df['income_cat'].value_counts()
    total_rows = len(df.index)
    return category_counts / total_rows


# Side-by-side income-category shares for the full data and both subsets, to
# check that the stratified split kept the proportions.
pd.DataFrame({
    subset_name: get_income_cat_distribution(subset)
    for subset_name, subset in (
        ('raw', df_housing),
        ('train', df_housing_train),
        ('test', df_housing_test),
    )
})

In [None]:
# income_cat was only needed for stratification; remove it from both subsets.
# Gotcha: re-running this cell raises KeyError because the column is already gone.
df_housing_train = df_housing_train.drop('income_cat', axis='columns')
df_housing_test = df_housing_test.drop('income_cat', axis='columns')

In [None]:
# Explore on a separate copy so the training frame itself is left untouched.
df_housing_train_copy = df_housing_train.copy()  # For messing around with

In [None]:
fig = plt.figure(figsize=(10, 10))

# A single frameless map axes that fills the whole figure.
ax = fig.add_axes([0, 0, 1, 1], projection=ccrs.LambertConformal(), frameon=False)

# Extent is given as [x0, x1, y0, y1]: longitude bounds, then latitude bounds.
ax.set_extent([-125, -113, 32, 43])

# Background layers.
ax.add_feature(cfeature.OCEAN)
ax.add_feature(cfeature.LAND)
ax.add_feature(cfeature.RIVERS)
ax.add_feature(cfeature.LAKES)

# Natural Earth state/province boundaries, used to highlight California.
shapename = 'admin_1_states_provinces'
states_shp = shpreader.natural_earth(
    resolution='50m',
    category='cultural',
    name=shapename,
)
reader = shpreader.Reader(states_shp)
states = reader.records()

for state in states:
    if state.attributes['name'] == 'California':
        # add_geometries expects a collection of geometries; wrapping the
        # single geometry in a list avoids relying on multi-part geometries
        # being iterable.
        ax.add_geometries(
            [state.geometry],
            ccrs.PlateCarree(),
            color='w',
            alpha=0.5,
        )

# Training rows as points coloured by median house value; the coordinates are
# plain longitude/latitude, hence the PlateCarree transform.
ax.scatter(
    df_housing_train_copy.longitude,
    df_housing_train_copy.latitude,
    c=df_housing_train_copy.median_house_value,
    alpha=0.3,
    transform=ccrs.PlateCarree(),
    zorder=2,
)
# End with plt.show(), like the other plotting cells, so the scatter artist's
# repr is not echoed as cell output.
plt.show()