In [None]:
%matplotlib inline
import re
import os
import zipfile
from zlib import crc32

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from requests import get as req_get

In [None]:
# Fetch the StatLib "houses" archive once and unpack it into data/.
# Nothing is downloaded when the raw text file or the converted CSV already exists.
os.makedirs('data', exist_ok=True)
if not os.path.exists('data/cadata.txt') and not os.path.exists('data/housing.csv'):
    if not os.path.exists('data/data.zip'):
        load_data = req_get('http://lib.stat.cmu.edu/datasets/houses.zip', timeout=60)
        # Fail here on an HTTP error instead of caching an error page as data.zip,
        # which every later run would then try (and fail) to unzip.
        load_data.raise_for_status()
        with open('data/data.zip', 'wb') as f:
            f.write(load_data.content)
    # The context manager closes the archive handle before the file is deleted;
    # removing a file that is still open fails on Windows.
    with zipfile.ZipFile('data/data.zip', 'r') as zip_data:
        zip_data.extractall('data/')
    os.remove('data/data.zip')

In [None]:
# Load the housing table: use the cached CSV when it exists; otherwise parse the raw
# text file once, cache the result as CSV and delete the raw file.
if os.path.exists('data/housing.csv'):
    df = pd.read_csv('data/housing.csv')
else:
    with open('data/cadata.txt', 'r', encoding='Windows-1252') as fe:
        # NOTE(review): 1575 is presumably the offset of the first data row, i.e. it skips
        # a descriptive header at the top of the file — confirm against the downloaded file.
        fe.seek(1575)
        data = fe.read()
    # str.split() without a separator splits on any run of whitespace and never yields
    # empty tokens, so no regex or filtering is needed. Blank lines are skipped so they
    # do not end up as empty rows in the frame.
    data = [
        [float(token) for token in line.split()]
        for line in data.splitlines()
        if line.strip()
    ]
    df = pd.DataFrame(data, columns=['median house value', 'median income', 'housing median age', 'total rooms', 'total bedrooms', 'population', 'households', 'latitude', 'longitude'])
    df.to_csv('data/housing.csv', index=False)
    os.remove('data/cadata.txt')

df.head()

In [None]:
# Print the frame's structure: index range, column names, non-null counts and dtypes
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Histogram of every numeric column (50 bins each) to eyeball the distributions
df.hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
# Split the dataset into a train set and a test set.
# The split is hash-based and deterministic rather than randomly shuffled: each row gets an
# id built from its coordinates, and that id alone decides which set the row falls into.
df["id"] = df["longitude"] * 1000 + df["latitude"]

# Seeds NumPy's global RNG. The hash-based split defined below does not draw from it.
np.random.seed(42)
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2 ** 32 # crc-32 here used as hash func

def split_train_test(dataset, test_ratio, id_column):
    """Split ``dataset`` into train and test rows using a hash of each row's id.

    A row goes to the test part when ``test_set_check`` accepts the value in its
    ``id_column`` for the given ``test_ratio``; every other row goes to the train
    part. Membership depends only on the id and the ratio, not on row order.

    Returns a ``(train_rows, test_rows)`` tuple.
    """
    test_mask = dataset[id_column].apply(test_set_check, args=(test_ratio,))
    return dataset.loc[~test_mask], dataset.loc[test_mask]

train_set, test_set = split_train_test(df, 0.2, "id") # hash-based stand-in for scikit-learn's train_test_split; 0.2 is the test ratio
# train_set, test_set = sklearn.model_selection.train_test_split(df, test_size=0.2, random_state=42)
train_set

In [None]:
# stratified sampling: bucket median income into five categories to stratify on
income_bin_edges = [0., 1.5, 3., 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
df["income cat"] = pd.cut(df["median income"], bins=income_bin_edges, labels=income_bin_labels)
df["income cat"].hist()

In [None]:
# Stratified split on the income category, holding out 20% of the rows as the test set.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(split.split(df, df["income cat"]))
# split() yields positional row indices, so rows are selected with .iloc; .loc is
# label-based and only agrees with positions while df keeps its default 0..n-1 index.
# .copy() gives each subset its own data so later modifications are independent of df.
strat_train_set = df.iloc[train_index].copy()
strat_test_set = df.iloc[test_index].copy()
# Share of each income category in the test set
strat_test_set["income cat"].value_counts() / len(strat_test_set)

In [None]:
# Remove the helper "income cat" column from both stratified subsets (df itself keeps it).
# errors="ignore" makes the cell safe to re-run: dropping an already-removed column
# is a no-op instead of a KeyError.
strat_train_set = strat_train_set.drop(columns="income cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income cat", errors="ignore")

In [None]:
# Work on a copy of the stratified train set and plot it geographically:
# marker size follows population, marker colour follows median house value.
housing = strat_train_set.copy()
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    c="median house value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    figsize=(10, 7),
)
plt.legend()

In [None]:
# measuring correlation: pairwise correlation of every column except "id"
corr_matrix = housing.drop(columns="id").corr()
corr_matrix

In [None]:
# Scatter matrix of the attributes below, to inspect their pairwise relationships
attributes = [
    'median house value',
    'median income',
    'total rooms',
    'housing median age',
]
pd.plotting.scatter_matrix(housing[attributes], figsize=(12, 8))

In [None]:
# Median income against median house value; the low alpha makes dense regions stand out
housing.plot(kind="scatter", x="median income", y="median house value", alpha=0.1)

In [None]:
# Derive three ratio attributes, then rank every column by its correlation with the house value
housing["rooms per household"] = housing["total rooms"].div(housing["households"])
housing["bedrooms per room"] = housing["total bedrooms"].div(housing["total rooms"])
housing["population per household"] = housing["population"].div(housing["households"])

corr_matrix = housing.drop(columns="id").corr()
corr_matrix["median house value"].sort_values(ascending=False)

In [None]:
# Prepare the data for training: separate the labels from the features,
# then fill missing values with each column's median.
housing_labels = strat_train_set["median house value"].copy()
housing = strat_train_set.drop(columns=["median house value", "id"])
imputer = SimpleImputer(strategy="median")
# fit() then transform() on the same data; imputer.fit_transform(housing) would do both in one call
X = imputer.fit(housing).transform(housing)
housing

In [None]:
# custom transformer
# Column positions inside the feature array handed to the transformer (housing.values).
# With 'median house value' and 'id' dropped, the columns are, in order:
#   0 median income, 1 housing median age, 2 total rooms, 3 total bedrooms,
#   4 population, 5 households, 6 latitude, 7 longitude
rooms_ix, bedrooms_ix, population_ix, households_ix = 2, 3, 4, 5

class CombinedAttributeAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D feature array.

    Column positions are read from the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a bedrooms-per-room column is appended as well.
    """

    def __init__(self, add_bedrooms_per_room=True) -> None:
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Appended in order: rooms per household, population per household and,
        when ``add_bedrooms_per_room`` is set, bedrooms per room.
        """
        households = X[:, households_ix]
        extra_columns = [X[:, rooms_ix] / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        # np.c_ receives the same tuple key as np.c_[X, col1, col2, ...]
        return np.c_[(X, *extra_columns)]

# Append the extra ratio attributes to the training features (bedrooms-per-room switched off);
# the transformer indexes columns by position, so it is given the plain NumPy array
attr_adder = CombinedAttributeAdder(add_bedrooms_per_room=False)
housing_extra_attributes = attr_adder.transform(housing.values)
housing_extra_attributes