In [117]:
import pandas as pd

In [118]:
# Read the previously prepared train/test splits from the processed-data folder.
train_set, test_set = map(
    pd.read_csv,
    ("data/processed/train_set.csv", "data/processed/test_set.csv"),
)

In [119]:
# Separate the predictors from the label column for each split. The label is
# copied so it is independent of the frame it was read from.
target_col = "median_house_value"

X_train, y_train = train_set.drop(columns=target_col), train_set[target_col].copy()
X_test, y_test = test_set.drop(columns=target_col), test_set[target_col].copy()



In [120]:
# "ocean_proximity" is handled as the categorical column; every other feature
# column is routed through the numeric preprocessing.
cat_attribs = ["ocean_proximity"]
num_attribs = X_train.columns.drop(cat_attribs)

In [121]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# CombinedAttributesAdder is used inside the numeric branch of the
# ColumnTransformer, so the array it receives contains only the columns listed
# in `num_attribs`, in that order. The positional indices are therefore looked
# up in `num_attribs` rather than in the full `X_train.columns`: the latter
# still contains "ocean_proximity" and would yield shifted (wrong) positions
# for any of these columns located after it.
col_names = ["total_rooms", "total_bedrooms", "population", "households"]
rooms_ix, bedrooms_ix, population_ix, households_ix = [
    num_attribs.get_loc(c) for c in col_names]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns of a 2-D array.

    Column positions come from the module-level ``rooms_ix``, ``bedrooms_ix``,
    ``population_ix`` and ``households_ix`` indices.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a bedrooms / rooms column is appended after the two
        per-household ratio columns.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]

        # Order matters: rooms/household, population/household, then the
        # optional bedrooms/room column.
        derived = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / rooms)

        return np.c_[(X, *derived)]


In [122]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing, applied in order: fill missing values with the column
# median, append the combined ratio attributes, then standardize every column.
num_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

In [123]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Route the numeric columns through `num_pipeline` and one-hot encode the
# categorical column; the two outputs are concatenated in this order.
column_transformers = [
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_transformers)


In [124]:
# Fit the preprocessing on the training features only, then apply the
# already-fitted pipeline to the test features (no re-fitting on test data).
X_train_set = full_pipeline.fit_transform(X_train)
X_test_set = full_pipeline.transform(X_test)

In [125]:

# Persist each split: the transformed feature matrix as .npy and the matching
# target column as CSV (without the index column).
processed_outputs = (
    (X_train_set, "data/processed/X_train.npy", y_train, "data/processed/y_train.csv"),
    (X_test_set, "data/processed/X_test.npy", y_test, "data/processed/y_test.csv"),
)
for features, features_path, target, target_path in processed_outputs:
    np.save(features_path, features)
    target.to_csv(target_path, index=False)
