In [1]:
from sklearn.datasets import fetch_california_housing

In [2]:
# Load the California Housing dataset through scikit-learn's fetcher. The
# returned object is read by key in later cells ('DESCR', 'data', 'target',
# 'feature_names').
data = fetch_california_housing()

In [8]:
# Print the dataset's own description text (attribute list, source, target).
print(data['DESCR'])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [9]:
# Pull out the feature matrix and the target (median house value, per the
# dataset description) in a single assignment.
X, y = data['data'], data['target']

In [10]:
from sklearn.preprocessing import StandardScaler
import numpy as np
# Standardizer applied to every column of X.
scaler = StandardScaler()
# fit() only estimates the scaling statistics from X; it returns the
# estimator, so its repr is what this cell displays.
scaler.fit(X)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [11]:
# Apply the fitted scaling to all columns. The result is stored in Xt; X is
# still used below for the unscaled statistics.
Xt = scaler.transform(X)

In [12]:
# Build a (n_features, 4) table: each per-feature statistic vector becomes one
# column, in the order raw mean, raw variance, scaled mean, scaled variance.
stats = np.column_stack([
    X.mean(axis=0),
    X.var(axis=0),
    Xt.mean(axis=0),
    Xt.var(axis=0),
])

In [14]:
import pandas as pd
# Label the statistics table: rows are the dataset's feature names, columns
# name the statistic held in the matching column of `stats`.
feature_names = data['feature_names']
columns = [
    'unscaled mean',
    'unscaled variance',
    'scaled mean',
    'scaled variance',
]
df = pd.DataFrame(data=stats, index=feature_names, columns=columns)

In [15]:
# Display the summary table: one row per feature, comparing the raw and the
# standardized mean/variance.
df

Unnamed: 0,unscaled mean,unscaled variance,scaled mean,scaled variance
MedInc,3.870671,3.609148,6.6097e-17,1.0
HouseAge,28.639486,158.3886,5.508083e-18,1.0
AveRooms,5.429,6.121236,6.6097e-17,1.0
AveBedrms,1.096675,0.2245806,-1.060306e-16,1.0
Population,1425.476744,1282408.0,-1.101617e-17,1.0
AveOccup,3.070655,107.8648,3.442552e-18,1.0
Latitude,35.631861,4.562072,-1.079584e-15,1.0
Longitude,-119.569704,4.013945,-8.526513e-15,1.0


In [16]:
from sklearn.compose import ColumnTransformer
# Standardize only the columns at positions 0-5 (slice(0, 6)); every other
# column is forwarded unchanged because remainder='passthrough'. With the
# feature order listed in the dataset description, the untouched columns are
# Latitude and Longitude.
col_transformer = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), slice(0, 6)),
    ],
    remainder='passthrough',
)

In [17]:
# Fit the column-wise transformer on X; fit() returns the transformer, so its
# repr is what this cell displays.
col_transformer.fit(X)

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('scaler',
                                 StandardScaler(copy=True, with_mean=True,
                                                with_std=True),
                                 slice(0, 6, None))],
                  verbose=False)

In [18]:
# NOTE: this rebinds Xt, which until now held the fully standardized array
# produced by scaler.transform(X). Re-running the earlier `stats` cell after
# this one would report statistics for this partially scaled array instead.
Xt = col_transformer.transform(X)

In [19]:
# Column 0 (MedInc) lies inside slice(0, 6), i.e. it is one of the columns the
# ColumnTransformer scales; this is its raw mean for comparison.
print('MedInc mean before transformation? ', X.mean(axis=0)[0])

MedInc mean before transformation?  3.8706710029069766


In [20]:
# Column 0 was standardized, so its mean should be zero up to floating-point
# rounding.
print("MedInc mean after transformation? ", Xt.mean(axis=0)[0])

MedInc mean after transformation?  6.609699867535816e-17


In [21]:
# The last column (Longitude) lies outside slice(0, 6), so it is handled by
# remainder='passthrough'; this is its raw mean for comparison.
print('Longitude mean before transformation? ', X.mean(axis=0)[-1])

Longitude mean before transformation?  -119.56970445736432


In [22]:
# Passthrough columns are not scaled, so this should match the raw mean.
print('Longitude mean after transformation? ', Xt.mean(axis=0)[-1])

Longitude mean after transformation?  -119.56970445736432


In [23]:
# The second-to-last column (Latitude) also lies outside slice(0, 6) and is
# passed through; this is its raw mean for comparison.
print('Latitude mean before transformation? ', X.mean(axis=0)[-2])

Latitude mean before transformation?  35.63186143410853


In [24]:
# Passthrough columns are not scaled, so this should match the raw mean.
print('Latitude mean after transformation? ', Xt.mean(axis=0)[-2])

Latitude mean after transformation?  35.63186143410853
