In [1]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

Transformers
==============

Unsupervised transformations for preprocessing
--------------------------------------------------

In [2]:
# Load the Boston housing data shipped with scikit-learn and list the fields
# available on the returned object.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the environment pins an older release before re-running.
from sklearn.datasets import load_boston
boston = load_boston()
boston.keys()

dict_keys(['feature_names', 'target', 'DESCR', 'data'])

In [3]:
# Hold out part of the Boston data for testing.
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. The legacy
# module is kept as a fallback only so very old installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# NOTE: no random_state is passed, so the split — and every statistic printed
# from X_train afterwards — differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target)

In [4]:
# Turn off scientific notation in NumPy's printed output so the feature values
# are easier to compare by eye, then show the training matrix.
np.set_printoptions(suppress=True)
print(X_train)

[[   0.2909     0.        21.89    ...,   21.2      388.08      24.16   ]
 [   0.03041    0.         5.19    ...,   20.2      394.81      10.56   ]
 [   0.13587    0.        10.59    ...,   18.6      381.32      14.66   ]
 ..., 
 [   0.03502   80.         4.95    ...,   19.2      396.9        3.33   ]
 [   0.15086    0.        27.74    ...,   20.1      395.09      18.06   ]
 [   1.25179    0.         8.14    ...,   21.       376.57      21.02   ]]


In [5]:
# Per-feature statistics of the unscaled training data (axis=0 reduces over
# the rows, leaving one value per column).
feature_means = X_train.mean(axis=0)
feature_stds = X_train.std(axis=0)
print("mean : %s " % feature_means)
print("standard deviation : %s " % feature_stds)

mean : [   3.34897889   11.68601583   11.09968338    0.06860158    0.55088047
    6.30231662   67.81609499    3.85240739    9.54353562  408.46437995
   18.45883905  360.52184697   12.34614776] 
standard deviation : [   7.93206554   23.55777358    6.96186328    0.25277541    0.11399048
    0.69293459   28.67474926    2.11166078    8.77151056  169.97431336
    2.12426482   84.94790153    7.01054102] 


In [None]:
# StandardScaler is the transformer used in the following cells to rescale
# the features; it is created here with its default settings.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Fit on the training split only; the fitted scaler is reused unchanged for
# both the training and the test data in the cells that follow.
scaler.fit(X_train)

In [None]:
# Apply the fitted scaler to the same data it was fitted on.
X_scaled = scaler.transform(X_train)

In [None]:
# Scaling should leave the array's dimensions untouched, so both shapes are
# expected to match. Compare against X_train (the array that was transformed);
# the name `X` is not bound until the digits example later in the notebook, so
# referencing it here fails on a fresh kernel.
print(X_train.shape)
print(X_scaled.shape)

In [None]:
# Per-feature statistics of the scaled training data, printed in the same
# format as the statistics of the raw data for easy comparison.
scaled_means = X_scaled.mean(axis=0)
scaled_stds = X_scaled.std(axis=0)
print("mean : %s " % scaled_means)
print("standard deviation : %s " % scaled_stds)

In [None]:
# Transform the test split with the scaler that was fitted on X_train; the
# scaler is not refitted on the test data.
X_scaled_test = scaler.transform(X_test)

Principal Component Analysis
=============================

In [None]:
# Build a synthetic 2-D point cloud: 100 standard-normal points pushed through
# a random 2x2 linear map and shifted by a random offset. The three draws are
# taken from the seeded generator in the same order as a single-expression
# formulation would take them, so the resulting data is unchanged.
rnd = np.random.RandomState(42)
blob_base = rnd.normal(size=(100, 2))
blob_mix = rnd.normal(size=(2, 2))
blob_shift = rnd.normal(size=2)
X_blob = np.dot(blob_base, blob_mix) + blob_shift

blob_x, blob_y = X_blob[:, 0], X_blob[:, 1]
plt.scatter(blob_x, blob_y)
plt.xlabel("feature 1")
plt.ylabel("feature 2")

In [None]:
# Create a PCA transformer with default arguments (no n_components given).
from sklearn.decomposition import PCA
pca = PCA()

In [None]:
# Fit the PCA model to the synthetic two-feature blob.
pca.fit(X_blob)

In [None]:
# Project the blob with the fitted PCA and plot the first two projected
# coordinates against each other.
X_pca = pca.transform(X_blob)

first_pc = X_pca[:, 0]
second_pc = X_pca[:, 1]
plt.scatter(first_pc, second_pc)
plt.xlabel("first principal component")
plt.ylabel("second principal component")

PCA for Dimensionality Reduction
---------------------------------

In [None]:
# Load the digits dataset limited to five classes (n_class=5), then pull the
# feature matrix and the label vector out of the returned object.
from sklearn.datasets import load_digits

digits = load_digits(n_class=5)
X = digits.data
y = digits.target
print(X.shape)

In [None]:
# Rebinds `pca` (previously the default PCA fitted on X_blob) to a new model
# that keeps only two components, and fits it on the digits data.
pca = PCA(n_components=2)
pca.fit(X)

In [None]:
# Project the digits onto the two fitted components and report the resulting
# array shape. The shape tuple is wrapped in a 1-tuple so that %-formatting
# treats it as a single value.
X_reduced = pca.transform(X)
shape_report = "Reduced dataset shape: %s" % (X_reduced.shape, )
print(shape_report)

In [None]:
# Plot the two-component projection, coloring each point by its target label.
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y)