In [1]:
import numpy as np
import pandas as pd

In [2]:
# 3x3 toy feature matrix reused by the scaling, normalizing and binarizing demos
x = np.array(
    [
        [1, -1, 2],
        [2, 0, 0],
        [0, 1, -1],
    ],
    dtype=float,
)

In [3]:
x

array([[ 1., -1.,  2.],
       [ 2.,  0.,  0.],
       [ 0.,  1., -1.]])

In [7]:
# 18 consecutive integers laid out as a 3x3x2 cube
y = np.arange(3 * 3 * 2).reshape((3, 3, 2))

In [8]:
y

array([[[ 0,  1],
        [ 2,  3],
        [ 4,  5]],

       [[ 6,  7],
        [ 8,  9],
        [10, 11]],

       [[12, 13],
        [14, 15],
        [16, 17]]])

In [9]:
from sklearn import preprocessing

In [10]:
# Standardize column-wise (axis=0 is the default): each feature ends up with zero mean and unit variance
x_scaled = preprocessing.scale(x, axis=0)

In [11]:
x_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [12]:
# Show both statistics as one tuple: a cell only displays its last expression,
# so a bare mean(...) line placed above std(...) is computed and silently discarded.
x_scaled.mean(axis=0), x_scaled.std(axis=0)


(array([ 0.,  0.,  0.]), array([ 1.,  1.,  1.]))

In [13]:
x_scaled.mean(axis=0)

array([ 0.,  0.,  0.])

In [14]:
x_scaled.std(axis = 0)


array([ 1.,  1.,  1.])

In [16]:
# Same standardization through the transformer API: build the scaler,
# then let fit() learn the per-column statistics from x
scaler = preprocessing.StandardScaler()
scaler = scaler.fit(x)

In [17]:
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [18]:
scaler.mean_

array([ 1.        ,  0.        ,  0.33333333])

In [19]:
scaler.scale_

array([ 0.81649658,  0.81649658,  1.24721913])

In [20]:
# StandardScaler.std_ is a deprecated alias that later scikit-learn releases removed;
# derive the per-feature standard deviation from the fitted variance instead.
np.sqrt(scaler.var_)



array([ 0.81649658,  0.81649658,  1.24721913])

In [22]:
# Apply the already-fitted scaler to x (uses the mean_/scale_ learned in fit())
z = scaler.transform(x)

In [23]:
z

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [24]:
z.mean(axis = 0)

array([ 0.,  0.,  0.])

In [25]:
z.std(axis = 0)

array([ 1.,  1.,  1.])

In [26]:
# Alternative: rescale every feature into a fixed interval (the default range is [0, 1])
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [27]:
min_max_scaler

MinMaxScaler(copy=True, feature_range=(0, 1))

In [28]:
# Learn each column's min/max from x, then map x into the target range
x_train_minmax = min_max_scaler.fit(x).transform(x)

In [29]:
x_train_minmax

array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  0.5       ,  0.33333333],
       [ 0.        ,  1.        ,  0.        ]])

In [32]:
# Call min() to get the per-column minimum; without parentheses the cell only shows the method object
x_train_minmax.min(axis=0)

array([ 0.,  0.,  0.])

In [33]:
min_max_scaler.min_

array([ 0.        ,  0.5       ,  0.33333333])

In [34]:
min_max_scaler.scale_

array([ 0.5       ,  0.5       ,  0.33333333])

In [35]:
# MaxAbsScaler divides each feature by its largest absolute value, so the training data lands in [-1, 1]
maxabs_scale = preprocessing.MaxAbsScaler(copy=True)

In [36]:
# Learn the per-column max-abs values from x, then scale x with them
x_train_maxabs = maxabs_scale.fit(x).transform(x)

In [37]:
x_train_maxabs

array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])

In [41]:
# Test samples on which the same max-abs scaling is applied
x_test = np.array([[-3, -1, 4], [3, 6, 8]], dtype=float)

In [42]:
x_test_maxabs = maxabs_scale.fit_transform(x_test)


In [43]:
x_test_maxabs

array([[-1.        , -0.16666667,  0.5       ],
       [ 1.        ,  1.        ,  1.        ]])

In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn import metrics


In [5]:
# Load the bundled iris dataset; its .data and .target are split into train/test sets below
iris = load_iris()



In [6]:
# Hold out 40% of the samples for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.4,
    random_state=0,
)

In [7]:
# Learn mean/std from the training split only, and standardize that same split in one step
scaler = preprocessing.StandardScaler()
x_train_transformed = scaler.fit_transform(x_train)

In [9]:
# Support-vector classifier (penalty parameter C=1) trained on the standardized training features
clf = SVC(C=1)
clf = clf.fit(x_train_transformed, y_train)

In [10]:
# Standardize the test split with the scaler fitted on the training split (no re-fitting)
x_test_transformed = scaler.transform(x_test)

In [11]:
clf.score(x_test_transformed,y_test)

0.93333333333333335

In [13]:
# Predicted class labels for the standardized test samples
y_pred = clf.predict(x_test_transformed)

In [14]:
y_pred

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1, 0,
       0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0, 0, 1,
       2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2])

In [15]:
metrics.accuracy_score(y_test,y_pred)

0.93333333333333335

In [3]:
# copy=True only makes the transformer work on a copy of the input; removing the linear correlation between features requires whitening, e.g. PCA(whiten=True)
# normalization is the process of scaling individual samples to have unit norm. The assumption used here is based on the
# vector space model
x

array([[ 1., -1.,  2.],
       [ 2.,  0.,  0.],
       [ 0.,  1., -1.]])

In [9]:
from sklearn import preprocessing

In [12]:
# Rescale every row (axis=1 is the default) of x to unit L2 norm
x_normalized = preprocessing.normalize(x, norm='l2', axis=1)

In [13]:
x_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [14]:
# The Normalizer utility class offers the same operation through the transformer API.
# fit() does nothing for this transformer; it is called only to follow the usual fit/transform pattern.
normalizer = preprocessing.Normalizer()
normalizer = normalizer.fit(x)
normalizer

Normalizer(copy=True, norm='l2')

In [15]:
normalizer.transform(x)

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [18]:
# Binarizer: feature binarization thresholds numerical features to obtain boolean (0/1) values.
# fit() does nothing for this transformer either.
binarizer = preprocessing.Binarizer()
binarizer = binarizer.fit(x)
binarizer

Binarizer(copy=True, threshold=0.0)

In [19]:
binarizer.transform(x)

array([[ 1.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])

In [21]:
# The cut-off is configurable: with threshold=1.1 only values above 1.1 map to 1
binarizer = preprocessing.Binarizer(threshold=1.1, copy=True)
binarizer.transform(x)

array([[ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [22]:
#Encoding categorical features, i.e. converting categorical text to numerical data, e.g. ["male", "female"] to [0, 1]. One way to convert
#such an integer representation into a form that scikit-learn estimators can use is OneHotEncoder. This estimator transforms each
#categorical feature with m possible values into m binary features, with only one active.
# Fit the encoder on four samples of three integer-coded categorical features
enc = preprocessing.OneHotEncoder()
enc.fit([
    [0, 0, 3],
    [1, 1, 0],
    [0, 2, 1],
    [1, 0, 2],
])

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [23]:
enc.transform([[0, 1, 3]]).toarray()

array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])

In [24]:
#Note that if there is a possibility that the training data is missing some categorical values,
#one has to set n_values explicitly. For example,
# Declare the number of categories per feature up front (2, 3 and 4)
enc = preprocessing.OneHotEncoder(
    n_values=[2, 3, 4],
)
# The training rows below do not contain every value of the 2nd and 3rd features
enc.fit([
    [1, 2, 3],
    [0, 2, 0],
])

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values=[2, 3, 4], sparse=True)

In [25]:
enc.transform([[1, 0, 0]]).toarray()

array([[ 0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.]])

In [26]:
#imputation of missing values
#For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders.
#Such datasets however are incompatible with scikit-learn estimators which assume that all values in an array are numerical,
#and that all have and hold meaning
#The Imputer class provides basic strategies for imputing missing values, either using the mean, 
#the median or the most frequent value of the row or column in which the missing values are located. 
#This class also allows for different missing values encodings
# NOTE(review): Imputer was removed from sklearn.preprocessing in later scikit-learn releases
# (replaced by sklearn.impute.SimpleImputer); this cell targets the older API.
from sklearn.preprocessing import Imputer
# Replace entries encoded as NaN with the mean of their column (axis=0)
imp = Imputer(strategy='mean', missing_values='NaN', axis=0)
imp.fit([
    [1, 2],
    [np.nan, 3],
    [7, 6],
])

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [28]:
# Rows with a gap in either column; each NaN is filled with the column mean learned by imp.fit()
X = [
    [np.nan, 2],
    [6, np.nan],
    [7, 6],
]
print(imp.transform(X))

[[ 4.          2.        ]
 [ 6.          3.66666667]
 [ 7.          6.        ]]


In [2]:
#polynomial features
#Often it's useful to add complexity to the model by considering nonlinear features of the input data.
#PolynomialFeatures generates the features' high-order and interaction terms.
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
# Three samples with two features each: [[0, 1], [2, 3], [4, 5]]
X = np.arange(3 * 2).reshape((3, 2))

In [4]:
# Degree-2 polynomial expansion (the first positional argument is the degree)
poly = PolynomialFeatures(degree=2)
poly

PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)

In [5]:
# When only interaction terms are wanted, interaction_only=True keeps products of
# distinct features and drops powers of a single feature.
poly = PolynomialFeatures(interaction_only=True, degree=3)
poly.fit(X).transform(X)

array([[  1.,   0.,   1.,   0.],
       [  1.,   2.,   3.,   6.],
       [  1.,   4.,   5.,  20.]])