In [1]:
import numpy as np
from sklearn import preprocessing

In [2]:
#Sample data
data =  np.array([[ 1., -1.,  2.], [ 2.,  0.,  0.], [ 0.,  1., -1.]])
print(data)

[[ 1. -1.  2.]
 [ 2.  0.  0.]
 [ 0.  1. -1.]]


In [3]:
#Scaling
data_scaled = preprocessing.scale(data)
print(data_scaled)


[[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]


In [4]:
#Creating scaler instance
scaler = preprocessing.StandardScaler().fit(data)

In [5]:
print(scaler)
print(scaler.mean_ )                                     
print(scaler.scale_ )                                      

scaler.transform(data) 

StandardScaler(copy=True, with_mean=True, with_std=True)
[1.         0.         0.33333333]
[0.81649658 0.81649658 1.24721913]


array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [6]:

scaler.transform([[-1.,  1., 0.]]) #New element

# It is possible to disable either centering or scaling by either passing with_mean=False or with_std=False to the constructor of StandardScaler.

array([[-2.44948974,  1.22474487, -0.26726124]])

In [7]:


#Scaling features to a range
min_max_scaler = preprocessing.MinMaxScaler()
data_train_minmax = min_max_scaler.fit_transform(data)
data_train_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [8]:

data_test = np.array([[ -3., -1.,  4.]]) #New instance
data_test_minmax = min_max_scaler.transform(data_test)
data_test_minmax

array([[-1.5       ,  0.        ,  1.66666667]])

In [9]:
# MaxAbsScaler works in a very similar fashion, but scales in a way that the training data lies within the range [-1, 1] by dividing through the largest maximum value in each feature. It is meant for data that is already centered at zero or sparse data. 

#MaxAbsScaler 
max_abs_scaler = preprocessing.MaxAbsScaler()
data_train_maxabs = max_abs_scaler.fit_transform(data)
print(data_train_maxabs )               # doctest +NORMALIZE_WHITESPACE^
data_test = np.array([[ -3., -1.,  4.]])
data_test_maxabs = max_abs_scaler.transform(data_test)
print(data_test_maxabs  )               

max_abs_scaler.scale_         

[[ 0.5 -1.   1. ]
 [ 1.   0.   0. ]
 [ 0.   1.  -0.5]]
[[-1.5 -1.   2. ]]


array([2., 1., 2.])

In [10]:

#Normalization (Dot product or Matrices)
X = [[ 1., -1.,  2.], [ 2.,  0.,  0.],[ 0.,  1., -1.]]
X_normalized = preprocessing.normalize(X, norm='l2')

X_normalized 

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [11]:
from sklearn.preprocessing import Normalizer
normalizer = preprocessing.Normalizer().fit(X)  # fit does nothing
normalizer

Normalizer(copy=True, norm='l2')

In [12]:
normalizer.transform(X)

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [13]:
#Feature binarization
X = [[ 1., -1.,  2.],[ 2.,  0.,  0.],[ 0.,  1., -1.]]
print(X)



binarizer = preprocessing.Binarizer().fit(X)  # fit does nothing
print(binarizer)

binarizer.transform(X)

[[1.0, -1.0, 2.0], [2.0, 0.0, 0.0], [0.0, 1.0, -1.0]]
Binarizer(copy=True, threshold=0.0)


array([[1., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [16]:
from math import sqrt
x2norm=sqrt(1*1+(-1)*(-1)+2*2)
x2norm

2.449489742783178

In [17]:
import numpy
X = numpy.array([[ 1., -1.,  2.]])
X/x2norm

array([[ 0.40824829, -0.40824829,  0.81649658]])

In [18]:

#Adjust Threshold
binarizer = preprocessing.Binarizer(threshold=1.1)
binarizer.transform(X)

array([[0., 0., 1.]])

In [3]:
#Imputation of missing values
import numpy as np
from sklearn.preprocessing import SimpleImputer
imp = SimpleImputer(missing_values='NaN', strategy='mean', axis=0)
print(imp.fit([[1, 2], [np.nan, 3], [7, 6]]))

ImportError: cannot import name 'SimpleImputer' from 'sklearn.preprocessing' (C:\Users\welcome\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\preprocessing\__init__.py)

In [4]:
Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))    

[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]




In [5]:

#Custom Transformation
import numpy as np
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(np.log1p)
X = np.array([[0, 1], [2, 3]])
transformer.transform(X)




array([[0.        , 0.69314718],
       [1.09861229, 1.38629436]])

In [22]:
import pandas as pd

df = pd.DataFrame({'country': ['russia', 'germany', 'australia','korea','germany']})
df

Unnamed: 0,country
0,russia
1,germany
2,australia
3,korea
4,germany


In [23]:
pd.get_dummies(df,prefix=['country'])

Unnamed: 0,country_australia,country_germany,country_korea,country_russia
0,0,0,0,1
1,0,1,0,0
2,1,0,0,0
3,0,0,1,0
4,0,1,0,0


In [24]:
pd.get_dummies(df,prefix=['country'], drop_first=True)

Unnamed: 0,country_germany,country_korea,country_russia
0,0,0,1
1,1,0,0
2,0,0,0
3,0,1,0
4,1,0,0


In [25]:
# say you want a column for "japan" too (it'll be always zero, of course)
df["country"] = df["country"].astype('category',categories=["australia","germany","korea","russia","japan"])

# now call .get_dummies() as usual
pd.get_dummies(df,prefix=['country'])

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,country_australia,country_germany,country_korea,country_russia,country_japan
0,0,0,0,1,0
1,0,1,0,0,0
2,1,0,0,0,0
3,0,0,1,0,0
4,0,1,0,0,0
