## Rescaling a feature

In [2]:
import numpy as np
from sklearn import preprocessing

# create a feature
feature = np.array([
                    [-500.5],
                    [-100.1],
                    [0],
                    [100.11],
                    [900.9]
                  ])

# create scaler
minmax_scaler =	preprocessing.MinMaxScaler(feature_range=(0,1))

# scale feature
scaled_feature = minmax_scaler.fit_transform(feature)

scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857856],
       [1.        ]])

## Standardizing a Feature

In [3]:
from sklearn import preprocessing

# create a feature
feature = np.array([
                  [-1000.1],
                  [-200.2],
                  [500.5],
                  [600.6] ,
                  [9000.9]
                ])

# create scaler
scaler  =  preprocessing.StandardScaler()

# transform the feature
standardized = scaler.fit_transform(feature)

standardized


array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [4]:
print( "Mean {}".format(round(standardized.mean())))
print("Standard Deviation: {}".format(standardized.std()))

Mean 0
Standard Deviation: 1.0


In [6]:
# using the RobustScaler method:

# create scaler
robust_scaler = preprocessing.RobustScaler()

# transform feature
robust_scaler.fit_transform(feature)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

## Normalizing Observations

In [8]:
import numpy as np
from sklearn.preprocessing import Normalizer

# create feature matrix
features = np.array([
                    [0.5, 0.5],
                    [1.1,	3.4] ,
                    [1.5, 20.2] ,
                    [1.63, 34.4] ,
                    [10.9, 3.3]
                    ])

# create  normalizer
normalizer = Normalizer(norm="l2")

# transofmr feature matrix
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [11]:
# transform feature matrix
features_l1_norm =	Normalizer(norm="l1").transform(features)
print("Sum of the first observation's values: {}".format(features_l1_norm[0,0] + features_l1_norm[0, 1]))

Sum of the first observation's values: 1.0


In [12]:
# Display the L1 normalized feature matrix
print("\nL1 Normalized Feature Matrix:")
print(features_l1_norm)


L1 Normalized Feature Matrix:
[[0.5        0.5       ]
 [0.24444444 0.75555556]
 [0.06912442 0.93087558]
 [0.04524008 0.95475992]
 [0.76760563 0.23239437]]


## Grouping Observations Using Clustering

In [15]:
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

features, _ = make_blobs(n_samples=50,
                         n_features = 2,
                         centers = 3,
                         random_state = 1)

df = pd.DataFrame(features, columns = ["feature_1", "feature_2" ])

# make k-means clusterer
clusterer = KMeans(3, random_state=0)

# fit clusterer
clusterer.fit(features)

# predict values
df['group'] = clusterer.predict(features)

df.head()



Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,0
1,-7.28721,-8.353986,2
2,-6.943061,-7.023744,2
3,-7.440167,-8.791959,2
4,-6.641388,-8.075888,2


## Deleteing Observations with Missing Values

In [16]:
import numpy as np

features = np.array([
                    [1.1, 11.1],
                    [2.2,	22.2],
                    [3.3, 33.3],
                    [np.nan, 55]
                  ])

# keep only observations that are not (denoted by *) missing
features[~np.isnan(features).any(axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3]])

In [17]:
import pandas as pd
df = pd.DataFrame(features, columns=["feature_1", "feature_2"])
df.dropna()

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3


## Imputing Missing Values

In [23]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
from sklearn.impute import SimpleImputer

# make fake data
features, _ = make_blobs(n_samples = 1000,
                         n_features = 2,
                         random_state = 1)

# standardize the features
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

# replace the first feature's first value with a missing value
true_value = standardized_features[0, 0]
standardized_features[0,0] = np.nan

# create imputer
mean_imputer =	SimpleImputer(strategy="mean")

# impute values
feautres_mean_imputed =	mean_imputer.fit_transform(features)

# compa re t rue and imputed values
print("True Value: {}".format(true_value))
print("Imputed Value: {}".format(feautres_mean_imputed[0, 0]))

True Value: 0.8730186113995938
Imputed Value: -3.058372724614996
