In [3]:
# 4.1 Rescaling a Feature
import numpy as np
from sklearn import preprocessing

# create a feature
feature = np.array([[-500.5],
                   [-100.1],
                   [0],
                   [100.1],
                   [900.9]])

# create a scaler
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))

# Scale feature
scaled_feature = minmax_scale.fit_transform(feature)

# show feature
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [4]:
# 4.2 Standardizing a Feature
#create a feature
x = np.array([[-1000.1],
              [-200.2],
              [500.5],
              [600.6],
              [9000.9]])
# create a scaler
scaler = preprocessing.StandardScaler()

# transform the feature
standardized = scaler.fit_transform(x)

standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [5]:
# print mean and standard deviation
print("Mean: ", round(standardized.mean()))
print("Standard Deviation: ", round(standardized.std()))

Mean:  0
Standard Deviation:  1


In [6]:
# create a scaler
robust_scaler = preprocessing.RobustScaler()
robust_scaler.fit_transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

In [7]:
# 4.3 Normalizing Observations
from sklearn.preprocessing import Normalizer

# create feature matrix
features = np.array([[0.5, 0.5],
                    [1.1, 3.4],
                    [1.5, 20.2],
                    [1.63, 34.4],
                    [10.9, 3.3]])

normalizer = Normalizer(norm="l2")

# transform feature matrix
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [8]:
# transform feature matrix
features_l2_norm = Normalizer(norm="l2").transform(features)

features_l2_norm

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [9]:
# transform feature matrix
features_l1_norm = Normalizer(norm="l1").transform(features)
features_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [10]:
# print sum
print("Sum of the first observation\'s values:",
      features_l1_norm[0,0] + features_l1_norm[0,1])

Sum of the first observation's values: 1.0


In [11]:
# 4.4 Generating Polynomial and Interaction Features
from sklearn.preprocessing import PolynomialFeatures
# create feature matrix
features = np.array([[2,3],
                     [2,3],
                     [2,3]])

# create polynomial features object
polynomial_interaction = PolynomialFeatures(degree=2,include_bias=False)

polynomial_interaction.fit_transform(features)


array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [12]:
# ignore square of value , only values and their interactions
interaction = PolynomialFeatures(degree=2 ,
                interaction_only=True , include_bias=False)

interaction.fit_transform(features)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

In [13]:
# 4.5 Transforming Features

from sklearn.preprocessing import FunctionTransformer

features = np.array([[2,3],
                     [2,3],
                     [2,3]])

# define a simple function
def add_ten(x:int) -> int:
    return x+10

# create a transformer
ten_transformer = FunctionTransformer(add_ten)

# transform feature matrix
ten_transformer.transform(features)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [14]:
import pandas as pd

df = pd.DataFrame(features,columns=['feature1','feature2'])

# apply function
df.apply(add_ten)

Unnamed: 0,feature1,feature2
0,12,13
1,12,13
2,12,13


In [15]:
# 4.6 Detecting Outliers
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

# create simulated data
features,_ = make_blobs(n_samples=10,
                        n_features=2,
                        centers=1,
                        random_state=1)

features[0,0] = 10000
features[0,1] = 10000

# create detector
# contamination between 0.1 and 0.5
outlier_detector = EllipticEnvelope(contamination = 0.1)

# fit detector
outlier_detector.fit(features)

# predict outliers
outlier_detector.predict(features)

# -1 outliers and 1 inliers

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [16]:
outlier_detector = EllipticEnvelope(contamination = 0.5)

outlier_detector.fit(features)
outlier_detector.predict(features)

array([-1, -1,  1,  1, -1, -1,  1,  1, -1,  1])

In [17]:
# create one feature
feature = features[:,0]

# create a function to return index of outliers
def index_of_outliers(x:int) -> np.array(int):
    q1, q3  = np.percentile(x,[25,75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr*1.5)
    return np.where((x > upper_bound) | (x < lower_bound))

# run function
index_of_outliers(feature)

(array([0]),)

In [18]:
# find index of outliers on whole 2d dataset
index_of_outliers(features)

(array([0, 0]), array([0, 1]))

In [19]:
# 4.7 Handling Outliers
import pandas as pd

# create  dataframe
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]

# filter observation
houses[houses['Bathrooms'] < 20]


Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [20]:
# create feature based on boolean condition
houses['Outlier'] = np.where(houses["Bathrooms"] <20, 0 ,1)

# show data
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [21]:
# log feature
houses["Log_Of_Square_Feet"] = [np.log(x) for x in houses["Square_Feet"]]
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_Of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


In [22]:
# 4.8 Discretizating Features
from sklearn.preprocessing import Binarizer

# create feature
age = np.array([[6],
                [12],
                [20],
                [36],
                [65]])
# create binarizer
binarizer = Binarizer(threshold=18)

#transform feature
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [23]:
# bin feature
np.digitize(age , bins=[20,30,64])

array([[0],
       [0],
       [1],
       [2],
       [3]])

In [24]:
# bin feature
np.digitize(age , bins=[20,30,64], right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]])

In [26]:
np.digitize(age, bins=[18])

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [44]:
# 4.9 Grouping Observations Using Clustering
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Make simulated feature matrix
features, _ = make_blobs(n_samples = 50,
                        n_features = 2,
                        centers = 3,
                        random_state = 1)

dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])

# make k means clusterer
clusterer = KMeans(3,random_state=0)

clusterer.fit(features)

# predict values
dataframe["group"]=clusterer.predict(features)

dataframe.head(5)

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


In [36]:
# 4.10 Deleting Observations with Missing Values
# Create feature matrix
features = np.array([[1.1, 11.1],
                    [2.2, 22.2],
                    [3.3, 33.3],
                    [4.4, 44.4],
                    [np.nan, 55]])

# Keep only observations that are not (denoted by ~) missing
features[~np.isnan(features).any(axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [37]:
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])

# remove observations with missing values
dataframe.dropna()

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


In [47]:
# 4.11 Imputing Missing Values
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

# Make a simulated feature matrix
features, _ = make_blobs(n_samples = 1000,
                        n_features = 2,
                        random_state = 1)

# standardize the feature
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

# Replace the first feature's first value with a missing value
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan

# predict the missing values in the feature matrix
knn_imputer = KNNImputer(n_neighbors=5)
features_knn_imputed  = knn_imputer.fit_transform(standardized_features)

# compare true and imputes values
print("True Value:", true_value)
print("Imputed Value:", features_knn_imputed[0,0])


True Value: 0.8730186113995938
Imputed Value: 1.0959262913919632


In [None]:
# Load libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

# Make a simulated feature matrix
features, _ = make_blobs(n_samples = 1000,
                        n_features = 2,
                        random_state = 1)
# Standardize the features
scaler = StandardScaler()

standardized_features = scaler.fit_transform(features)

# Replace the first feature's first value with a missing value
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan

# Create imputer using the "mean" strategy
mean_imputer = SimpleImputer(strategy="mean")

# Impute values
features_mean_imputed = mean_imputer.fit_transform(features)

# Compare true and imputed values
print("True Value:", true_value)
print("Imputed Value:", features_mean_imputed[0,0])

True Value: 0.8730186113995938
Imputed Value: -3.058372724614996
