In [None]:
# Label encoding maps each distinct category value to an integer code.
# Codes that merely look numeric (e.g. "01", "02") are still strings,
# so they need the same conversion step.

from sklearn.preprocessing import LabelEncoder

items = ['TV', 'Fridge', 'Microwave', 'PC', 'Fan', 'Fan', 'Blender', 'Blender']

# Create a LabelEncoder, learn the categories with fit(), then encode with transform().
encoder = LabelEncoder()
encoder.fit(items)
labels = encoder.transform(items)

print('Encoding converted values:', labels)
# classes_ lists the learned categories; each category's position is its code.
print('Encoding Class:', encoder.classes_)
# Decode the codes produced above instead of a hand-written list of integers,
# so the round trip really returns the original items in their original order.
print('Decoded original value:', encoder.inverse_transform(labels))


In [None]:
# One-hot encoding gives every category its own 0/1 column, so a model cannot
# read a false ordering into integer category codes and lose prediction accuracy.
# The encoded result comes back as a sparse matrix.

from sklearn.preprocessing import OneHotEncoder
import numpy as np

# The encoder needs 2D input: build a single-column ndarray, one row per item.
items = np.array(
    ['TV', 'Fridge', 'Microwave', 'PC', 'Fan', 'Fan', 'Blender', 'Blender']
).reshape(-1, 1)

# Learn the categories and encode the items in one step.
oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit_transform(items)

# The result is a sparse matrix; toarray() converts it to a dense ndarray for display.
print('OneHotEncoder data:')
print('\n', oh_labels.toarray())
print('\n Data dimension of OneHotEncoder:', oh_labels.shape)


In [None]:
import pandas as pd

# pandas can one-hot encode a string column directly, with no reshaping step.
df = pd.DataFrame(
    ['TV', 'Fridge', 'Microwave', 'PC', 'Fan', 'Fan', 'Blender', 'Blender'],
    columns=['item'],
)

# Last expression of the cell, so the notebook displays the encoded frame.
pd.get_dummies(df)

In [None]:
# Feature scaling: standardization and normalization
# Standardization rescales each feature to mean 0 and variance 1 (the mean and variance of a standard normal distribution).
# In standardization, the i-th value of a feature x becomes: x_i_new = (x_i - mean(x)) / stdev(x)
# Normalization handles features with very different ranges: if feature A ranges from 0 to 100 ft while feature B
# ranges from $0 to $100,000,000, normalization converts both to the range 0 to 1:
# x_i_new = (x_i - min(x)) / (max(x) - min(x))
# However, the Normalizer in scikit-learn instead divides each value by the length of its sample vector:
# x_i_new = x_i / sqrt(x_i^2 + y_i^2 + z_i^2)

In [None]:
# StandardScaler -- first look at the features before scaling

from sklearn.datasets import load_iris
import pandas as pd

# Load the iris data set and wrap its feature matrix in a DataFrame.
iris = load_iris()
iris_data = iris.data
iris_df = pd.DataFrame(iris_data, columns=iris.feature_names)

# Per-feature mean and variance of the unscaled data.
print('Average value of feastures', iris_df.mean(), sep='\n')
print('\nVariance of features', iris_df.var(), sep='\n')

In [None]:
from sklearn.preprocessing import StandardScaler

# Create the StandardScaler and fit() it on the iris features (fit() returns the scaler itself),
# then rescale the same data with transform().
scaler = StandardScaler().fit(iris_df)
iris_scaled = scaler.transform(iris_df)

# transform() returns a NumPy ndarray; put it back into a DataFrame with the
# original column names so the per-feature statistics are easy to read.
iris_df_scaled = pd.DataFrame(iris_scaled, columns=iris.feature_names)

# NOTE: DataFrame.var() defaults to the sample variance (ddof=1), whereas StandardScaler
# scales by the population standard deviation, so the variances print slightly above 1.
print('Average value of feastures', iris_df_scaled.mean(), sep='\n')
print('\nVariance of features', iris_df_scaled.var(), sep='\n')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Create the MinMaxScaler, learn the data with fit(), then rescale the same data with transform().
scaler = MinMaxScaler()
scaler.fit(iris_df)
iris_scaled = scaler.transform(iris_df)

# transform() returns a NumPy ndarray; put it back into a DataFrame with the
# original column names.
iris_df_scaled = pd.DataFrame(iris_scaled, columns=iris.feature_names)

# Per-feature minimum and maximum of the rescaled data.
print('Min value of feastures', iris_df_scaled.min(), sep='\n')
print('\nMax of features', iris_df_scaled.max(), sep='\n')

In [None]:
# fit() learns reference information from a data set, such as its min and max.
# transform() converts data based on that reference information.
# fit_transform() combines the two.
#
# Important: once fit() and transform() have been run on the training data set,
# do not call fit() again on the test data set.

from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Training data is 0 to 10 and test data is 0 to 5.
# The scaler's fit() and transform() take 2D input, so reshape(-1, 1) turns each
# range into a single column.
train_array = np.arange(11).reshape(-1, 1)
test_array = np.arange(6).reshape(-1, 1)

# With no feature_range argument, MinMaxScaler converts values to the range 0 to 1.
scaler = MinMaxScaler()

# After fit(), the learned min and max of train_array are 0 and 10.
scaler.fit(train_array)

# Rescale train_array by 1/10 (original value 10 -> 1).
train_scaled = scaler.transform(train_array)

print('Original tranin_array data:', np.round(train_array.ravel(), 2))
print('Scaled tranin_array data:', np.round(train_scaled.ravel(), 2))


In [None]:
# Anti-pattern demonstration: calling fit() on test_array makes MinMaxScaler
# relearn its min and max as 0 and 5, replacing what it learned from train_array.
scaler.fit(test_array)

# test_array is therefore rescaled by 1/5 (original value 5 -> 1).
test_scaled = scaler.transform(test_array)

# Show test_array before and after scaling.
print('Original test_array data:', np.round(test_array.ravel(), 2))
print('Scaled test_array data:', np.round(test_scaled.ravel(), 2))

# The test data now sits on a different scale from the training data,
# so this result cannot be used for ML.

In [None]:
# Correct approach: fit the scaler on the training data only, then reuse that
# fitted scaler for the test data.
scaler = MinMaxScaler()
scaler.fit(train_array)
train_scaled = scaler.transform(train_array)

print('Original tranin_array data:', np.round(train_array.reshape(-1),2))
print('Scaled tranin_array data:', np.round(train_scaled.reshape(-1),2))

# To scale test_array, do NOT call fit() again; call only transform() so the
# min and max learned from train_array are applied to the test data.
test_scaled = scaler.transform(test_array)
# These two lines print the test data, so they are labelled test_array
# (they were previously mislabelled as the training array).
print('\nOriginal test_array data:', np.round(test_array.reshape(-1),2))
print('Scaled test_array data:', np.round(test_scaled.reshape(-1),2))