# Categorical Encoding and Scaling

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Categorical encoding
- One-hot encoding
- Label encoding
- Custom binary encoding

In [None]:
# One-hot encoding (by get_dummies)
# Build a small synthetic sample: a numeric height and a categorical nationality.
n_samples = 10
# NOTE(review): .round() is applied to the standard-normal draw *before* scaling,
# so heights land on 170 plus a multiple of 3 -- confirm this is intended.
height = 170 + np.random.randn(n_samples).round() * 3
# Draw integer codes 0..2 and translate them to country names.
country_by_code = {0: 'Korea', 1: 'Japan', 2: 'China'}
nationality = pd.Series(np.random.randint(0, 3, n_samples)).map(country_by_code)
print(height)
print(nationality)

In [None]:
# Combine the two samples into one frame, one row per person.
df = pd.DataFrame({'height': height, 'nationality': nationality})
df

In [None]:
# Replace 'nationality' with one indicator column per category, named 'nat_<category>'.
new_df = pd.get_dummies(df, columns=['nationality'], prefix='nat')
new_df

In [None]:
# The same encoding using scikit-learn's OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

# The encoder is fed 2-D input, so reshape the single column to (n_samples, 1).
nationality_2d = df['nationality'].values.reshape(-1, 1)
ohe = OneHotEncoder()
encoded = ohe.fit(nationality_2d).transform(nationality_2d)
# Show the encoded result as a regular dense array.
encoded.toarray()

In [None]:
# Learned categories and the generated output column names.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
ohe.categories_, ohe.get_feature_names_out()

In [None]:
# Build 'nat_<category>' column names from the categories learned for the
# first (and only) encoded feature.
col = [f'nat_{category}' for category in ohe.categories_[0]]
col

In [None]:
# Wrap the dense one-hot matrix in a DataFrame labelled with the names in `col`.
encoded_df = pd.DataFrame(encoded.toarray(), columns=col)
encoded_df

In [None]:
# Place the one-hot columns next to the original columns (axis=1 joins column-wise).
new_df = pd.concat([df, encoded_df], axis=1)
new_df

In [None]:
# Remove the raw text column, leaving only 'height' and the one-hot columns.
new_df = new_df.drop(columns=['nationality'])
new_df

In [None]:
# Label Encoding: works on a single column (originally meant for target variables)
df_org = df.copy()  # keep a copy of the frame before encoded columns are added
le = LabelEncoder()
le.fit(df['nationality'])
df['nat_label_encoded'] = le.transform(df['nationality'])
print(le.classes_)
df

In [None]:
# Ordinal Encoding: for converting features (can take many features)
# The explicit `categories` list fixes the code order: Japan -> 0, China -> 1, Korea -> 2.
oe = OrdinalEncoder(categories=[['Japan', 'China', 'Korea']])
# OrdinalEncoder needs 2-D input. Select with a list (df[['nationality']]) instead of
# Series[:, np.newaxis], which pandas 2.0+ rejects; ravel() flattens the (n, 1)
# result so it is stored as a single column.
df['nat_ordinal_encoded'] = oe.fit_transform(df[['nationality']]).ravel()
df

In [None]:
# Category order used by the ordinal encoder.
oe.categories_

In [None]:
# Check the dtype of each column, including the newly added encoded ones.
df.dtypes

# Practical example
- breast_cancer dataset (saved locally as breast-cancer2.csv)
- https://github.com/jbrownlee/Datasets/blob/master/breast-cancer.csv

In [None]:
# When using curl, wrap the URL in double quotes ("") in Jupyter; in Colab, single quotes ('') also work.

In [None]:
# Download the raw CSV and save it locally as breast-cancer2.csv (-o sets the output file).
!curl "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv" -o breast-cancer2.csv

In [None]:
# header=None: the first line of the file is read as data, so columns get integer labels.
df = pd.read_csv('breast-cancer2.csv', header=None)
df.head()

In [None]:
# Give the integer-labelled columns descriptive names.
column_names = [
    'age', 'menopause', 'tumor_size', 'inv_nodes', 'node_caps',
    'deg_malig', 'breast', 'breast_quad', 'irradiat', 'class',
]
df.columns = column_names
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Frequency of each distinct value in the 'age' column.
df['age'].value_counts()

In [None]:
# Remove the 'node_caps' feature from the frame.
df = df.drop(columns=['node_caps'])
df.head()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Discard every row that still contains a missing value.
df = df.dropna()
df.head().T  # transpose so each column of the preview is shown as a row

In [None]:
# Summary statistics again, after dropping 'node_caps' and rows with missing values.
df.describe()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Split into features (all but the last column) and target (last column).
data = df.values
X, y = data[:, :-1], data[:, -1]
# Encode every feature column as ordinal codes and the target as integer labels.
oe = OrdinalEncoder()
oe.fit(X)
X_enc = oe.transform(X)
le = LabelEncoder()
le.fit(y)
y_enc = le.transform(y)
oe.categories_, le.classes_

In [None]:
# Compare the first five rows before and after ordinal encoding.
X[:5], X_enc[:5]

In [None]:
# Hold out 20% of the ordinal-encoded rows for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_enc, y_enc, test_size=0.2, random_state=1
)

# Fit on the training split, then score on the held-out split.
model = LogisticRegression().fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Same experiment, with the features one-hot encoded instead
data = df.values
X, y = data[:, :-1], data[:, -1]
ohe = OneHotEncoder()
X_enc2 = ohe.fit(X).transform(X)
le = LabelEncoder()
y_enc = le.fit(y).transform(y)
print(ohe.categories_)

# Same split settings and model as the ordinal-encoded run.
X_train, X_test, y_train, y_test = train_test_split(
    X_enc2, y_enc, test_size=0.2, random_state=1
)
model = LogisticRegression().fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Dense view of the first five one-hot encoded rows.
X_enc2[:5].toarray()

# Scaling
- Min-max scaling
- Standard scaling
- Robust scaling

In [None]:
# Three normally distributed features with different means and spreads.
normal_params = {'x1': (0, 2), 'x2': (5, 3), 'x3': (-5, 5)}  # name -> (mean, std)
df = pd.DataFrame({
    name: np.random.normal(mean, std, 10000)
    for name, (mean, std) in normal_params.items()
})
df.head()

In [None]:
# Overlay the estimated density of every column on one set of axes.
df.plot.kde()  # kernel density estimate

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Standardize each column, then plot the densities again.
ss = StandardScaler()
data_tf = ss.fit(df).transform(df)     # result is an array, so column labels are re-attached below
df = pd.DataFrame(data_tf, columns=['x1', 'x2', 'x3'])
df.plot(kind='kde')

In [None]:
# Three features with different skewness.
positive_skew = np.random.chisquare(8, 1000)
negative_skew = np.random.beta(8, 2, 1000) * 40
no_skew = np.random.normal(50, 3, 1000)
df = pd.DataFrame({'x1': positive_skew, 'x2': negative_skew, 'x3': no_skew})

# Density plot of the three columns before any scaling.
df.plot.kde()

In [None]:
# Apply each scaler to the same data and keep one scaled frame per scaler.
mm, sc, rb = MinMaxScaler(), StandardScaler(), RobustScaler()
scaled_frames = []
for scaler in (mm, sc, rb):
    data_tf = scaler.fit_transform(df)
    scaled_frames.append(pd.DataFrame(data_tf, columns=['x1', 'x2', 'x3']))
df1, df2, df3 = scaled_frames

# One panel per scaler, left to right: min-max, standard, robust.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 6))
df1.plot(kind='kde', ax=axes[0])
df2.plot(kind='kde', ax=axes[1])
df3.plot(kind='kde', ax=axes[2])

In [None]:
# Two features: 1000 normal draws each, plus a tight cluster of 50 far-away values.
x1_bulk = np.random.normal(10, 10, 1000)
x1_low_outliers = np.random.normal(-90, 1, 50)
x2_bulk = np.random.normal(30, 20, 1000)
x2_high_outliers = np.random.normal(200, 2, 50)
df = pd.DataFrame({
    # Distribution with lower outliers
    'x1': np.concatenate((x1_bulk, x1_low_outliers)),
    # Distribution with higher outliers
    'x2': np.concatenate((x2_bulk, x2_high_outliers)),
})
# Density plot of both columns before any scaling.
df.plot.kde()

In [None]:
# Scale the same data with StandardScaler and RobustScaler.
sc, rb = StandardScaler(), RobustScaler()
scaled_frames = []
for scaler in (sc, rb):
    data_tf = scaler.fit_transform(df)
    scaled_frames.append(pd.DataFrame(data_tf, columns=['x1', 'x2']))
df1, df2 = scaled_frames

# Left panel: standard scaling; right panel: robust scaling.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 3))
df1.plot(kind='kde', ax=axes[0])
df2.plot(kind='kde', ax=axes[1])

# For your reference

In [None]:
# Boxplots of df, df1 and df2 in three panels, left to right.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))
left_ax, middle_ax, right_ax = axes
df.boxplot(ax=left_ax)
df1.boxplot(ax=middle_ax)
df2.boxplot(ax=right_ax)

In [None]:
# Single column of normal draws: boxplot before and after standardization.
df = pd.DataFrame(np.random.normal(10, 10, 1000))
sc = StandardScaler()
data_tf = sc.fit(df).transform(df)
df1 = pd.DataFrame(data_tf)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
raw_ax, scaled_ax = axes
df.boxplot(ax=raw_ax)
df1.boxplot(ax=scaled_ax)