In [3]:
# Chapter 5. Handling Categorical Data
# 5.0 Introduction

# transform the data in a way that properly 
# captures the information in the categories 

In [3]:
# 5.1 Encoding Nominal Categorical Feature

import numpy as np
from sklearn.preprocessing import LabelBinarizer , MultiLabelBinarizer

# create features
feature = np.array([["Texas"],
            ["California"],
            ["Texas"],
            ["Delaware"],
            ["Texas"]])

# create one hot encoder
one_hot = LabelBinarizer()
one_hot.fit_transform(feature)


array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [4]:
# view feature classes
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [5]:
# reverse one hot encoding
one_hot.inverse_transform(one_hot.fit_transform(feature))

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [18]:
import pandas as pd

# create dummy variable from feature
pd.get_dummies(feature[:,0])

Unnamed: 0,California,Delaware,Texas
0,False,False,True
1,True,False,False
2,False,False,True
3,False,True,False
4,False,False,True


In [24]:
# create multiclass feature

multiclass_feature = [("Texas", "Florida"),
                      ("California", "Alabama"),
                      ("Texas", "Florida"),
                      ("Delaware", "Florida"),
                      ("Texas", "Alabama")]

# create multiclass one hot encoder
one_hot_multiclass = MultiLabelBinarizer()
one_hot_multiclass.fit_transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [None]:
one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delaware', 'Florida', 'Texas'],
      dtype=object)

In [33]:
# 5.2 Encoding Ordinal Categorical Features

#  create features
dataframe = pd.DataFrame({"Score" : ["Low","Low" , "Medium" , "Medium" , "High"]})

# create mapper
# the difference between the scale must be 1 -> 3-2=1 , 2-1=1
scale_mapper = {"Low" :1,
                "Medium" : 2,
                "High" : 3 }

# replace feature values with scale
dataframe["Score"].replace(scale_mapper)

  dataframe["Score"].replace(scale_mapper)


0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

In [34]:

#  create features
dataframe2 = pd.DataFrame({"Score" : ["Low","Low" , "Medium" , "Medium" , "High"] ,
                          "point" : ["Low","Low" , "Medium" , "Medium" , "High"]})


# replace feature values with scale
dataframe2[["Score" , "point"]].replace(scale_mapper)

  dataframe2[["Score" , "point"]].replace(scale_mapper)


Unnamed: 0,Score,point
0,1,1
1,1,1
2,2,2
3,2,2
4,3,3


In [40]:
dataframe = pd.DataFrame({"Score": ["Low",
                                    "Low",
                                    "Medium",
                                    "Medium",
                                    "High",
                                    "Barely More Than Medium"]})
scale_mapper = {"Low":1,
                "Medium":2,
                "Barely More Than Medium":3,
                "High":4}

dataframe["Score"].replace(scale_mapper)

  dataframe["Score"].replace(scale_mapper)


0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

In [None]:
scale_mapper = {"Low":1,
                "Medium":2,
                "Barely More Than Medium":2.1, # better to be  conscious about the numerical values mapped to classes
                "High":3}

dataframe["Score"].replace(scale_mapper)

  dataframe["Score"].replace(scale_mapper)


0    1.0
1    1.0
2    2.0
3    2.0
4    3.0
5    2.1
Name: Score, dtype: float64

In [51]:
# 5.3 Encoding Dictionaries of Features
from sklearn.feature_extraction import DictVectorizer

# create a dictionary
# a list of dictionaries
data_dict = [{"Red": 2, "Blue": 4},
             {"Red": 4, "Blue": 3},
             {"Red": 1, "Yellow": 2},
             {"Red": 2, "Yellow": 2}]

# create a dictionary vectorize
dictvectorize = DictVectorizer(sparse=False)

# convert dictionary to feature matrix
features = dictvectorize.fit_transform(data_dict)

features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [50]:
# get feature names
feature_names = dictvectorize.get_feature_names_out()
feature_names

array(['Blue', 'Red', 'Yellow'], dtype=object)

In [52]:
# create a dataframe from features

pd.DataFrame(features , columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [53]:
# Create word count dictionaries for four documents
doc_1_word_count = {"Red": 2, "Blue": 4}
doc_2_word_count = {"Red": 4, "Blue": 3}
doc_3_word_count = {"Red": 1, "Yellow": 2}
doc_4_word_count = {"Red": 2, "Yellow": 2}

# Create list
doc_word_counts = [doc_1_word_count,
                   doc_2_word_count,
                   doc_3_word_count,
                   doc_4_word_count]

dictvectorize.fit_transform(doc_word_counts)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [None]:
# 5.4 Imputing Missing Class Values
from sklearn.neighbors import KNeighborsClassifier

# create a feature matrix with categorical feature
X = np.array([[0, 2.10, 1.45],
              [1, 1.18, 1.33],
              [0, 1.22, 1.27],
              [1, -0.21, -1.19]])

# create a feature matrix with missing values in the categorical feature
X_with_nan = np.array([[np.nan, 0.87, 1.31],
                       [np.nan, -0.67, -0.22]])        

# train KNN learner
clf = KNeighborsClassifier(3, weights = 'distance')  
# X[: ,1:] feature data to train ,   X[: ,0] target
trained_model = clf.fit(X[: ,1:] , X[: ,0])

# predict class of missing values
imputed_values = trained_model.predict(X_with_nan[: , 1:])

# join column of predicted class with their other
X_with_imputed = np.hstack((imputed_values.reshape(-1,1) , X_with_nan[: , 1:]))

# join tow feature matrix
np.vstack((X_with_imputed, X))

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [62]:
from sklearn.impute import SimpleImputer

# join the two feature matrix
X_complete = np.vstack((X_with_nan , X))

imputer = SimpleImputer(strategy='most_frequent')

imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [66]:
# 5.5 Handling Imbalanced Classes

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# load iris data
iris = load_iris()

# create feature matrix
features = iris.data

# create target vector
target = iris.target

# remove first 40 observations

features = features[40: , :]
target = target[40:]

# create binary target vector indicating if class 0
target = np.where((target == 0) , 0, 1)

target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [69]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [70]:
# create weights
weights = {0: 0.9 , 1: 0.1}

# Create random forest classifier with weights
RandomForestClassifier(class_weight=weights)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [71]:
# Train a random forest with balanced class weights
RandomForestClassifier(class_weight="balanced")

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [78]:
# Indicies of each class's observations
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]
                                
# Number of observations in each class
n_class0 = len(i_class0)
n_class1 = len(i_class1)

In [None]:
# down sample
# For every observation of class 0, 
# randomly sample  from class 1 without replacement

i_class1_downsampled = np.random.choice(i_class1 , size = n_class0 ,replace=False)

# join 0 class and downsampled 1 class

np.hstack((target[i_class0] , target[i_class1_downsampled]))

# join class 0 and downsampled class1 feature matrix 
np.vstack((features[i_class0 , :] , features[i_class1_downsampled, :]))[0:5]

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4]])

In [88]:
# up sample
# For every observation in class 1, randomly sample from class 0 with replacement
i_class1_upsampled = np.random.choice(i_class0 , size = n_class1 , replace=True)

# join together
np.concatenate((target[i_class1_upsampled] , target[i_class1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [89]:
i_class1_upsampled

array([3, 8, 5, 4, 3, 1, 2, 9, 3, 9, 7, 8, 1, 3, 6, 7, 3, 5, 1, 3, 7, 3,
       6, 5, 2, 3, 7, 6, 6, 1, 9, 9, 2, 3, 7, 9, 5, 7, 9, 4, 9, 6, 7, 8,
       8, 7, 6, 1, 0, 0, 7, 7, 7, 9, 9, 1, 7, 6, 0, 4, 5, 4, 1, 8, 3, 0,
       3, 0, 4, 0, 1, 8, 8, 3, 9, 6, 2, 5, 9, 7, 2, 0, 2, 0, 2, 1, 9, 0,
       8, 1, 8, 1, 3, 4, 0, 9, 0, 9, 0, 1])

In [90]:
# join features matrix
np.vstack((features[i_class1_upsampled, : ] , features[i_class1, : ]))[0:5]

array([[5. , 3.5, 1.6, 0.6],
       [5.3, 3.7, 1.5, 0.2],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.9, 0.4],
       [5. , 3.5, 1.6, 0.6]])