# Handling categorical data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy dataset, written column-wise and zipped into rows of
# [price, size, color, class label].
prices = [10.1, 13.5, 15.3, 11.3, 13.2]
sizes = ['M', 'L', 'XL', 'M', 'L']
colors = ['green', 'red', 'blue', 'red', 'blue']
labels = ['class1', 'class2', 'class1', 'class1', 'class2']

# Keep each row a plain list so `data` is still a list of lists.
data = [list(row) for row in zip(prices, sizes, colors, labels)]

In [5]:
# Build the working frame: one numeric feature, one ordinal feature ('size'),
# one nominal feature ('color') and the class label ('y').
column_names = ['price', 'size', 'color', 'y']
df = pd.DataFrame(data=data, columns=column_names)
df

Unnamed: 0,price,size,color,y
0,10.1,M,green,class1
1,13.5,L,red,class2
2,15.3,XL,blue,class1
3,11.3,M,red,class1
4,13.2,L,blue,class2


# Ordinal Encoding

In [6]:
from sklearn.preprocessing import OrdinalEncoder

In [7]:
# Explicit order for the 'size' feature; list position becomes the integer code
# (M -> 0, L -> 1, XL -> 2).
size_order = ['M', 'L', 'XL']
ode = OrdinalEncoder(categories=[size_order], dtype=np.int16)

In [13]:
# Encode 'size' into a new frame so the original df keeps its string labels.
# assign() copies df and then sets the column, same as copy() + item assignment.
df2 = df.assign(size=ode.fit_transform(df[['size']]))
df2

Unnamed: 0,price,size,color,y
0,10.1,0,green,class1
1,13.5,1,red,class2
2,15.3,2,blue,class1
3,11.3,0,red,class1
4,13.2,1,blue,class2


In [10]:
# Categories held by the fitted encoder, one array per encoded column;
# a category's position in the array is the integer code it was mapped to.
ode.categories_

[array(['M', 'L', 'XL'], dtype=object)]

In [14]:
# Map integer codes back to their size labels (1 -> L, 2 -> XL, 0 -> M).
# Input is 2-D: one row per sample, one column per encoded feature.
codes = [[1], [2], [0]]
ode.inverse_transform(codes)

array([['L'],
       ['XL'],
       ['M']], dtype=object)

# OneHot Encoding

In [20]:
from sklearn.preprocessing import OneHotEncoder
# Fix the column order of the one-hot output: green, red, blue.
color_categories = ['green', 'red', 'blue']
one = OneHotEncoder(categories=[color_categories], dtype=np.int32)

In [24]:
# `categories` (no trailing underscore) is the constructor argument: this simply
# echoes the list that was passed in when `one` was created.
one.categories

[['green', 'red', 'blue']]

In [31]:
# One-hot encode the 'color' column. The encoder returns a sparse matrix,
# so convert it to a dense array for display.
color_column = df2[['color']]
re_one = one.fit_transform(color_column)
re_one.toarray()

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]], dtype=int32)

In [47]:
# Wrap the dense one-hot matrix in a DataFrame.
# Column labels come from the *fitted* encoder (`categories_`) rather than the
# constructor argument, so this keeps working if `one` is built with
# categories='auto'. The index is taken from df2 (the frame that was encoded)
# so the later column-wise concat lines up row-for-row.
temp = pd.DataFrame(
    re_one.toarray(),
    columns=one.categories_[0],
    index=df2.index,
)
temp

Unnamed: 0,green,red,blue
0,1,0,0
1,0,1,0
2,0,0,1
3,0,1,0
4,0,0,1


In [48]:
# Append the one-hot columns to the ordinal-encoded frame and drop the original
# string 'color' column. Chained instead of drop(..., inplace=True) so df3 is
# built in a single expression and never mutated after creation.
df3 = pd.concat([df2, temp], axis=1).drop(columns=['color'])
df3

Unnamed: 0,price,size,y,green,red,blue
0,10.1,0,class1,1,0,0
1,13.5,1,class2,0,1,0
2,15.3,2,class1,0,0,1
3,11.3,0,class1,0,1,0
4,13.2,1,class2,0,0,1


# Using get_dummies instead of OneHotEncoder

In [51]:
# pandas shortcut for one-hot encoding a single column.
# Note: the columns come out in alphabetical order (blue, green, red), not the
# green/red/blue order that was given explicitly to OneHotEncoder.
pd.get_dummies(df['color'], dtype=np.int32)

Unnamed: 0,blue,green,red
0,0,1,0
1,0,0,1
2,1,0,0
3,0,0,1
4,1,0,0


# Label encoder

In [57]:
from sklearn.preprocessing import LabelEncoder

In [59]:
# Encoder for the target column 'y'; created without an explicit category list,
# unlike the ordinal and one-hot encoders earlier in the notebook.
lec = LabelEncoder()

In [61]:
# Turn the string class labels into integer codes (class1 -> 0, class2 -> 1).
target = df['y']
lec_re = lec.fit_transform(target)
lec_re

array([0, 1, 0, 0, 1])

In [62]:
# Work on a copy so df3 keeps the original string labels in 'y'.
df4 = df3.copy()

In [63]:
# Replace the string class labels with the integer codes produced by the
# LabelEncoder (class1 -> 0, class2 -> 1).
df4['y'] = lec_re

In [64]:
# Final frame: price, ordinal-encoded size, integer-encoded y and the one-hot
# color columns.
df4

Unnamed: 0,price,size,y,green,red,blue
0,10.1,0,0,1,0,0
1,13.5,1,1,0,1,0
2,15.3,2,0,0,0,1
3,11.3,0,0,0,1,0
4,13.2,1,1,0,0,1
