In [50]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer, MultiLabelBinarizer
import numpy as np
import pandas as pd

In [13]:
encoder = OneHotEncoder(handle_unknown='ignore')

In [14]:
majors = [
    ['Engineering'],
    ['Math'],
    ['Chemistry']
]

In [15]:
encoder.fit(majors)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True)

In [16]:
encoder.transform(majors).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [17]:
encoder.categories_

[array(['Chemistry', 'Engineering', 'Math'], dtype=object)]

In [18]:
new_majors = [
    ['Media Studies'],
    ['Math'],
    ['Stats']
]

In [19]:
encoder.transform(new_majors).toarray()

array([[0., 0., 0.],
       [0., 0., 1.],
       [0., 0., 0.]])

In [20]:
students = pd.DataFrame({
    'StudentID' : [1,2,3,4,5,6],
    'Department' : ['Engineering', 'Science', 'Art', 'Engineering', 'Business','BA'],
    'Nationality' : ['Ghana', 'Nigeria', 'Nigeria', 'Nigeria', 'Egypt', 'Togo'],
    'Batch' : [2010, 2001, 2004, 2010, 2010, 2001]
})

In [21]:
students

Unnamed: 0,StudentID,Department,Nationality,Batch
0,1,Engineering,Ghana,2010
1,2,Science,Nigeria,2001
2,3,Art,Nigeria,2004
3,4,Engineering,Nigeria,2010
4,5,Business,Egypt,2010
5,6,BA,Togo,2001


In [23]:
deptEncoder = OneHotEncoder()
deptTransformed = deptEncoder.fit_transform(students[['Department']])

In [24]:
deptTransformed.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [26]:
deptEncoder.categories_

[array(['Art', 'BA', 'Business', 'Engineering', 'Science'], dtype=object)]

In [28]:
dept_df = pd.DataFrame(deptTransformed.toarray(), columns = deptEncoder.categories_, dtype=np.int)

In [29]:
dept_df

Unnamed: 0,Art,BA,Business,Engineering,Science
0,0,0,0,1,0
1,0,0,0,0,1
2,1,0,0,0,0
3,0,0,0,1,0
4,0,0,1,0,0
5,0,1,0,0,0


In [30]:
 students = pd.concat([students, dept_df], axis=1)

In [31]:
students

Unnamed: 0,StudentID,Department,Nationality,Batch,"(Art,)","(BA,)","(Business,)","(Engineering,)","(Science,)"
0,1,Engineering,Ghana,2010,0,0,0,1,0
1,2,Science,Nigeria,2001,0,0,0,0,1
2,3,Art,Nigeria,2004,1,0,0,0,0
3,4,Engineering,Nigeria,2010,0,0,0,1,0
4,5,Business,Egypt,2010,0,0,1,0,0
5,6,BA,Togo,2001,0,1,0,0,0


In [33]:
num_encoder = LabelEncoder()

In [34]:
num_encoder.fit([50,20,60,60])

LabelEncoder()

In [36]:
num_encoder.classes_

array([20, 50, 60])

In [37]:
# label encoder on multiple columns
encoder = LabelEncoder()
students_encoded = students.apply(encoder.fit_transform)

In [38]:
students_encoded

Unnamed: 0,StudentID,Department,Nationality,Batch,"(Art,)","(BA,)","(Business,)","(Engineering,)","(Science,)"
0,0,3,1,2,0,0,0,1,0
1,1,4,2,0,0,0,0,0,1
2,2,0,2,1,1,0,0,0,0
3,3,3,2,2,0,0,0,1,0
4,4,2,0,2,0,0,1,0,0
5,5,1,3,0,0,1,0,0,0


In [41]:
# Label binarizer
num_binarizer = LabelBinarizer() # converts multi class to binary class

In [42]:
num_binarizer.fit([2, 5, 6, 4, 5]) 

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [43]:
num_binarizer.classes_

array([2, 4, 5, 6])

In [44]:
num_binarizer.transform([2,5,6,4,5])

array([[1, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 1, 0]])

In [45]:
num_binarizer.transform([2,6])

array([[1, 0, 0, 0],
       [0, 0, 0, 1]])

In [46]:
temp_binarizer = LabelBinarizer(neg_label=-1,
                               pos_label=1,
                               sparse_output=False)

In [47]:
temperature = [
    'cold',
    'cold',
    'warm',
    'hot',
    'hot'
]

In [49]:
temp_binarizer.fit_transform(temperature)

array([[ 1, -1, -1],
       [ 1, -1, -1],
       [-1, -1,  1],
       [-1,  1, -1],
       [-1,  1, -1]])

In [51]:
multilabel_binarizer = MultiLabelBinarizer()

In [56]:
courses = [
    ('Math', 'English'),
    ('Math', 'Science'),
    ('Geography', 'History'),
    ('Statistics', 'Science')
]

In [57]:
multilabel_binarizer.fit(courses)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [58]:
multilabel_binarizer.classes_

array(['English', 'Geography', 'History', 'Math', 'Science', 'Statistics'],
      dtype=object)

In [59]:
multilabel_binarizer.transform(courses)

array([[1, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 1, 0],
       [0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 1]])

In [60]:
new_courses = [
    ('Math', 'Statistics'),
    ('Geography', 'History', 'Math')
]

In [61]:
multilabel_binarizer.transform(new_courses)

array([[0, 0, 0, 1, 0, 1],
       [0, 1, 1, 1, 0, 0]])