### One-hot encode a column of a simple NumPy array whose data is already label encoded

In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Sample data: every row holds three numeric features followed by a
# label-encoded category (0, 1 or 2) in the last column.
rows = [
    [16.2, 13.8, 47.1, 2],
    [16.7, 15.59, 44.53, 0],
    [15.51, 10.55, 40.54, 1],
    [14.41, 11.85, 38.62, 2],
]
matrix = np.asarray(rows, dtype=np.float64)

matrix

array([[16.2 , 13.8 , 47.1 ,  2.  ],
       [16.7 , 15.59, 44.53,  0.  ],
       [15.51, 10.55, 40.54,  1.  ],
       [14.41, 11.85, 38.62,  2.  ]])

In [2]:
# Pull out the last column (the label-encoded category) as a 2-D column vector.
# sklearn's OneHotEncoder expects a 2-D array; slicing with -1: keeps the
# column axis, so the result has shape (n_rows, 1) without a separate reshape.
last_column = matrix[:, -1:]
last_column

array([[2.],
       [0.],
       [1.],
       [2.]])

In [3]:
# Initialize OneHotEncoder
# sparse_output=False: return the encoding as a dense NumPy array
onehot_encoder = OneHotEncoder(sparse_output=False)

In [4]:
# Learn the distinct categories present in the column, then expand every value
# into its one-hot row (one output column per category).
onehot_encoder.fit(last_column)
encoded_column = onehot_encoder.transform(last_column)
encoded_column

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [5]:
# Replace the label-encoded last column with its one-hot columns:
# keep every original column except the last, then append the encoded
# columns on the right (axis=1 joins column-wise).
feature_columns = matrix[:, :-1]
encoded_matrix = np.concatenate((feature_columns, encoded_column), axis=1)
encoded_matrix

array([[16.2 , 13.8 , 47.1 ,  0.  ,  0.  ,  1.  ],
       [16.7 , 15.59, 44.53,  1.  ,  0.  ,  0.  ],
       [15.51, 10.55, 40.54,  0.  ,  1.  ,  0.  ],
       [14.41, 11.85, 38.62,  0.  ,  0.  ,  1.  ]])

#### Practice: create a NumPy structured array

In [6]:
import numpy as np


In [7]:
# Field layout for the structured array: one (field name, type) pair per column
dtypes = [
    ('ID', int),
    ('Name', 'U10'),    # Unicode string, up to 10 characters
    ('Age', int),
    ('Gender', 'U10'),  # Unicode string, up to 10 characters
]
dtypes

[('ID', int), ('Name', 'U10'), ('Age', int), ('Gender', 'U10')]

In [8]:
# Create a structured array
# np.empty allocates an array of the given shape and data type without initializing its entries,
# so the values displayed here are arbitrary until the fields are assigned

data_array = np.empty(6, dtype=dtypes) # 6 rows (one record per row)
print(type(data_array))
data_array

<class 'numpy.ndarray'>


array([(0, '', 0, ''), (0, '', 0, ''), (0, '', 0, ''), (0, '', 0, ''),
       (0, '', 0, ''), (0, '', 0, '')],
      dtype=[('ID', '<i4'), ('Name', '<U10'), ('Age', '<i4'), ('Gender', '<U10')])

In [9]:
# Fill the structured array one field (column) at a time.
# Each list supplies one value per record, in row order.
field_values = {
    'ID': [1, 2, 3, 4, 5, 6],
    'Name': ['John', 'Jane', 'Jim', 'Mark', 'Mary', 'Mike'],  # 'U10' field: Unicode, up to 10 characters
    'Age': [23, 45, 12, 67, 13, 34],
    'Gender': ['F', 'M', 'M', 'M', 'F', 'M'],
}
for field_name, values in field_values.items():
    data_array[field_name] = values

In [10]:
# Display the structured array after its fields have been populated
data_array

array([(1, 'John', 23, 'F'), (2, 'Jane', 45, 'M'), (3, 'Jim', 12, 'M'),
       (4, 'Mark', 67, 'M'), (5, 'Mary', 13, 'F'), (6, 'Mike', 34, 'M')],
      dtype=[('ID', '<i4'), ('Name', '<U10'), ('Age', '<i4'), ('Gender', '<U10')])

## Now apply OneHotEncoder


In [11]:
# The same records as a plain (non-structured) 2-D array. Mixing ints and
# strings in np.array coerces every element to a string, as the displayed
# dtype shows.
records = [
    (1, 'John', 23, 'F'),
    (2, 'Jane', 45, 'M'),
    (3, 'Jim', 12, 'M'),
    (4, 'Mark', 67, 'M'),
    (5, 'Mary', 13, 'F'),
    (6, 'Mike', 34, 'M'),
]
original_array = np.array(records)
original_array

array([['1', 'John', '23', 'F'],
       ['2', 'Jane', '45', 'M'],
       ['3', 'Jim', '12', 'M'],
       ['4', 'Mark', '67', 'M'],
       ['5', 'Mary', '13', 'F'],
       ['6', 'Mike', '34', 'M']], dtype='<U11')

In [12]:
# Take the Gender column (index 3) as a 2-D column vector; the 3:4 slice keeps
# the column axis, giving shape (n_rows, 1).
gender_column = original_array[:, 3:4]
gender_column

array([['F'],
       ['M'],
       ['M'],
       ['M'],
       ['F'],
       ['M']], dtype='<U11')

In [13]:
# Initialize a fresh OneHotEncoder for the Gender column
# (OneHotEncoder was already imported in the first cell; the import is repeated here)
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(sparse_output=False) # sparse_output=False means return a numpy array


In [14]:
# gender_column is already 2-D with shape (n_rows, 1), so it can be passed to
# the encoder as-is.
# No LabelEncoder step is required: the encoder is fitted directly on the
# string values 'F' and 'M'.
onehot_encoder.fit(gender_column)
gender_encode = onehot_encoder.transform(gender_column)
gender_encode

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.]])

In [15]:
# Append the one-hot columns to the right of the original array (axis=1 joins
# column-wise). The original Gender column is kept alongside its encoding.
# Because original_array holds strings, the float one-hot values end up stored
# as strings too ('1.0' / '0.0').
merged_array = np.concatenate((original_array, gender_encode), axis=1)

# Alternative that drops the original last (Gender) column before appending:
# merged_array = np.hstack((original_array[:, :-1], gender_encode))

merged_array

array([['1', 'John', '23', 'F', '1.0', '0.0'],
       ['2', 'Jane', '45', 'M', '0.0', '1.0'],
       ['3', 'Jim', '12', 'M', '0.0', '1.0'],
       ['4', 'Mark', '67', 'M', '0.0', '1.0'],
       ['5', 'Mary', '13', 'F', '1.0', '0.0'],
       ['6', 'Mike', '34', 'M', '0.0', '1.0']], dtype='<U32')

#### Below is an example of merging two plain arrays, which avoids the custom (structured) dtype issues


In [16]:
import numpy as np

# Plain 2-D array built from mixed int/str tuples. This is NOT a structured
# array: NumPy coerces every element to a string, as the printed dtype shows.
records = [
    (1, 'John', 23, 'F'),
    (2, 'Jane', 45, 'M'),
    (3, 'Jim', 12, 'M'),
    (4, 'Mark', 67, 'M'),
    (5, 'Mary', 13, 'F'),
    (6, 'Mike', 34, 'M'),
]
original_array = np.array(records)
print("Data type:", original_array.dtype)

# One-hot encoding of the Gender column: column 0 flags 'F', column 1 flags 'M'.
# The second column is the complement of the first.
is_female = np.array([1., 0., 0., 0., 1., 0.])
one_hot_encoded = np.column_stack((is_female, 1. - is_female))

# Join the two arrays column-wise (axis=1). The float one-hot values are
# converted to strings to match the dtype of original_array.
merged_array = np.concatenate((original_array, one_hot_encoded), axis=1)

# Show the merged result
print(merged_array)


Data type: <U11
[['1' 'John' '23' 'F' '1.0' '0.0']
 ['2' 'Jane' '45' 'M' '0.0' '1.0']
 ['3' 'Jim' '12' 'M' '0.0' '1.0']
 ['4' 'Mark' '67' 'M' '0.0' '1.0']
 ['5' 'Mary' '13' 'F' '1.0' '0.0']
 ['6' 'Mike' '34' 'M' '0.0' '1.0']]
