# Practicing Categorical Column Encoding

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder

In [2]:
# Load the employee dataset from the CSV next to this notebook and preview
# the first five rows to see which columns are categorical.
df = pd.read_csv(filepath_or_buffer='Employers_data.csv')
df.head(n=5)

Unnamed: 0,Employee_ID,Name,Age,Gender,Department,Job_Title,Experience_Years,Education_Level,Location,Salary
0,1,Merle Ingram,24,Female,Engineering,Engineer,1,Master,Austin,90000
1,2,John Mayes,56,Male,Sales,Executive,33,Master,Seattle,195000
2,3,Carlos Wille,21,Male,Engineering,Intern,1,Bachelor,New York,35000
3,4,Michael Bryant,30,Male,Finance,Analyst,9,Bachelor,New York,75000
4,5,Paula Douglas,25,Female,HR,Analyst,2,Master,Seattle,70000


In [3]:
# Drop the 'Name' column; none of the encoding steps in this notebook use it.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after 'Name' is already gone is a no-op
# instead of a KeyError.
df = df.drop(columns=['Name'], errors='ignore')

In [4]:
# Count missing values per column before encoding (isna is the same check
# as isnull under its primary name).
df.isna().sum()

Unnamed: 0,0
Employee_ID,0
Age,0
Gender,0
Department,0
Job_Title,0
Experience_Years,0
Education_Level,0
Location,0
Salary,0


# Ordinal Encoding

In [5]:
from sklearn.model_selection import train_test_split

# Split into train/test sets (80/20).
# - Features and target are selected by column name instead of by position,
#   so the split stays correct if columns are added or reordered.
# - random_state is fixed so the same rows land in each split on every run,
#   which makes the outputs of the cells that follow reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Salary']),  # every column except the target
    df['Salary'],                 # target
    test_size=0.2,
    random_state=42,
)

In [17]:
# Ordinal-encode the two ranked categorical columns. Each encoder receives an
# explicit category list, so the integer codes follow that order (0, 1, 2, ...).
# The per-column results are kept in their own variables and joined at the
# end; assigning both straight to X_train_oe / X_test_oe would let the second
# encoding overwrite the first and silently drop Job_Title from the features.

# Applying Ordinal Encoding on job_title
# NOTE(review): this order ranks 'Manager' above 'Executive' — confirm that
# matches the intended seniority ordering.
oe_job_title = OrdinalEncoder(categories=[['Intern','Analyst','Engineer','Executive','Manager']])
job_title_train = oe_job_title.fit_transform(X_train[['Job_Title']])
job_title_test = oe_job_title.transform(X_test[['Job_Title']])

# Applying Ordinal Encoding on education_level column
oe_education_level = OrdinalEncoder(categories=[['Bachelor', 'Master', 'PhD']])
education_train = oe_education_level.fit_transform(X_train[['Education_Level']])
education_test = oe_education_level.transform(X_test[['Education_Level']])

# Two-column result per split: [Job_Title code, Education_Level code].
X_train_oe = np.concatenate((job_title_train, education_train), axis=1)
X_test_oe = np.concatenate((job_title_test, education_test), axis=1)

In [15]:
# List the distinct education levels in the training split, to check them
# against the category list given to the ordinal encoder.
X_train.loc[:, 'Education_Level'].unique()

array(['Bachelor', 'Master', 'PhD'], dtype=object)

# One Hot Encoding

In [19]:
# One-hot encode the remaining categorical columns.
# - sparse_output=False returns a dense NumPy array.
# - drop='first' omits the first category of each column from the output.
# - handle_unknown='ignore' encodes a category unseen during fit as all
#   zeros; combined with drop='first' that is the same encoding as the
#   dropped category, so the two cannot be told apart after transform.
# The encoder is fitted on the training split only and reused on the test split.
one_hot_columns = ['Gender','Department','Location']
ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
X_train_new = ohe.fit_transform(X_train[one_hot_columns])
X_test_new = ohe.transform(X_test[one_hot_columns])

In [22]:
# Assemble the final feature matrices by placing, side by side, the numeric
# columns, the ordinal-encoded block and the one-hot-encoded block.
# The results are bound to names so later cells can use them; calling
# np.concatenate without assignment discards the train matrix completely and
# leaves the test matrix available only as this cell's displayed output.
# NOTE(review): Employee_ID looks like a row identifier rather than a
# predictive feature — confirm it should be part of the model input.
numeric_columns = ['Employee_ID','Age','Experience_Years']
X_train_encoded = np.concatenate((X_train.loc[:, numeric_columns], X_train_oe, X_train_new), axis=1)
X_test_encoded = np.concatenate((X_test.loc[:, numeric_columns], X_test_oe, X_test_new), axis=1)

# Show the encoded test matrix (NumPy abbreviates large arrays when displayed).
X_test_encoded

array([[5.068e+03, 2.800e+01, 7.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [7.246e+03, 3.700e+01, 1.000e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.258e+03, 4.700e+01, 2.000e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [8.210e+03, 6.000e+01, 3.700e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.258e+03, 3.100e+01, 8.000e+00, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [2.429e+03, 2.500e+01, 4.000e+00, ..., 1.000e+00, 0.000e+00,
        0.000e+00]])

### `LabelEncoder` works similarly, but it is meant only for the output (target) column `y`, not for input features.