In [None]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder

# Without ColumnTransformer (manual encoding)

In [None]:
# Read the employee CSV into a DataFrame and preview its first rows.
DATA_PATH = '/content/Employers_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Employee_ID,Name,Age,Gender,Department,Job_Title,Experience_Years,Education_Level,Location,Salary
0,1,Merle Ingram,24,Female,Engineering,Engineer,1,Master,Austin,90000
1,2,John Mayes,56,Male,Sales,Executive,33,Master,Seattle,195000
2,3,Carlos Wille,21,Male,Engineering,Intern,1,Bachelor,New York,35000
3,4,Michael Bryant,30,Male,Finance,Analyst,9,Bachelor,New York,75000
4,5,Paula Douglas,25,Female,HR,Analyst,2,Master,Seattle,70000


In [None]:
# Removing Employee_ID and Name Columns
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
df = df.drop(columns=['Employee_ID', 'Name'], errors='ignore')

In [None]:
# Display the frame to confirm Employee_ID and Name are no longer present.
df

Unnamed: 0,Age,Gender,Department,Job_Title,Experience_Years,Education_Level,Location,Salary
0,24,Female,Engineering,Engineer,1,Master,Austin,90000
1,56,Male,Sales,Executive,33,Master,Seattle,195000
2,21,Male,Engineering,Intern,1,Bachelor,New York,35000
3,30,Male,Finance,Analyst,9,Bachelor,New York,75000
4,25,Female,HR,Analyst,2,Master,Seattle,70000
...,...,...,...,...,...,...,...,...
9995,51,Male,Sales,Executive,28,Master,Austin,185000
9996,36,Female,Sales,Executive,9,PhD,San Francisco,165000
9997,57,Male,Sales,Executive,30,PhD,New York,200000
9998,37,Female,Engineering,Manager,14,Master,New York,135000


In [None]:
# Performing train_test_split
# Features and target are selected by column name rather than by position, so
# the split cannot silently pick up the wrong columns if the frame's layout
# differs from what a positional slice would assume.
feature_cols = ['Age', 'Gender', 'Department', 'Job_Title',
                'Experience_Years', 'Education_Level', 'Location']
target_col = 'Salary'

# 80/20 split; random_state is fixed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols], df[target_col], test_size=0.2, random_state=45
)

In [None]:
# Inspect the training features produced by the split.
X_train

Unnamed: 0,Age,Gender,Department,Job_Title,Experience_Years,Education_Level,Location
4643,25,Female,Finance,Analyst,4,Bachelor,Seattle
2390,48,Male,Engineering,Manager,25,Master,Austin
6162,25,Male,Finance,Analyst,2,Master,Seattle
8678,46,Male,Engineering,Manager,19,PhD,Seattle
6944,43,Female,Marketing,Manager,20,Master,Seattle
...,...,...,...,...,...,...,...
8772,30,Female,HR,Analyst,9,Bachelor,Seattle
163,27,Male,Finance,Analyst,4,Master,Seattle
6012,36,Male,Marketing,Manager,9,PhD,Chicago
6558,26,Female,Marketing,Analyst,3,Master,Seattle


In [27]:
# Ordinal-encode Job_Title and Education_Level, one encoder per column.
# Each encoder is fitted on the training split only and then reused, unchanged,
# on the test split.
# NOTE(review): the Job_Title order ranks 'Executive' below 'Manager' -- confirm
# this is the intended ranking.
job_title_order = ['Intern','Analyst','Engineer','Executive','Manager']
education_order = ['Bachelor', 'Master', 'PhD']


def _column_2d(frame, column):
    """Return frame[column] as an (n_rows, 1) array, the 2-D shape handed to the encoder."""
    return frame[column].values.reshape(frame.shape[0], 1)


oe_job_title = OrdinalEncoder(categories=[job_title_order])
X_train_job_oe = oe_job_title.fit_transform(_column_2d(X_train, 'Job_Title'))
X_test_job_oe = oe_job_title.transform(_column_2d(X_test, 'Job_Title'))

oe_education_level = OrdinalEncoder(categories=[education_order])
X_train_education_oe = oe_education_level.fit_transform(_column_2d(X_train, 'Education_Level'))
X_test_education_oe = oe_education_level.transform(_column_2d(X_test, 'Education_Level'))

In [29]:
# One-hot encode the nominal columns Gender, Department and Location.
# The encoder is fitted on the training split and reused on the test split.
# drop='first' omits the first category of each column; handle_unknown='ignore'
# lets transform() accept categories that were not seen during fit.
# NOTE(review): per the scikit-learn docs, an unseen category is then encoded as
# all zeros, which is the same encoding as the dropped category -- confirm this
# is acceptable.
nominal_cols = ['Gender', 'Department', 'Location']
ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
X_train_ohe = ohe.fit_transform(X_train[nominal_cols])
X_test_ohe = ohe.transform(X_test[nominal_cols])

In [31]:
# Assemble the final feature matrices: the two numeric columns are kept as-is,
# followed by the ordinal-encoded and then the one-hot-encoded columns.
# Both results are bound to names; previously the training matrix was computed
# and immediately discarded because only a cell's last expression is shown.
numeric_cols = ['Age', 'Experience_Years']

X_train_transformed = np.concatenate(
    (X_train.loc[:, numeric_cols], X_train_job_oe, X_train_education_oe, X_train_ohe),
    axis=1,
)
X_test_transformed = np.concatenate(
    (X_test.loc[:, numeric_cols], X_test_job_oe, X_test_education_oe, X_test_ohe),
    axis=1,
)

# Show the test matrix, as the cell did before.
X_test_transformed

array([[49., 22.,  4., ...,  0.,  1.,  0.],
       [24.,  0.,  0., ...,  1.,  0.,  0.],
       [29.,  8.,  1., ...,  0.,  0.,  1.],
       ...,
       [28.,  7.,  1., ...,  0.,  0.,  0.],
       [49., 22.,  4., ...,  0.,  0.,  1.],
       [29.,  8.,  1., ...,  0.,  1.,  0.]])

# With ColumnTransformer

In [32]:
# Declare the whole preprocessing in one ColumnTransformer: two ordinal
# encoders with explicit category orders and one one-hot encoder.
# NOTE(review): the Job_Title order ranks 'Executive' below 'Manager' -- confirm
# this is the intended ranking.
job_title_levels = ['Intern','Analyst','Engineer','Executive','Manager']
education_levels = ['Bachelor', 'Master', 'PhD']
one_hot_columns = ['Gender', 'Department','Location']

ct = ColumnTransformer(
    transformers=[
        ('ordinal_job', OrdinalEncoder(categories=[job_title_levels]), ['Job_Title']),
        ('ordinal_education', OrdinalEncoder(categories=[education_levels]), ['Education_Level']),
        ('ohe', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'), one_hot_columns),
    ],
    # Columns not named above are passed through without modification.
    remainder='passthrough',
)

In [33]:
# Fit the transformer on the training split and return the transformed matrix.
# The encoded columns come first; the passthrough columns are appended last.
ct.fit_transform(X_train)

array([[ 1.,  0.,  0., ...,  1., 25.,  4.],
       [ 4.,  1.,  1., ...,  0., 48., 25.],
       [ 1.,  1.,  1., ...,  1., 25.,  2.],
       ...,
       [ 4.,  2.,  1., ...,  0., 36.,  9.],
       [ 1.,  1.,  0., ...,  1., 26.,  3.],
       [ 1.,  0.,  0., ...,  0., 30.,  9.]])

In [34]:
# Apply the already-fitted transformer to the test split (no refitting here).
ct.transform(X_test)

array([[ 4.,  2.,  1., ...,  0., 49., 22.],
       [ 0.,  0.,  0., ...,  0., 24.,  0.],
       [ 1.,  0.,  0., ...,  1., 29.,  8.],
       ...,
       [ 1.,  0.,  0., ...,  0., 28.,  7.],
       [ 4.,  2.,  0., ...,  1., 49., 22.],
       [ 1.,  0.,  1., ...,  0., 29.,  8.]])