<a href="https://colab.research.google.com/github/JamshedAli18/Data-Preprocessing-Scikit-learn/blob/main/ColumnTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so any stochastic step run after this cell is repeatable.
np.random.seed(42)

# Toy survey data, written one respondent per row so each record reads across.
# Field order matches `_columns`; np.nan marks a missing age.
_columns = ('education_level', 'employment_status', 'income', 'age')
_rows = [
    ('high school', 'employed',   40000, 25),
    ('bachelor',    'unemployed', 50000, 35),
    ('master',      'employed',   60000, 45),
    ('bachelor',    'student',    15000, 22),
    ('bachelor',    'employed',   52000, 32),
    ('high school', 'unemployed', 30000, np.nan),
    ('phd',         'employed',   70000, 50),
    ('master',      'student',    60000, 27),
    ('high school', 'unemployed', 35000, 40),
    ('phd',         'employed',   75000, np.nan),
    ('bachelor',    'student',    48000, 29),
    ('bachelor',    'employed',   55000, 38),
]

# Pivot the rows into the column-oriented dict of lists that pd.DataFrame consumes.
data = {column: list(values) for column, values in zip(_columns, zip(*_rows))}

df = pd.DataFrame(data)

In [5]:
# Preview the first five rows to confirm the column layout before encoding.
df.head(n=5)

Unnamed: 0,education_level,employment_status,income,age
0,high school,employed,40000,25.0
1,bachelor,unemployed,50000,35.0
2,master,employed,60000,45.0
3,bachelor,student,15000,22.0
4,bachelor,employed,52000,32.0


In [16]:
# Frequency of each education level (an ordered category: high school < bachelor < master < phd).
education_level_counts = df['education_level'].value_counts()
education_level_counts

education_level
bachelor       5
high school    3
master         2
phd            2
Name: count, dtype: int64

In [24]:
# Frequency of each employment status (a nominal category with no natural order).
employment_status_counts = df['employment_status'].value_counts()
employment_status_counts

employment_status
employed      6
unemployed    3
student       3
Name: count, dtype: int64

In [8]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [6]:
from sklearn.compose import ColumnTransformer

In [17]:
# Column-wise preprocessing. Each entry is (step name, transformer, columns it applies to).
# Columns are selected by name rather than position so the pipeline keeps working if the
# DataFrame's column order changes.
transformer = ColumnTransformer(
    transformers=[
        # Ordered category -> integer rank following the listed order
        # (high school=0, bachelor=1, master=2, phd=3).
        ('trf1', OrdinalEncoder(categories=[['high school', 'bachelor', 'master', 'phd']]), ['education_level']),
        # Nominal category -> dense dummy columns; drop='first' omits the first category's
        # column to avoid a redundant (perfectly collinear) feature.
        # `sparse_output` replaces the `sparse` keyword, which was deprecated in
        # scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
        ('trf2', OneHotEncoder(sparse_output=False, drop='first'), ['employment_status']),
        # Fill missing ages with the column mean (SimpleImputer's default, made explicit).
        ('trf3', SimpleImputer(strategy='mean'), ['age']),
    ],
    # Columns not named above (income) are passed through unchanged and appended last.
    remainder='passthrough',
)

In [18]:
# Fit every step on df and transform it in one call.
# Despite the name, the result is a 2-D NumPy array, not a DataFrame.
encoded_df = transformer.fit_transform(df)



In [19]:
# Display the transformed array. Columns, left to right: education rank,
# employment=student flag, employment=unemployed flag, imputed age, income.
encoded_df

array([[0.00e+00, 0.00e+00, 0.00e+00, 2.50e+01, 4.00e+04],
       [1.00e+00, 0.00e+00, 1.00e+00, 3.50e+01, 5.00e+04],
       [2.00e+00, 0.00e+00, 0.00e+00, 4.50e+01, 6.00e+04],
       [1.00e+00, 1.00e+00, 0.00e+00, 2.20e+01, 1.50e+04],
       [1.00e+00, 0.00e+00, 0.00e+00, 3.20e+01, 5.20e+04],
       [0.00e+00, 0.00e+00, 1.00e+00, 3.43e+01, 3.00e+04],
       [3.00e+00, 0.00e+00, 0.00e+00, 5.00e+01, 7.00e+04],
       [2.00e+00, 1.00e+00, 0.00e+00, 2.70e+01, 6.00e+04],
       [0.00e+00, 0.00e+00, 1.00e+00, 4.00e+01, 3.50e+04],
       [3.00e+00, 0.00e+00, 0.00e+00, 3.43e+01, 7.50e+04],
       [1.00e+00, 1.00e+00, 0.00e+00, 2.90e+01, 4.80e+04],
       [1.00e+00, 0.00e+00, 0.00e+00, 3.80e+01, 5.50e+04]])

In [21]:

# Names of the output columns, in the same order as the columns of the transformed array.
# Each name is prefixed with the step that produced it (trf1__, trf2__, trf3__, remainder__).
feature_names = transformer.get_feature_names_out()
feature_names


array(['trf1__education_level', 'trf2__employment_status_student',
       'trf2__employment_status_unemployed', 'trf3__age',
       'remainder__income'], dtype=object)

In [22]:
# Re-attach column labels to the bare array so the encoded features are self-describing.
final_encoded_df = pd.DataFrame(
    data=encoded_df,
    columns=feature_names,
)

In [23]:
# Display the labelled result. All values render as floats because the frame
# was built from a single numeric array.
final_encoded_df

Unnamed: 0,trf1__education_level,trf2__employment_status_student,trf2__employment_status_unemployed,trf3__age,remainder__income
0,0.0,0.0,0.0,25.0,40000.0
1,1.0,0.0,1.0,35.0,50000.0
2,2.0,0.0,0.0,45.0,60000.0
3,1.0,1.0,0.0,22.0,15000.0
4,1.0,0.0,0.0,32.0,52000.0
5,0.0,0.0,1.0,34.3,30000.0
6,3.0,0.0,0.0,50.0,70000.0
7,2.0,1.0,0.0,27.0,60000.0
8,0.0,0.0,1.0,40.0,35000.0
9,3.0,0.0,0.0,34.3,75000.0


In [25]:
# Sanity check: 12 rows and 5 columns (education rank, two one-hot employment flags, age, income).
final_encoded_df.shape

(12, 5)