In [169]:
import pandas as pd

# Toy roster, one row per person; Drago's age is missing (None).
data = {
    "Name": ["Harry", "Hermioni", "Ron", "Drago", "Jinny", "Luna"],
    "Age": [25, 33, 22, None, 33, 42],
    "Gender": ["M", "F", "M", "M", "F", "F"],
    "Job": [
        "Machine Learning Engineer",
        "DevOps",
        "Software Developmen",
        "Data Analysis",
        "Data Science",
        "AI/ML Engineer",
    ],
}

df = pd.DataFrame(data)

In [170]:
# Display the raw frame before any preprocessing.
df

Unnamed: 0,Name,Age,Gender,Job
0,Harry,25.0,M,Machine Learning Engineer
1,Hermioni,33.0,F,DevOps
2,Ron,22.0,M,Software Developmen
3,Drago,,M,Data Analysis
4,Jinny,33.0,F,Data Science
5,Luna,42.0,F,AI/ML Engineer


Preprocessing Pipeline:

* Drop the Name feature
* Impute missing Ages with the column mean
* Turn Gender into a binary numeric feature (M → 0, F → 1)
* One-hot encode Jobs

In [171]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Drop Name Feature
df = df.drop(columns=["Name"])

# Impute Ages: fill missing values with the mean of the observed ages
imputer = SimpleImputer(strategy="mean")
df["Age"] = imputer.fit_transform(df[["Age"]])

# Numeric Gender (lookup is case-insensitive: the value is lowercased first)
gender_lis = {"m": 0, "f": 1}
df["Gender"] = [gender_lis[g.lower()] for g in df["Gender"]]

# OneHotEncode Jobs: dense 0/1 matrix with one column per distinct job
encrypt = OneHotEncoder()
matrix = encrypt.fit_transform(df[["Job"]]).toarray()

# Ask the fitted encoder for the generated column names ("Job_<value>")
column_names = encrypt.get_feature_names_out(["Job"])

# Attach each one-hot column to the dataframe under its generated name
for col_name, col_values in zip(column_names, matrix.T):
    df[col_name] = col_values

# Drop the original 'Job' column now that it is encoded
df = df.drop(columns=["Job"])

In [172]:
# Display the frame after the manual preprocessing steps.
df

Unnamed: 0,Age,Gender,Job_AI/ML Engineer,Job_Data Analysis,Job_Data Science,Job_DevOps,Job_Machine Learning Engineer,Job_Software Developmen
0,25.0,0,0.0,0.0,0.0,0.0,1.0,0.0
1,33.0,1,0.0,0.0,0.0,1.0,0.0,0.0
2,22.0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,31.0,0,0.0,1.0,0.0,0.0,0.0,0.0
4,33.0,1,0.0,0.0,1.0,0.0,0.0,0.0
5,42.0,1,1.0,0.0,0.0,0.0,0.0,0.0


In [173]:
from sklearn.base import BaseEstimator, TransformerMixin

class NameDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the ``Name`` column."""

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` without its ``Name`` column."""
        return X.drop(columns=["Name"])
    
class AgeImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in the ``Age`` column with the mean age learned in ``fit``.

    The mean is computed once, on the data passed to ``fit``, and reused by
    every later ``transform`` call, so new data is imputed with the training
    statistic instead of its own mean. ``fit`` (or ``fit_transform``) must be
    called before ``transform``.
    """

    def fit(self, X, y=None):
        """Learn the mean of ``X['Age']``, ignoring missing values.

        Parameters
        ----------
        X : DataFrame with an ``Age`` column.
        y : ignored; accepted for pipeline compatibility.

        Returns
        -------
        self
        """
        self.imputer_ = SimpleImputer(strategy="mean").fit(X[["Age"]])
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose missing ``Age`` values are filled in.

        The caller's frame is not modified.
        """
        # Work on a copy so a frame displayed in an earlier cell does not
        # silently change underneath the reader.
        X = X.copy()
        X["Age"] = self.imputer_.transform(X[["Age"]])
        return X

class FeatureEncrypt(BaseEstimator, TransformerMixin):
    """Encode the categorical columns: ``Gender`` -> 0/1 and ``Job`` -> one-hot.

    The set of job categories is learned in ``fit`` and reused by every
    ``transform`` call, so the output always has the same ``Job_*`` columns
    in the same order regardless of which jobs appear in a later batch.
    ``fit`` (or ``fit_transform``) must be called before ``transform``.
    """

    def fit(self, X, y=None):
        """Learn the distinct values of ``X['Job']``.

        Parameters
        ----------
        X : DataFrame with a ``Job`` column.
        y : ignored; accepted for pipeline compatibility.

        Returns
        -------
        self
        """
        # handle_unknown="ignore": a job that was not seen during fit is
        # encoded as an all-zero row instead of raising at transform time.
        self.encoder_ = OneHotEncoder(handle_unknown="ignore").fit(X[["Job"]])
        return self

    def transform(self, X):
        """Return an encoded copy of ``X``; the caller's frame is not modified.

        ``Gender`` is mapped case-insensitively ("M"/"m" -> 0, "F"/"f" -> 1).
        ``Job`` is replaced by one ``Job_<value>`` column per category learned
        in ``fit``.

        Raises
        ------
        KeyError
            If a ``Gender`` value is anything other than M/F (any case).
        """
        gender_lis = {"m": 0, "f": 1}

        X = X.copy()
        # str() keeps the failure mode a KeyError even for non-string values
        # such as NaN, which have no .lower() method.
        X["Gender"] = [gender_lis[str(g).lower()] for g in X["Gender"]]

        matrix = self.encoder_.transform(X[["Job"]]).toarray()
        column_names = self.encoder_.get_feature_names_out(["Job"])

        # matrix.T yields one row per generated column, aligned with column_names.
        for name, values in zip(column_names, matrix.T):
            X[name] = values

        return X.drop(["Job"], axis=1)


In [174]:
# Second toy roster used to exercise the custom transformers;
# Chopper's age is missing (None).
data = {
    "Name": ["Luffy", "Zolo", "Nami", "Sanji", "Chopper"],
    "Age": [19, 21, 20, 21, None],
    "Gender": ["M", "M", "F", "M", "M"],
    "Job": [
        "Something Adventurous",
        "Swords",
        "Navigator",
        "Cook",
        "Doctor",
    ],
}

df2 = pd.DataFrame(data)
df2

Unnamed: 0,Name,Age,Gender,Job
0,Luffy,19.0,M,Something Adventurous
1,Zolo,21.0,M,Swords
2,Nami,20.0,F,Navigator
3,Sanji,21.0,M,Cook
4,Chopper,,M,Doctor


In [175]:
# Try the first transformer on its own: the result has no Name column.
dropper = NameDropper()
dropper.fit_transform(df2)

Unnamed: 0,Age,Gender,Job
0,19.0,M,Something Adventurous
1,21.0,M,Swords
2,20.0,F,Navigator
3,21.0,M,Cook
4,,M,Doctor


In [176]:
# Chain the three transformers by hand: drop Name -> impute Age -> encode.
dropper = NameDropper()
imp = AgeImputer()
enc = FeatureEncrypt()

without_name = dropper.fit_transform(df2)
ages_filled = imp.fit_transform(without_name)
enc.fit_transform(ages_filled)

Unnamed: 0,Age,Gender,Job_Cook,Job_Doctor,Job_Navigator,Job_Something Adventurous,Job_Swords
0,19.0,0,0.0,0.0,0.0,1.0,0.0
1,21.0,0,0.0,0.0,0.0,0.0,1.0
2,20.0,1,0.0,0.0,1.0,0.0,0.0
3,21.0,0,1.0,0.0,0.0,0.0,0.0
4,20.25,0,0.0,1.0,0.0,0.0,0.0


In [177]:
from sklearn.pipeline import Pipeline

# Steps must be an ordered *list* of (name, transformer) pairs. A set literal
# ({...}) has no guaranteed iteration order, so the steps could run in any
# order. Here the order matters: drop Name -> impute Age -> encode.
pipe = Pipeline([
    ("dropper", NameDropper()),
    ("imputer", AgeImputer()),
    ("encrypt", FeatureEncrypt()),
])

pipe.fit_transform(df2)

Unnamed: 0,Age,Gender,Job_Cook,Job_Doctor,Job_Navigator,Job_Something Adventurous,Job_Swords
0,19.0,0,0.0,0.0,0.0,1.0,0.0
1,21.0,0,0.0,0.0,0.0,0.0,1.0
2,20.0,1,0.0,0.0,1.0,0.0,0.0
3,21.0,0,1.0,0.0,0.0,0.0,0.0
4,20.25,0,0.0,1.0,0.0,0.0,0.0
