In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier

d1 = {'age':[25, 30, np.nan, 40, 45, np.nan, 50, 55, 60, 65, np.nan, 70],
      'gender':['M', np.nan, 'M', 'F', 'M', np.nan, 'M', np.nan, 'M', 'F', np.nan, 'F'],
      'income':[50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000, 160000]}


In [64]:
df = pd.DataFrame(d1)
df

Unnamed: 0,age,gender,income
0,25.0,M,50000
1,30.0,,60000
2,,M,70000
3,40.0,F,80000
4,45.0,M,90000
5,,,100000
6,50.0,M,110000
7,55.0,,120000
8,60.0,M,130000
9,65.0,F,140000


In [65]:
X = df.iloc[:, 0:2]  # Select all rows and the first two columns (age and  gender) 
y = df.iloc[:, 2]    # Select all rows and the third column (income)
X # Fill NaN values in X with the mean of each column

Unnamed: 0,age,gender
0,25.0,M
1,30.0,
2,,M
3,40.0,F
4,45.0,M
5,,
6,50.0,M
7,55.0,
8,60.0,M
9,65.0,F


In [66]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19)

In [67]:
X_train.shape

(9, 2)

In [68]:
numerical_features = ['age']


In [69]:
categorical_features = ['gender']

In [70]:
num_pipeline = Pipeline(steps= [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [71]:
cat_pipeline = Pipeline(steps= [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


In [75]:
col_transformer = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_features),
        ('cat_pipeline', cat_pipeline, categorical_features)
    ],
    remainder='drop',
    n_jobs=-1
)

In [76]:
dtc = DecisionTreeClassifier(random_state=42)
pipeline = make_pipeline(col_transformer, dtc)
pipeline.fit(X_train, y_train)

In [74]:
pipeline.score(X_train, y_train), pipeline.score(X_test, y_test)

(0.7777777777777778, 0.0)