In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [None]:
# Small synthetic dataset: two numeric columns, one categorical column and a
# Yes/No target. Some entries are deliberately missing (np.nan).
data = dict(
    age=[25, np.nan, 35, 45, 50],
    salary=[50000, 60000, np.nan, 80000, 85000],
    city=['New York', 'Los Angeles', 'New York', 'San Francisco', np.nan],
    purchased=['No', 'Yes', 'No', 'Yes', 'No'],
)

In [None]:
# Build the DataFrame from the column-oriented dict (one key per column)
df = pd.DataFrame.from_dict(data)

In [None]:
# Target is the 'purchased' column; features are every other column
y = df['purchased']
X = df.drop(columns=['purchased'])

In [None]:
# Column groups used to route each column to its preprocessing branch
categorical_features = ["city"]           # categorical columns
numeric_features = ["age", "salary"]      # numeric columns


In [None]:
# Numeric branch: fill missing values with the column mean, then apply StandardScaler
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

In [None]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode (categories not seen during fit are ignored)
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [None]:
# Route each column group to its own pipeline; the 'num' block comes first
# in the output, followed by the 'cat' block
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [None]:
# Fit the preprocessor on the feature frame, then apply it to the same frame
preprocessor.fit(X)
X_preprocessed = preprocessor.transform(X)


In [None]:
# Display the preprocessed matrix; if it exposes `toarray` (e.g. a sparse
# matrix), convert it to a dense array first so all values are printed
print("Preprocessed Features:")
if hasattr(X_preprocessed, "toarray"):
    print(X_preprocessed.toarray())
else:
    print(X_preprocessed)


Preprocessed Features:
[[-1.6011119  -1.46524625  0.          1.          0.        ]
 [ 0.         -0.68378158  1.          0.          0.        ]
 [-0.43666688  0.          0.          1.          0.        ]
 [ 0.72777814  0.87914775  0.          0.          1.        ]
 [ 1.31000065  1.26988008  0.          1.          0.        ]]


In [None]:
# Splitting the data into training and testing sets.
# The split is performed on the raw features and the preprocessor is then
# fitted on the training rows only, so that imputation values, scaling
# statistics and one-hot categories are never derived from the held-out test
# rows (fitting on the full dataset before splitting leaks test information).
# Note: this refits `preprocessor`; `X_preprocessed` is left untouched.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(X_train_raw)
# Test rows are only transformed with the training-time parameters; a category
# unseen in training is encoded as all zeros (handle_unknown='ignore').
X_test = preprocessor.transform(X_test_raw)


In [None]:
# Show each part of the split: a heading line followed by the values
print("\nTraining Features:", X_train, sep="\n")
print("\nTesting Features:", X_test, sep="\n")
print("\nTraining Target:", y_train, sep="\n")
print("\nTesting Target:", y_test, sep="\n")


Training Features:
[[ 1.31000065  1.26988008  0.          1.          0.        ]
 [-0.43666688  0.          0.          1.          0.        ]
 [-1.6011119  -1.46524625  0.          1.          0.        ]
 [ 0.72777814  0.87914775  0.          0.          1.        ]]

Testing Features:
[[ 0.         -0.68378158  1.          0.          0.        ]]

Training Target:
4     No
2     No
0     No
3    Yes
Name: purchased, dtype: object

Testing Target:
1    Yes
Name: purchased, dtype: object
