In [19]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 2: Create sample dataset
# Five rows, with one missing value in each numeric column so that the
# imputation step below has something to fill in.
data = {
    'age': [25, 30, 45, np.nan, 50],
    'salary': [50000, 60000, 80000, 40000, np.nan],
    'gender': ['male', 'female', 'female', 'male', 'female'],
    'department': ['HR', 'Finance', 'IT', 'Finance', 'IT'],
    'target': [1, 0, 1, 0, 1],
}
df = pd.DataFrame(data)

# Step 3: Define feature and target columns
X = df.drop('target', axis=1)
y = df['target']

# Step 4: Identify column types
numeric_features = ['age', 'salary']
categorical_features = ['gender', 'department']

# Step 5: Create preprocessing pipelines

# Numeric: fill missing values with the column mean, then standardize.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' makes the encoder emit an all-zero
# row for a category it did not see during fit instead of raising.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Step 6: Combine pipelines using ColumnTransformer
# Output columns follow the transformer order: numeric first, then one-hot.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Step 7: Apply transformations
# NOTE(review): the preprocessor is fitted on the full dataset *before* the
# train/test split in Step 9, so the imputation means and scaling statistics
# include the held-out rows. That is acceptable for this demonstration, but
# for real model evaluation split first, then fit on the training rows only
# and call transform() on the test rows.
X_preprocessed = preprocessor.fit_transform(X)

# Step 8: Display final transformed features
# Convert back to DataFrame for easier understanding.
# The numeric columns keep their names; the one-hot columns take the names
# generated by the fitted encoder (e.g. 'gender_female').
fitted_onehot = preprocessor.named_transformers_['cat']['onehot']
feature_names = (
    numeric_features +
    list(fitted_onehot.get_feature_names_out(categorical_features))
)

# ColumnTransformer may return a sparse matrix; densify it before building
# the DataFrame.
if hasattr(X_preprocessed, "toarray"):
    dense_features = X_preprocessed.toarray()
else:
    dense_features = X_preprocessed

# Reuse X's index so every feature row keeps the same label as its target in
# y; without it X_final would get a fresh RangeIndex and only line up with y
# when df happens to use the default index.
X_final = pd.DataFrame(dense_features, columns=feature_names, index=X.index)

# Step 9: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

# Step 10: Show outputs
print("Processed Features (Train):")
print(X_train)
print("\nTarget (Train):")
print(y_train)


Processed Features (Train):
        age    salary  gender_female  gender_male  department_Finance  \
4  1.355815  0.000000            1.0          0.0                 0.0   
2  0.813489  1.700840            1.0          0.0                 0.0   
0 -1.355815 -0.566947            0.0          1.0                 0.0   
3  0.000000 -1.322876            0.0          1.0                 1.0   

   department_HR  department_IT  
4            0.0            1.0  
2            0.0            1.0  
0            1.0            0.0  
3            0.0            0.0  

Target (Train):
4    1
2    1
0    1
3    0
Name: target, dtype: int64
