In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Step 2: Create sample data with missing values
# Every missing entry is written as np.nan. A None placed in a string column
# can be kept as a literal None by pandas (object dtype), and scikit-learn's
# SimpleImputer only replaces values matching its `missing_values` setting
# (np.nan by default), so None cells would not be imputed downstream.
data = {
    'age': [25, 30, np.nan, 45, 50],
    'salary': [50000, 60000, 55000, np.nan, 80000],
    'gender': ['male', 'female', 'female', 'male', np.nan],
    'department': ['HR', 'Finance', 'IT', np.nan, 'IT'],
    'target': [1, 0, 1, 0, 1],
}
df = pd.DataFrame(data)

# Step 3: Split features and target
X = df.drop(columns="target")
y = df["target"]

# Step 4: Identify column types
# These lists route each column to the matching preprocessing branch.
numeric_features = ['age', 'salary']
categorical_features = ['gender', 'department']

# Step 5: Define preprocessing pipelines
# Numeric columns: replace NaN with the column mean, then standardize.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: replace NaN with the most frequent value, then one-hot
# encode. handle_unknown='ignore' encodes categories that were not seen during
# fit as all zeros instead of raising at predict time.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Step 6: Combine into ColumnTransformer
# Each branch is applied only to the columns named in its feature list.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Step 7: Create full pipeline with classifier
# Keeping preprocessing inside the pipeline means imputation statistics,
# scaling parameters and categories are learned from the training split only.
clf_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Step 8: Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 9: Train the pipeline
clf_pipeline.fit(X_train, y_train)

# Step 10: Evaluate on test data
y_pred = clf_pipeline.predict(X_test)
print("Classification Report:\n")
# With the five-row sample frame, test_size=0.2 leaves a single test row, so a
# label can have no true or no predicted samples. zero_division=0 reports 0.0
# for those undefined precision/recall values instead of emitting
# UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))
