    Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [1]:
# Write your code from here
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Step 1: Build a small sample dataset that contains missing values (NaN)
data = {
    'age': [25, np.nan, 47, 51, 62],
    'income': [50000, 64000, np.nan, 110000, 150000],
    'gender': ['M', 'F', 'F', 'M', 'M']  # categorical column, not fed to the pipeline
}
df = pd.DataFrame(data)

# Step 2: Select every numeric column.
# 'number' matches all integer/float widths (int32, float32, ...), so numeric
# columns are not silently skipped just because they are not int64/float64.
numerical_cols = df.select_dtypes(include='number').columns

# Step 3: Define a pipeline that imputes first, then scales.
# Order matters: the scaler must only ever see fully populated columns.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # replace NaN with the column mean
    ('scaler', StandardScaler())                   # standardize each numeric feature
])

# Step 4: Fit the pipeline on the numeric columns and transform them in one pass.
# Imputed cells sit exactly on the column mean, so they come out as 0 after scaling.
processed_data = pipeline.fit_transform(df[numerical_cols])

# Step 5: Wrap the transformed values back into a DataFrame.
# Reuse the original column labels AND the original index so each processed row
# stays aligned with its source row in `df`, even when the index is not 0..n-1.
processed_df = pd.DataFrame(processed_data, columns=numerical_cols, index=df.index)

print(processed_df)


        age    income
0 -1.767461 -1.232636
1  0.000000 -0.835926
2  0.062381  0.000000
3  0.395080  0.467552
4  1.310001  1.601010
