    Task: Complete Pipeline for a Dataset
1. Objective: Build a preprocessing pipeline that chains multiple transformations (imputation followed by scaling).
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

def scale_data(data):
    """
    Standardize numerical data with StandardScaler.

    Each numeric column is scaled independently by a freshly fitted
    StandardScaler. Non-numeric data is never modified.

    Args:
        data (pd.DataFrame or pd.Series or array-like): Input data to scale.
            A 1-D array-like is treated as a single feature column.

    Returns:
        pd.Series: for Series input, the scaled values with the original
            index and name. A non-numeric Series is returned as-is (the same
            object) after printing a warning.
        pd.DataFrame: for DataFrame input, numeric columns scaled and
            non-numeric columns untouched, in the original column order.
            If there are no numeric columns a copy of the input is returned
            after printing a warning.
        np.ndarray: for any other input, the scaled array (1-D when the
            input was 1-D).

    Raises:
        ValueError: If a non-pandas input cannot be scaled; the underlying
            error is chained as the cause.
    """
    if isinstance(data, pd.Series):
        if not pd.api.types.is_numeric_dtype(data):
            print("Warning: Series is non-numeric. Returning original data.")
            return data
        # StandardScaler only accepts 2-D input, so present the Series as a
        # single column and flatten the result again afterwards.
        scaled_values = StandardScaler().fit_transform(data.values.reshape(-1, 1))
        # Carry over the name as well as the index so the result is a drop-in
        # replacement for the input Series.
        return pd.Series(scaled_values.ravel(), index=data.index, name=data.name)

    if isinstance(data, pd.DataFrame):
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) == 0:
            print("Warning: No numeric columns to scale. Returning original DataFrame.")
            return data.copy()

        scaled_numeric = StandardScaler().fit_transform(data[numeric_cols])
        scaled_df = pd.DataFrame(scaled_numeric, columns=numeric_cols, index=data.index)

        non_numeric_cols = data.columns.difference(numeric_cols)
        if len(non_numeric_cols) == 0:
            return scaled_df

        # Re-attach the untouched non-numeric columns, then restore the
        # caller's column order (difference() returns a sorted index).
        result = pd.concat([scaled_df, data[non_numeric_cols].copy()], axis=1)
        return result[data.columns]

    # Anything else is handed to scikit-learn as an array-like.
    try:
        is_1d = np.ndim(data) == 1
        # A 1-D input is a single feature; reshape it into one column.
        features = np.asarray(data).reshape(-1, 1) if is_1d else data
        scaled = StandardScaler().fit_transform(features)
    except Exception as e:
        raise ValueError(f"Error scaling numpy array: {e}") from e
    return scaled.ravel() if is_1d else scaled

In [5]:

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def _build_impute_scale_pipeline(impute_strategy):
    """Return an unfitted pipeline: SimpleImputer followed by StandardScaler."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy=impute_strategy)),
        ('scaler', StandardScaler())
    ])


def impute_and_scale(data, impute_strategy='mean'):
    """
    Fill missing values and then standardize numerical data.

    The input is passed through a two-step pipeline (SimpleImputer, then
    StandardScaler) that is fitted on the data itself.

    Args:
        data (pd.DataFrame or pd.Series or array-like): Input data. For a
            DataFrame only the numeric columns are transformed; other columns
            are carried over unchanged. A 1-D array-like is treated as a
            single feature column.
        impute_strategy (str): Strategy forwarded to SimpleImputer.
            Defaults to 'mean'.

    Returns:
        pd.Series: for Series input, with the original index and name.
        pd.DataFrame: for DataFrame input, in the original column order.
        np.ndarray: for any other input (1-D when the input was 1-D).

    Raises:
        TypeError: If a Series is not numeric.
        ValueError: If a DataFrame has no numeric columns, or if a numeric
            Series/column holds only missing values and the strategy is not
            'constant' (there is no statistic to impute from).

    Any exception is printed with an "Error in impute_and_scale" prefix and
    then re-raised unchanged.
    """
    try:
        # Strategies other than 'constant' need at least one observed value
        # per column; the imputer would otherwise drop the column and the
        # result could no longer be aligned with the input labels.
        needs_observed_values = impute_strategy != 'constant'

        if isinstance(data, pd.Series):
            if not pd.api.types.is_numeric_dtype(data):
                raise TypeError("Input Series must be numeric for imputation and scaling.")
            if needs_observed_values and not data.empty and data.isna().all():
                raise ValueError(
                    f"Cannot impute a Series containing only missing values "
                    f"with strategy '{impute_strategy}'."
                )
            pipeline = _build_impute_scale_pipeline(impute_strategy)
            # The pipeline expects 2-D input: present the Series as one column.
            transformed = pipeline.fit_transform(data.values.reshape(-1, 1))
            return pd.Series(transformed.ravel(), index=data.index, name=data.name)

        if isinstance(data, pd.DataFrame):
            numeric_cols = data.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) == 0:
                raise ValueError("No numeric columns to impute and scale.")
            numeric_part = data[numeric_cols]
            if needs_observed_values and not data.empty:
                all_missing = numeric_cols[numeric_part.isna().all(axis=0).to_numpy()]
                if len(all_missing) > 0:
                    raise ValueError(
                        f"Cannot impute columns containing only missing values "
                        f"with strategy '{impute_strategy}': {list(all_missing)}"
                    )

            pipeline = _build_impute_scale_pipeline(impute_strategy)
            transformed = pipeline.fit_transform(numeric_part)
            scaled_df = pd.DataFrame(transformed, columns=numeric_cols, index=data.index)

            non_numeric_cols = data.columns.difference(numeric_cols)
            if len(non_numeric_cols) == 0:
                return scaled_df
            # Re-attach untouched columns and restore the caller's column order.
            result = pd.concat([scaled_df, data[non_numeric_cols]], axis=1)
            return result[data.columns]

        # Anything else is handed to scikit-learn as an array-like.
        is_1d = np.ndim(data) == 1
        features = np.asarray(data).reshape(-1, 1) if is_1d else data
        transformed = _build_impute_scale_pipeline(impute_strategy).fit_transform(features)
        return transformed.ravel() if is_1d else transformed

    except Exception as e:
        print(f"Error in impute_and_scale: {e}")
        raise





