    Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [1]:
# Write your code from here
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Step 1: Build a small sample dataset that contains missing values
data = {
    'age': [25, np.nan, 47, 51, 62],
    'income': [50000, 64000, np.nan, 110000, 150000],
    'gender': ['M', 'F', 'F', 'M', 'M'],  # categorical column; excluded by the dtype filter below
}
df = pd.DataFrame(data)

# Step 2: Keep only the numerical columns, since mean imputation and
# scaling are not defined for the string-valued 'gender' column
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Step 3: Define a pipeline that imputes first and scales second, so the
# scaler never sees NaN values
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # replace NaN with the column mean
    ('scaler', StandardScaler()),                 # standardize each column
])

# Step 4: Fit the pipeline on the numerical columns and transform them
processed_data = pipeline.fit_transform(df[numerical_cols])

# Step 5: Wrap the resulting array back into a DataFrame. The original index
# is reused so the processed rows stay aligned with `df` even when `df` does
# not carry a default 0..n-1 index.
processed_df = pd.DataFrame(
    processed_data,
    columns=numerical_cols,
    index=df.index,
)

print(processed_df)


        age    income
0 -1.767461 -1.232636
1  0.000000 -0.835926
2  0.062381  0.000000
3  0.395080  0.467552
4  1.310001  1.601010


In [2]:
# Task: Imputation Function
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

def impute_missing_values(data, strategy='mean', fill_value=None):
    """
    Impute missing values in data using the specified strategy.

    SimpleImputer only accepts 2-D input, so 1-D inputs (a pandas Series or a
    1-D NumPy array) are reshaped into a single column before fitting and
    flattened back to 1-D afterwards.

    Args:
        data (np.ndarray or pd.Series or pd.DataFrame): Input data with missing values.
        strategy (str): Imputation strategy - 'mean', 'median', 'most_frequent', or 'constant'.
        fill_value: Replacement value forwarded to SimpleImputer; it is only
            relevant when strategy is 'constant'. None keeps SimpleImputer's
            own default.

    Returns:
        np.ndarray or pd.Series or pd.DataFrame: Data with missing values imputed.
        A Series comes back as a Series with the same index and name, a
        DataFrame as a DataFrame with the same index and columns, and any
        other input as whatever SimpleImputer.fit_transform returns (a 1-D
        array is returned as a 1-D array).
    """
    imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)

    if isinstance(data, pd.Series):
        # A Series is 1-D; present it to the imputer as one column.
        column = data.to_numpy().reshape(-1, 1)
        imputed = imputer.fit_transform(column)
        return pd.Series(imputed.ravel(), index=data.index, name=data.name)

    if isinstance(data, pd.DataFrame):
        imputed = imputer.fit_transform(data)
        return pd.DataFrame(imputed, columns=data.columns, index=data.index)

    if isinstance(data, np.ndarray) and data.ndim == 1:
        # Same single-column treatment for a bare 1-D array.
        return imputer.fit_transform(data.reshape(-1, 1)).ravel()

    # Anything else (2-D array or other array-like) is handed over unchanged.
    return imputer.fit_transform(data)








# Scaling Function
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

def scale_data(data):
    """
    Scale numerical data using StandardScaler.

    StandardScaler only accepts 2-D input, so 1-D inputs (a pandas Series or
    a 1-D NumPy array) are reshaped into a single column before fitting and
    flattened back to 1-D afterwards.

    Args:
        data (np.ndarray or pd.Series or pd.DataFrame): Numerical data to scale.

    Returns:
        np.ndarray or pd.Series or pd.DataFrame: Scaled data with the same type and shape as input.
        A Series keeps its index and name; a DataFrame keeps its index and
        columns.
    """
    scaler = StandardScaler()

    if isinstance(data, pd.Series):
        # A Series is 1-D; present it to the scaler as one column.
        column = data.to_numpy().reshape(-1, 1)
        scaled = scaler.fit_transform(column)
        return pd.Series(scaled.ravel(), index=data.index, name=data.name)

    if isinstance(data, pd.DataFrame):
        scaled = scaler.fit_transform(data)
        return pd.DataFrame(scaled, columns=data.columns, index=data.index)

    if isinstance(data, np.ndarray) and data.ndim == 1:
        # Same single-column treatment for a bare 1-D array.
        return scaler.fit_transform(data.reshape(-1, 1)).ravel()

    # Anything else (2-D array or other array-like) is handed over unchanged.
    return scaler.fit_transform(data)









# Combined Transformation Function
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

def impute_and_scale(data, impute_strategy='mean', fill_value=None):
    """
    Impute missing values and scale numerical data.

    The work is done by a two-step pipeline: a SimpleImputer followed by a
    StandardScaler. Both steps only accept 2-D input, so 1-D inputs (a pandas
    Series or a 1-D NumPy array) are reshaped into a single column before
    fitting and flattened back to 1-D afterwards.

    Args:
        data (np.ndarray or pd.Series or pd.DataFrame): Numerical data with possible missing values.
        impute_strategy (str): Strategy for SimpleImputer ('mean', 'median', 'most_frequent', 'constant').
        fill_value: Replacement value forwarded to SimpleImputer; it is only
            relevant when impute_strategy is 'constant'. None keeps
            SimpleImputer's own default.

    Returns:
        Same type as input: Data with imputed missing values and scaled.
        A Series keeps its index and name; a DataFrame keeps its index and
        columns.
    """
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy=impute_strategy, fill_value=fill_value)),
        ('scaler', StandardScaler()),
    ])

    if isinstance(data, pd.Series):
        # A Series is 1-D; present it to the pipeline as one column.
        column = data.to_numpy().reshape(-1, 1)
        transformed = pipeline.fit_transform(column)
        return pd.Series(transformed.ravel(), index=data.index, name=data.name)

    if isinstance(data, pd.DataFrame):
        transformed = pipeline.fit_transform(data)
        return pd.DataFrame(transformed, columns=data.columns, index=data.index)

    if isinstance(data, np.ndarray) and data.ndim == 1:
        # Same single-column treatment for a bare 1-D array.
        return pipeline.fit_transform(data.reshape(-1, 1)).ravel()

    # Anything else (2-D array or other array-like) is handed over unchanged.
    return pipeline.fit_transform(data)







