    Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

def scale_data(data):
    """
    Standardize numerical data using scikit-learn's StandardScaler.

    Non-numeric data is skipped and returned unchanged (as a copy, so the
    caller's object is never aliased by the result).

    Args:
        data (pd.DataFrame or pd.Series or np.ndarray): Input data to scale.
            Anything that is not a Series or DataFrame is passed to
            StandardScaler as-is.

    Returns:
        Same type as input:
            * pd.Series -- scaled values with the original index and name;
              a non-numeric Series is returned as an unmodified copy.
            * pd.DataFrame -- numeric columns scaled, non-numeric columns
              untouched, original column order and index preserved.
            * otherwise -- whatever StandardScaler.fit_transform returns.

    Raises:
        ValueError: If StandardScaler fails on non-pandas input; the
            original exception is chained as the cause.
    """
    if isinstance(data, pd.Series):
        if not pd.api.types.is_numeric_dtype(data):
            print("Warning: Series is non-numeric. Returning original data.")
            # Copy for consistency with the DataFrame branch, which also
            # hands back a copy rather than the caller's own object.
            return data.copy()

        scaler = StandardScaler()
        # StandardScaler expects 2-D input, so treat the Series as one column.
        scaled_values = scaler.fit_transform(data.to_numpy().reshape(-1, 1))
        # Carry over the name as well as the index so the result is a
        # drop-in replacement for the input Series.
        return pd.Series(scaled_values.ravel(), index=data.index, name=data.name)

    if isinstance(data, pd.DataFrame):
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        non_numeric_cols = data.columns.difference(numeric_cols)

        if len(numeric_cols) == 0:
            print("Warning: No numeric columns to scale. Returning original DataFrame.")
            return data.copy()

        scaler = StandardScaler()
        scaled_numeric = scaler.fit_transform(data[numeric_cols])
        scaled_df = pd.DataFrame(scaled_numeric, columns=numeric_cols, index=data.index)

        if len(non_numeric_cols) == 0:
            return scaled_df

        # Concatenate non-numeric columns back unchanged, then restore the
        # original column order (concat puts the scaled columns first).
        non_numeric_df = data[non_numeric_cols].copy()
        result = pd.concat([scaled_df, non_numeric_df], axis=1)
        return result[data.columns]

    # Assume numpy array: scale all (numeric) values
    scaler = StandardScaler()
    try:
        return scaler.fit_transform(data)
    except Exception as e:
        # Chain the original error so the real cause stays in the traceback.
        raise ValueError(f"Error scaling numpy array: {e}") from e

In [4]:
import pytest
import numpy as np
import pandas as pd
from sklearn.exceptions import NotFittedError

# Import your functions here, e.g.
# from your_module import scale_data, impute_and_scale

def test_scale_data_numeric_df():
    """All-numeric DataFrame keeps its type and shape; each column mean is ~0."""
    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    out = scale_data(frame)
    assert isinstance(out, pd.DataFrame)
    assert out.shape == frame.shape
    # Every column should be centred after scaling.
    column_means = out.mean()
    assert (abs(column_means) < 1e-6).all()

def test_scale_data_mixed_df():
    """Mixed DataFrame: the numeric column changes, the string column does not."""
    frame = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
    out = scale_data(frame)
    assert 'b' in out.columns
    # Numeric column must differ from the raw values once scaled...
    numeric_unchanged = np.allclose(out['a'], frame['a'])
    assert not numeric_unchanged
    # ...while the non-numeric column is passed through as-is.
    strings_match = out['b'] == frame['b']
    assert strings_match.all()

def test_scale_data_numeric_series():
    """Numeric Series comes back as a Series whose mean is ~0."""
    out = scale_data(pd.Series([10, 20, 30]))
    assert isinstance(out, pd.Series)
    mean_magnitude = abs(out.mean())
    assert mean_magnitude < 1e-6

def test_scale_data_non_numeric_series_warns():
    """Non-numeric Series is returned with its values intact and a warning is printed."""
    # Local imports keep this test self-contained within the cell.
    import contextlib
    import io

    s = pd.Series(['a', 'b', 'c'])
    captured = io.StringIO()
    # scale_data reports the skip via print(), so capture stdout to verify
    # that the warning this test is named after is actually emitted.
    with contextlib.redirect_stdout(captured):
        result = scale_data(s)
    assert (result == s).all()  # Returned original
    assert "Warning" in captured.getvalue()

def test_scale_data_empty_df():
    """An empty DataFrame is scaled to an empty result."""
    out = scale_data(pd.DataFrame())
    assert out.empty

def test_impute_and_scale_numeric_df():
    """Numeric DataFrame with gaps: result has no NaNs and column means of ~0."""
    frame = pd.DataFrame({'a': [1, np.nan, 3], 'b': [4, 5, np.nan]})
    out = impute_and_scale(frame)
    assert isinstance(out, pd.DataFrame)
    # No missing values may survive imputation.
    has_missing = out.isnull().any().any()
    assert not has_missing
    column_means = out.mean()
    assert (abs(column_means) < 1e-6).all()

def test_impute_and_scale_mixed_df():
    """Mixed DataFrame: numeric column is kept, string column is unchanged."""
    frame = pd.DataFrame({'a': [1, np.nan, 3], 'b': ['x', 'y', 'z']})
    out = impute_and_scale(frame)
    # The numeric column must still be present in the output.
    assert 'a' in out.columns
    # The non-numeric column must be passed through untouched.
    strings_match = out['b'] == frame['b']
    assert strings_match.all()

def test_impute_and_scale_numeric_series():
    """Numeric Series with a gap: result is a NaN-free Series with mean ~0."""
    out = impute_and_scale(pd.Series([np.nan, 20, 30]))
    assert isinstance(out, pd.Series)
    has_missing = out.isnull().any()
    assert not has_missing
    mean_magnitude = abs(out.mean())
    assert mean_magnitude < 1e-6

def test_impute_and_scale_non_numeric_series_raises():
    """A Series of strings is expected to be rejected with TypeError."""
    text_series = pd.Series(['a', 'b', 'c'])
    expect_type_error = pytest.raises(TypeError)
    with expect_type_error:
        impute_and_scale(text_series)

def test_impute_and_scale_empty_df_raises():
    """An empty DataFrame is expected to be rejected with ValueError."""
    empty_frame = pd.DataFrame()
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        impute_and_scale(empty_frame)

def test_impute_and_scale_no_numeric_cols_raises():
    """A DataFrame with only string columns is expected to raise ValueError."""
    text_only = pd.DataFrame({'a': ['x', 'y', 'z']})
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        impute_and_scale(text_only)

def test_scale_data_numpy_array():
    """A 2-D ndarray is scaled to an ndarray of the same shape."""
    matrix = np.array([[1, 2], [3, 4], [5, 6]])
    out = scale_data(matrix)
    assert isinstance(out, np.ndarray)
    expected_shape = matrix.shape
    assert out.shape == expected_shape

def test_impute_and_scale_numpy_array():
    """A 2-D ndarray containing NaN yields a NaN-free ndarray."""
    matrix = np.array([[1, 2], [np.nan, 4], [5, 6]])
    out = impute_and_scale(matrix)
    assert isinstance(out, np.ndarray)
    nan_mask = np.isnan(out)
    assert not nan_mask.any()







ModuleNotFoundError: No module named 'pytest'