    Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [11]:
import pytest
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Test for Imputation
def test_imputation():
    # Sample data with missing values
    data = {'A': [1, 2, np.nan, 4], 'B': [5, np.nan, 7, 8]}
    df = pd.DataFrame(data)
    
    imputer = SimpleImputer(strategy='mean')
    imputed_data = imputer.fit_transform(df)
    
    # Assert that missing values are replaced
    assert not np.any(np.isnan(imputed_data)), "Missing values exist after imputation!"

# Test for Scaling
def test_scaling():
    # Sample data
    data = {'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}
    df = pd.DataFrame(data)
    
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df)
    
    # Assert that the scaled data has mean=0 and std=1 for each column
    assert np.abs(np.mean(scaled_data[:, 0])) < 0.1, "Column A mean is not close to 0"  # Column A
    assert np.abs(np.mean(scaled_data[:, 1])) < 0.1, "Column B mean is not close to 0"  # Column B
    assert np.abs(np.std(scaled_data[:, 0]) - 1) < 0.1, "Column A std is not close to 1"
    assert np.abs(np.std(scaled_data[:, 1]) - 1) < 0.1, "Column B std is not close to 1"


In [12]:
# Task: Imputation Function



def impute_data(df):
    """Fill missing values in ``df`` with the mean of each column.

    NOTE: ``impute_data`` is defined a second time further down in this cell;
    when the cell runs, that later definition replaces this one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is passed to ``SimpleImputer.fit_transform``.

    Returns
    -------
    pandas.DataFrame
        A new frame built from the imputed values, carrying the column labels
        and the row index of ``df``.
    """
    imputer = SimpleImputer(strategy='mean')  # Impute missing values with the column mean
    imputed_data = imputer.fit_transform(df)
    # Pass the index explicitly: without it the result gets a fresh 0..n-1
    # index and no longer lines up with the caller's rows.
    return pd.DataFrame(imputed_data, columns=df.columns, index=df.index)




# Scaling Function -- implemented as scale_data() further down in this cell









# Combined Transformation Function -- implemented as combined_transformation() further down in this cell


import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris

# --- Imputation Function ---
def impute_data(df):
    """Fill missing values in ``df`` with the mean of each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is passed to ``SimpleImputer.fit_transform``.

    Returns
    -------
    pandas.DataFrame
        A new frame built from the imputed values, carrying the column labels
        and the row index of ``df``.
    """
    imputer = SimpleImputer(strategy='mean')  # Impute missing values with the column mean
    imputed_data = imputer.fit_transform(df)
    # Pass the index explicitly: without it the result gets a fresh 0..n-1
    # index and no longer lines up with the caller's rows.
    return pd.DataFrame(imputed_data, columns=df.columns, index=df.index)

# --- Scaling Function ---
def scale_data(df):
    """Standardize every column of ``df`` with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale; it is passed to ``StandardScaler.fit_transform``.

    Returns
    -------
    pandas.DataFrame
        A new frame built from the scaled values, carrying the column labels
        and the row index of ``df``.
    """
    scaler = StandardScaler()  # Standardize features by removing the mean and scaling to unit variance
    scaled_data = scaler.fit_transform(df)
    # Pass the index explicitly: without it the result gets a fresh 0..n-1
    # index and no longer lines up with the caller's rows.
    return pd.DataFrame(scaled_data, columns=df.columns, index=df.index)

# --- Combined Transformation Function ---
def combined_transformation(df):
    """Run the full preprocessing chain on ``df``: impute first, then scale.

    The frame is handed to ``impute_data`` and that result is handed to
    ``scale_data``; whatever ``scale_data`` returns is returned unchanged.
    """
    # Imputation runs before scaling, so the scaler receives impute_data's output.
    return scale_data(impute_data(df))

# --- Example Usage ---
# Build a feature frame from the Iris dataset
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)

# Blank out two cells so the imputation step has something to fill
df.iat[0, 0] = np.nan  # row 0, first column (sepal length)
df.iat[5, 2] = np.nan  # row 5, third column (petal length)

# Show the first rows of the frame, gaps included
print("Original Data with Missing Values:", df.head(), sep="\n")

# Impute, then scale
processed_df = combined_transformation(df)

# Show the first rows of the transformed frame
print("\nProcessed Data (Imputed and Scaled):", processed_df.head(), sep="\n")





# Scaling Function -- implemented above as scale_data()









# Combined Transformation Function -- implemented above as combined_transformation()









Original Data with Missing Values:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                NaN               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

Processed Data (Imputed and Scaled):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           0.000000          1.019004          -1.354309         -1.315444
1          -1.152203         -0.131979          -1.354309         -1.315444
2          -1.395201          0.328414          -1.411410         -1.315444
3          -1.516700          0.098217          -1.297209         -1.315444
4          -1.030704          1.249201          -1.354309         -1.315444
