In [7]:


import pandas as pd
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Step 1: Extract - Load Data
def extract_data():
    """Load the Iris dataset as a DataFrame, print a short report, and return it.

    Two things are printed before returning: the first five rows of the
    frame, and the per-column count of null values.

    Returns:
        The ``frame`` attribute of ``load_iris(as_frame=True)``.
    """
    frame = load_iris(as_frame=True).frame

    # Preview of the raw rows.
    preview = frame.head()
    print("\n--- Raw Data Sample ---")
    print(preview)

    # Null count per column, taken before any preprocessing happens.
    null_counts = frame.isnull().sum()
    print("\n--- Missing Values Before Processing ---")
    print(null_counts)

    return frame

# Step 2: Transform - Preprocessing Function
def transform_data(df):
    """Impute, scale and encode ``df``; return the result as a new DataFrame.

    For demonstration purposes a missing value is injected at row 0,
    column 0 before preprocessing. The injection is done on a copy, so the
    DataFrame passed in by the caller is left unmodified.

    Columns are routed by dtype:

    * ``float64`` / ``int64`` columns: mean imputation, then standard scaling.
    * ``object`` columns: most-frequent imputation, then one-hot encoding.

    Columns of any other dtype are selected by neither list.
    NOTE(review): the dtype filter names ``int64`` explicitly, so whether an
    integer column is picked up depends on its exact integer width -- confirm
    this is intended.

    Args:
        df: Input DataFrame with at least one row and one column.

    Returns:
        A DataFrame built from the transformed array, with default integer
        column labels. Two reports are printed before returning: the
        per-column null count and the first five rows.
    """
    # Work on a copy: the NaN injected below must not leak into the caller's
    # frame (the original wrote straight into the argument).
    df = df.copy()

    # Introduce missing values for demonstration
    df.iloc[0, 0] = None  # Set a value in the first column to NaN

    # Identify numerical and categorical columns
    num_features = df.select_dtypes(include=['float64', 'int64']).columns
    cat_features = df.select_dtypes(include=['object']).columns

    # Pipelines for transformation
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="mean")),  # Fill missing values
        ('scaler', StandardScaler()),                 # Scale features
    ])

    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="most_frequent")),  # Fill missing categorical values
        ('encoder', OneHotEncoder(handle_unknown='ignore')),   # One-hot encode categorical data
    ])

    # Combine transformations. sparse_threshold=0 forces a dense ndarray even
    # when the one-hot block dominates the output; otherwise the result can be
    # a SciPy sparse matrix, which pd.DataFrame(...) below does not expand
    # into numeric columns.
    preprocessor = ColumnTransformer(
        [
            ('num', num_pipeline, num_features),
            ('cat', cat_pipeline, cat_features),
        ],
        sparse_threshold=0,
    )

    # Apply transformations
    transformed_data = preprocessor.fit_transform(df)

    # Convert back to DataFrame
    transformed_df = pd.DataFrame(transformed_data)

    print("\n--- Missing Values After Processing ---")
    print(transformed_df.isnull().sum())  # Check if missing values are handled

    print("\n--- Transformed Data Sample ---")
    print(transformed_df.head())  # Show first 5 rows of transformed data

    return transformed_df

# Step 3: Load - Save the cleaned data
def load_data(df, output_path):
    """Write ``df`` to ``output_path`` as CSV and print a confirmation line.

    The row index is not written to the file.

    Args:
        df: DataFrame to persist.
        output_path: Destination passed to ``DataFrame.to_csv``.
    """
    df.to_csv(path_or_buf=output_path, index=False)

    confirmation = f"\n Preprocessed Data Saved to {output_path}"
    print(confirmation)

# Run the ETL Pipeline
# Run the ETL pipeline only when executed as a script / notebook cell,
# not when this module is imported.
if __name__ == "__main__":
    # Destination CSV; a relative path, so it resolves against the current
    # working directory.
    output_file = "processed_iris_data.csv"

    # ETL Process: each stage's return value feeds the next one.
    raw_df = extract_data()               # Extract: load and report on raw data
    cleaned_df = transform_data(raw_df)   # Transform: impute / scale / encode
    load_data(cleaned_df, output_file)    # Load: write the result to CSV



--- Raw Data Sample ---
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  

--- Missing Values Before Processing ---
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

--- Missing Values After Processing ---
0    0
1    0
2    0
3    0
dtype: int64

--- Transformed Data Sample ---
          0         1         2         3
0  0.000000  1.019004 -1.340227 -1.315444
1 -1.152203 -0.131979 -1.340227 -1.315444
2 -1.395201  0.32