In [1]:
# Opening invocation printed at the start of the notebook.
print("Bismillahir Rahmanir Raheem")

Bismillahir Rahmanir Raheem


In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

Data Loaded Successfully!
Data Preprocessing Complete!
Processed data saved to processed_data.csv


In [None]:
# 1. Extract: Load dataset
def extract_data(file_path):
    """Read a CSV source into a DataFrame and report success.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Passed straight through to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV.
    """
    loaded_frame = pd.read_csv(file_path)
    print("Data Loaded Successfully!")
    return loaded_frame

In [None]:
# 2. Preprocessing and Transformation
def preprocess_data(df):
    """Impute, scale and one-hot encode ``df``; return the result as a DataFrame.

    * ``int64`` / ``float64`` columns are mean-imputed, then standardized
      with ``StandardScaler``.
    * ``object`` columns are imputed with the most frequent value, then
      one-hot encoded (categories unseen at fit time are ignored).
    * Columns of any other dtype are not listed in either transformer and
      are therefore not part of the output.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Dense transformed data. Columns are labelled with the names reported
        by the fitted ``ColumnTransformer`` (prefixed with ``num__`` /
        ``cat__``) and rows keep the index of ``df``.
    """
    # Identify numeric and categorical columns
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = df.select_dtypes(include=['object']).columns

    # Create transformation pipelines
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with mean
        ('scaler', StandardScaler())  # Normalize numeric data
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing categorical values
        ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding
    ])

    # Apply transformations.
    # sparse_threshold=0 forces a dense ndarray: the one-hot encoder emits a
    # sparse matrix by default, and a sparse result cannot be wrapped into a
    # regular DataFrame by the plain constructor below.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        sparse_threshold=0,
    )

    transformed_data = preprocessor.fit_transform(df)

    # Convert transformed data back to a DataFrame, keeping meaningful column
    # labels and the original row index instead of anonymous 0..n-1 positions.
    transformed_df = pd.DataFrame(
        transformed_data,
        columns=preprocessor.get_feature_names_out(),
        index=df.index,
    )

    print("Data Preprocessing Complete!")
    return transformed_df

In [None]:
# 3. Load: Save processed data
def load_data(df, output_path):
    """Write ``df`` as CSV to ``output_path`` (without the index) and report it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to persist.
    output_path : str, path-like or writable buffer
        Destination handed to ``DataFrame.to_csv``.

    Returns
    -------
    None
    """
    df.to_csv(path_or_buf=output_path, index=False)
    confirmation = f"Processed data saved to {output_path}"
    print(confirmation)

In [None]:
# Run the ETL Pipeline: extract -> preprocess -> load
if __name__ == "__main__":
    # Source and destination of this run
    input_file, output_file = (
        "/content/sample_data/california_housing_train.csv",  # Change this to your dataset path
        "processed_data.csv",
    )

    # Each stage feeds the next one
    raw_data = extract_data(input_file)
    processed_data = preprocess_data(raw_data)
    load_data(processed_data, output_file)

# To use this pipeline as a standalone script, see the next cell.

In [None]:
# Run this cell only if you have saved the pipeline as a Python script (etl_pipeline.py).
# !python etl_pipeline.py
