In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load sample dataset (or replace this with your own CSV later)
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Daisy', 'Ethan'],
    'Age': [25, 30, 28, None, 35],
    'Gender': ['Female', 'Male', None, 'Female', 'Male'],
    'Salary': [50000, 60000, 55000, 52000, None]
}

df = pd.DataFrame(data)
print("Original Data:")
display(df)

# Separate numerical and categorical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Define pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine pipelines
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Apply the pipeline
X_processed = preprocessor.fit_transform(df)

# Convert processed data to DataFrame
import numpy as np

# If output is sparse matrix, convert to array
if hasattr(X_processed, "toarray"):
    X_processed = X_processed.toarray()

processed_df = pd.DataFrame(X_processed)
print("Processed Data:")
display(processed_df)

# Save the processed data to a CSV
processed_df.to_csv("processed_data.csv", index=False)
print("Processed data saved as processed_data.csv")


Original Data:


Unnamed: 0,Name,Age,Gender,Salary
0,Alice,25.0,Female,50000.0
1,Bob,30.0,Male,60000.0
2,Charlie,28.0,,55000.0
3,Daisy,,Female,52000.0
4,Ethan,35.0,Male,


Processed Data:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.382164,-1.261511,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.153574,1.70675,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.460721,0.22262,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,-0.667859,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,1.689312,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


Processed data saved as processed_data.csv
