In [None]:
# Step 1: Import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Preprocessing script: load a CSV, impute + scale the numeric columns,
# impute + one-hot encode the categorical columns, and write the resulting
# feature table to "processed_data.csv".

# Step 2: Load dataset
df = pd.read_csv("your_dataset.csv")  # Replace with your actual dataset path
print("Original Data:")
print(df.head())

# Step 3: Identify numerical and categorical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Step 4: Define preprocessing pipelines
# Numeric columns: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode, dropping the first level of each feature.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# Step 5: Combine with ColumnTransformer
# sparse_threshold=0 forces a dense ndarray result. With the default (0.3) the
# return type flips between a SciPy sparse matrix and a dense array depending
# on the data's density, so an unconditional `.toarray()` downstream can raise
# AttributeError.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_cols),
        ('cat', cat_pipeline, categorical_cols)
    ],
    sparse_threshold=0,
)

# Step 6: Apply transformations
X_preprocessed = preprocessor.fit_transform(df)

# Step 7: Convert to DataFrame
# The 'cat' branch is not fitted when there are no categorical columns, so
# only ask its encoder for the generated column names when it actually ran.
if categorical_cols:
    fitted_encoder = preprocessor.named_transformers_['cat']['encoder']
    encoded_cat_cols = fitted_encoder.get_feature_names_out(categorical_cols)
else:
    encoded_cat_cols = []

# Column order mirrors the ColumnTransformer: numeric block first, then the
# one-hot encoded block.
# NOTE(review): a column that is entirely missing may be dropped by
# SimpleImputer, which would make this name list longer than the data —
# confirm the dataset has no all-NaN columns.
final_columns = numerical_cols + list(encoded_cat_cols)
processed_df = pd.DataFrame(X_preprocessed, columns=final_columns)

# Step 8: View and save final processed data
print("Processed Data:")
print(processed_df.head())

processed_df.to_csv("processed_data.csv", index=False)