In [8]:
# Feature Scaling & Encoding - All-in-One Code

# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Step 2: Load Dataset
# 'your_dataset.csv' is a placeholder: substitute the name or path of your own
# CSV file, otherwise read_csv fails because no such file exists.
df = pd.read_csv(filepath_or_buffer='your_dataset.csv')
# Show the heading and the first rows on separate lines.
print("Original Data:", df.head(), sep="\n")

# Step 3: Identify Feature Types
# Text-like columns are treated as categorical. 'string' is listed next to
# 'object' so columns stored with pandas' dedicated string dtype are picked up
# as well (NOTE(review): confirm against the pandas version in use).
categorical_cols = df.select_dtypes(include=['object', 'string', 'category']).columns.tolist()
# 'number' matches every numeric dtype (int32, float32, unsigned, nullable, ...)
# instead of only int64/float64, so narrower numeric columns are not skipped.
# Columns matched by neither selector (e.g. bool, datetime) end up in no list.
numerical_cols = df.select_dtypes(include='number').columns.tolist()

# Step 4: Define Preprocessing Pipelines
# Numeric branch: fill missing values with the column mean, then apply StandardScaler.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown='ignore' keeps unseen categories from raising
# at transform time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its branch. The names 'num' and 'cat' (and the step
# name 'encoder') are looked up again later when recovering feature names.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols),
    ]
)

# Step 5: Apply Transformation
# Fit the preprocessor on the full DataFrame and transform it in one pass.
processed_array = preprocessor.fit_transform(df)

# Get Feature Names
# When there are no categorical columns the 'cat' pipeline is never fitted, and
# asking its encoder for feature names would raise. Only query it when it was
# actually used.
if categorical_cols:
    cat_feature_names = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(categorical_cols)
else:
    cat_feature_names = []
# Output column order follows the transformer order: numeric first, then encoded.
all_feature_names = numerical_cols + list(cat_feature_names)

# Convert to DataFrame
# The transformer may return a sparse matrix (one-hot output); densify it so it
# can be wrapped in a regular DataFrame.
if hasattr(processed_array, "toarray"):
    dense_values = processed_array.toarray()
else:
    dense_values = processed_array
processed_df = pd.DataFrame(dense_values, columns=all_feature_names)

print("\nProcessed Data:")
print(processed_df.head())

# Step 6: Save the Cleaned Data (Optional)
# index=False keeps the row index out of the written CSV.
processed_df.to_csv('processed_data.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'