In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Step 1: Create a sample dataset
data = {
    'Age': [25, 30, 45, 35, np.nan, 40],
    'Salary': [50000, 60000, 80000, 70000, 65000, np.nan],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female', 'Male'],
    'Department': ['HR', 'Finance', 'IT', 'Marketing', 'Finance', 'IT']
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Step 2: Handle missing values
# Impute numerical features with mean
num_imputer = SimpleImputer(strategy='mean')
df[['Age', 'Salary']] = num_imputer.fit_transform(df[['Age', 'Salary']])

# Step 3: Encode categorical variables
# Label Encoding for 'Gender' (binary categorical variable)
le_gender = LabelEncoder()
df['Gender_LabelEncoded'] = le_gender.fit_transform(df['Gender'])

# One-Hot Encoding for 'Department' (multiclass categorical variable)
ohe = OneHotEncoder(sparse=False, drop='first')  # drop='first' to avoid dummy variable trap
department_encoded = ohe.fit_transform(df[['Department']])
department_encoded_df = pd.DataFrame(department_encoded, columns=ohe.get_feature_names_out(['Department']))
df = pd.concat([df, department_encoded_df], axis=1)

# Step 4: Feature Scaling
# Standardization (Z-score normalization) for 'Age' and 'Salary'
scaler = StandardScaler()
df[['Age_Standardized', 'Salary_Standardized']] = scaler.fit_transform(df[['Age', 'Salary']])

# Min-Max Scaling for 'Age' and 'Salary'
minmax_scaler = MinMaxScaler()
df[['Age_MinMaxScaled', 'Salary_MinMaxScaled']] = minmax_scaler.fit_transform(df[['Age', 'Salary']])

# Step 5: Display the processed DataFrame
print("\nProcessed DataFrame:")
print(df)

# Optional: Drop intermediate columns if needed
# df.drop(['Gender', 'Department'], axis=1, inplace=True)

Original DataFrame:
    Age   Salary  Gender Department
0  25.0  50000.0    Male         HR
1  30.0  60000.0  Female    Finance
2  45.0  80000.0  Female         IT
3  35.0  70000.0    Male  Marketing
4   NaN  65000.0  Female    Finance
5  40.0      NaN    Male         IT


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'