In [19]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Demonstration: standardize numerical columns and one-hot encode a
# categorical column with a scikit-learn ColumnTransformer wrapped in a
# Pipeline, then rebuild a labelled DataFrame from the transformed array.

# Create a sample DataFrame with numerical and categorical data
data = {
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000],
    'Department': ['HR', 'Finance', 'IT', 'Marketing', 'Sales'],
}

df = pd.DataFrame(data)
print("Original DataFrame:\n")
print(df)

# Define numerical and categorical columns
numerical_cols = ['Age', 'Salary']
categorical_cols = ['Department']

# Create transformers for numerical and categorical data
numerical_transformer = StandardScaler()
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Try the current name first and fall back to the legacy one
# so the script runs on both old and new installations.
try:
    categorical_transformer = OneHotEncoder(sparse_output=False)
except TypeError:
    categorical_transformer = OneHotEncoder(sparse=False)

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ])

# Create a preprocessing pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
processed_data = pipeline.fit_transform(df)

# Get feature names after transformation.
# Look the fitted encoder up by its name ('cat') rather than by position in
# `transformers_`, so reordering the transformer list cannot silently pick
# the wrong step. ColumnTransformer emits columns in transformer order, so
# the numerical names come first, followed by the one-hot names.
fitted_encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat']
cat_feature_names = fitted_encoder.get_feature_names_out(categorical_cols)
feature_names = numerical_cols + list(cat_feature_names)

# Create a DataFrame with the processed data, keeping the input's row index
# so processed rows stay aligned with the original ones.
processed_df = pd.DataFrame(processed_data, columns=feature_names, index=df.index)
print("\nProcessed DataFrame after Standardization and Encoding:\n")
print(processed_df)

# Validate data types
print("\nData Types of Processed DataFrame:\n")
print(processed_df.dtypes)

# Validate value ranges for standardized numerical columns.
# NOTE: StandardScaler divides by the population standard deviation (ddof=0),
# whereas DataFrame.describe() reports the sample standard deviation (ddof=1).
# The mean should be ~0, but the printed std is sqrt(n / (n - 1)) rather than
# exactly 1 (about 1.118 for these 5 rows).
print("\nSummary Statistics for Standardized Numerical Columns:\n")
print(processed_df[numerical_cols].describe())

ModuleNotFoundError: No module named 'numpy.char'