In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [None]:
# Read the raw dataset from its CSV file (path is relative to the notebook)
df = pd.read_csv(filepath_or_buffer="../Datum/data2.csv")

In [3]:
# Pull out the label column as y; every remaining column becomes a predictor in X
y = df["target"]
X = df.drop(columns="target")

In [4]:
# Preprocessing pipeline
# Numeric columns are picked by exact dtype: anything that is not int64/float64
# (e.g. int32 or float32) is not selected here and is therefore left out of the
# ColumnTransformer below, whose default remainder policy drops unlisted columns.
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns

# "str" is only accepted as a selector by newer pandas releases (dedicated
# string dtype). Older releases raise TypeError for it and keep text columns as
# object, so fall back to selecting object columns alone instead of crashing.
try:
    categorical_features = X.select_dtypes(include=["object", "str"]).columns
except TypeError:
    categorical_features = X.select_dtypes(include=["object"]).columns

# Standardise numeric columns (zero mean, unit variance learned at fit time).
numeric_transformer = StandardScaler()
# handle_unknown="ignore": categories not seen during fit are encoded as an
# all-zero row at transform time rather than raising an error.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Output columns are stacked in the order listed: numeric first, then one-hot.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Fit and transform
# The preprocessor is fitted on the training split only; the test split is then
# passed through the already-fitted transformer, so test rows never influence
# the learned scaling statistics or the set of one-hot categories.
# Despite the `_scaled` suffix, both results hold the full ColumnTransformer
# output: scaled numeric columns followed by the one-hot encoded columns.
X_trained_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [7]:
# Peek at the first five rows of the transformed training matrix
X_trained_scaled[:5]

array([[ 0.39764809,  0.23225826,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ],
       [-1.4339431 , -1.31613013,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        ],
       [ 1.16884438,  1.47096897,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [ 0.6868467 ,  0.54193594,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ],
       [ 0.10844948, -0.07741942,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ]])

In [8]:
# Rebuild column labels for the transformed matrix: the numeric column names
# come first, followed by the names generated by the fitted one-hot encoder,
# mirroring the ("num", "cat") order used in the ColumnTransformer.
fitted_encoder = preprocessor.named_transformers_["cat"]
ohe_features = fitted_encoder.get_feature_names_out(categorical_features)
all_features = np.concatenate((numeric_features, ohe_features))

In [9]:
# Convert to DataFrame
# ColumnTransformer hands back a SciPy sparse matrix when the stacked output is
# sparse enough (governed by its sparse_threshold parameter), which happens
# easily with high-cardinality one-hot columns. Passing a sparse matrix straight
# to pd.DataFrame together with column labels fails, so densify first if needed.
# Plain NumPy arrays have no `toarray` attribute and are used unchanged.
train_matrix = (
    X_trained_scaled.toarray()
    if hasattr(X_trained_scaled, "toarray")
    else X_trained_scaled
)
# NOTE: the frame gets a fresh 0..n-1 index, not X_train's original row labels.
X_train_df = pd.DataFrame(train_matrix, columns=all_features)

In [10]:
# Show first 5 rows
# Leave the frame as the cell's last expression so Jupyter renders it with its
# rich table display instead of the plain-text dump produced by print().
X_train_df.head()

        age    salary  department_Finance  department_HR  department_IT  \
0  0.397648  0.232258                 1.0            0.0            0.0   
1 -1.433943 -1.316130                 0.0            1.0            0.0   
2  1.168844  1.470969                 1.0            0.0            0.0   
3  0.686847  0.541936                 0.0            0.0            1.0   
4  0.108449 -0.077419                 0.0            0.0            1.0   

   gender_F  gender_M  
0       0.0       1.0  
1       0.0       1.0  
2       1.0       0.0  
3       0.0       1.0  
4       0.0       1.0  
