# Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Import Dataset

In [15]:
# Load the raw Adult CSV; the path is relative to the current working directory.
raw_csv_path = "../data/raw/adult.csv"
df = pd.read_csv(raw_csv_path)

# Separate Features and Target

In [17]:
# Encode the income label as an integer target.
# Note the polarity: '<=50K' is encoded as 1 and '>50K' as 0.
class_codes = {
    '<=50K': 1,
    '>50K': 0
}
encoded_class = df['class'].map(class_codes)

# Series.map turns every value that is not a key of the dict into NaN
# (e.g. a label with stray whitespace, or this cell being re-run after the
# column was already encoded). Fail loudly here instead of letting NaN
# targets flow into the stratified split.
unmapped = encoded_class.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected values in 'class' column (expected '<=50K' or '>50K'; "
        "reload the dataset if this cell was already run): "
        f"{df.loc[unmapped, 'class'].unique().tolist()!r}"
    )
df['class'] = encoded_class

# Target vector and feature matrix (everything except the target column).
y = df['class']
X = df.drop('class', axis=1)

# Train-Test Split (Stratified)

In [20]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# target and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Define Column Groups

In [21]:
# Columns stored with object dtype are treated as the categorical features.
object_features = X_train.select_dtypes(include="object")
categorical_cols = object_features.columns.tolist()

In [22]:
# Select every numeric feature column. Matching on "number" instead of the
# exact "int64" dtype keeps float (or narrower integer) columns from being
# left out of numerical_cols, and therefore out of the "num" transformer
# group built from this list.
numerical_cols = X_train.select_dtypes(include="number").columns.tolist()

In [23]:
# "education" is handled by its own ordinal pipeline, so keep it out of the
# one-hot group. The membership check makes this cell safe to re-run: a bare
# list.remove() raises ValueError once the entry is already gone.
if "education" in categorical_cols:
    categorical_cols.remove("education")

# Numerical Pipeline

In [24]:
# Numeric features go through a single StandardScaler step (no imputation).
standard_scaler = StandardScaler()
num_pipeline = Pipeline(steps=[("Scaler", standard_scaler)])

# Categorical Pipeline

In [25]:
# Categorical features: fill missing entries with the most frequent value of
# the column, then one-hot encode.
# NOTE(review): drop="first" is combined with handle_unknown="ignore". Per the
# scikit-learn OneHotEncoder docs, a category unseen during fit is then
# encoded as all zeros, which is the same encoding as the dropped first
# category — confirm this is acceptable for the downstream model.
mode_imputer = SimpleImputer(strategy="most_frequent")
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", drop="first")
category_pipeline = Pipeline(steps=[
    ("imputer", mode_imputer),
    ("onehot", one_hot_encoder),
])

# Education Pipeline

In [26]:
# Explicit ranking of the education levels; the position in this list is the
# order handed to the ordinal encoder for the "education" column.
education_order = [
    # Below high school
    "Preschool", "1st-4th", "5th-6th", "7th-8th",
    # High school
    "9th", "10th", "11th", "12th", "HS-grad",
    # College / associate
    "Some-college", "Assoc-voc", "Assoc-acdm",
    # Degrees
    "Bachelors", "Masters", "Prof-school", "Doctorate",
]
# Encode "education" as one ordinal feature, using education_order as the
# explicit category ordering instead of letting the encoder infer one.
education_encoder = OrdinalEncoder(categories=[education_order])
edu_pipeline = Pipeline(steps=[("ordinalenc", education_encoder)])

# Combine with ColumnTransformer

In [27]:
# Route each column group through its own pipeline:
#   "num" -> scaling, "cat" -> impute + one-hot, "edu" -> ordinal encoding.
column_routes = [
    ("num", num_pipeline, numerical_cols),
    ("cat", category_pipeline, categorical_cols),
    ("edu", edu_pipeline, ["education"]),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Test Preprocessor

In [28]:
# Fit the preprocessor on the training split only, then apply the already
# fitted preprocessor to the test split (the test data is never used for
# fitting). The order of these two statements matters.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [29]:
# Display the shape of the processed training data as a sanity check.
X_train_processed.shape

(39073, 83)

In [30]:
# Display the shape of the processed test data; the second dimension should
# match the training output.
X_test_processed.shape

(9769, 83)