In [4]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Column names for the Adult Income dataset, in file order. The CSV is read
# with header=None below, so these labels are supplied by hand.
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

# Load the dataset: header=None treats the first row as data, and names=
# assigns the labels above to the columns.
# NOTE(review): the raw UCI Adult file separates fields with ", "; if this CSV
# does too, every categorical value carries a leading space (e.g. " Private").
# Consider skipinitialspace=True after confirming nothing else relies on the
# padded strings.
data = pd.read_csv("/content/adult income.csv", header=None, names=column_names)

# Show the first 5 rows to verify the load. A notebook only renders a bare
# expression when it is the last statement of the cell, so a mid-cell
# `data.head()` is silently discarded; print it explicitly instead.
print(data.head())

# Split the column labels by dtype. This runs while 'income' is still an
# untransformed object column, so it is listed among the categorical features
# here.
initial_categorical_features, numerical_features = (
    data.select_dtypes(include=dtype_group).columns
    for dtype_group in (["object"], ["int64", "float64"])
)

print("Initial Categorical Features:", initial_categorical_features)
print("Numerical Features:", numerical_features)

# Encode the target column 'income' as integer class labels, in place.
# The fitted encoder is kept so the original labels can be recovered later
# (label_encoder.classes_ / label_encoder.inverse_transform).
label_encoder = LabelEncoder()
data["income"] = label_encoder.fit_transform(data["income"])

# Report the class balance of the encoded target. A bare `value_counts()`
# expression in the middle of a cell is never rendered, so print it.
print(data["income"].value_counts())

# Columns to one-hot encode: every column that was categorical at load time
# EXCEPT 'income', which has just been label encoded and must not be expanded
# into dummy columns.
onehot_features = initial_categorical_features.drop("income")

# Build the feature-preprocessing pipeline:
#   * "onehot": one-hot encodes the categorical feature columns, dropping the
#     first category of each to avoid redundant (collinear) dummy columns.
#   * "scaler": standardizes the numerical columns.
# Columns not listed in a transformer (here, the label-encoded 'income') are
# dropped by ColumnTransformer's default remainder="drop".
# NOTE(review): with drop="first" plus handle_unknown="ignore", an unseen
# category is encoded as all zeros, indistinguishable from the dropped first
# category. Harmless here because the encoder is fitted and applied on the same
# data, but keep it in mind if `preprocessor` is reused on new data.
preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore", drop="first"), onehot_features),
        ("scaler", StandardScaler(), numerical_features),
    ]
)
processed_data = preprocessor.fit_transform(data)

# ColumnTransformer may return a scipy sparse matrix when the stacked output is
# sparse enough; densify it so it can be wrapped in a DataFrame.
dense_features = (
    processed_data.toarray() if hasattr(processed_data, "toarray") else processed_data
)

# Label the output columns with the transformer-generated feature names
# (e.g. "onehot__sex_...", "scaler__age") instead of anonymous integers, so the
# saved CSV is self-describing.
processed_df = pd.DataFrame(
    dense_features,
    columns=preprocessor.get_feature_names_out(),
)

# Print the preview explicitly: a bare `processed_df.head()` that is not the
# last statement of the cell is never rendered.
print(processed_df.head())

# NOTE(review): the saved file contains features only; the encoded target
# data["income"] is not part of processed_df. Confirm that is intended.
processed_df.to_csv("adult_preprocessed.csv", index=False)

Initial Categorical Features: Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income'],
      dtype='object')
Numerical Features: Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')
