In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Field names in file order, taken from adult.names; 'income' is the label.
columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

# Parsing options shared by both files: no header row in the data, comma
# separated, and the blank that follows each comma is dropped.
_read_opts = dict(names=columns, sep=',', skipinitialspace=True)

# The first line of the test file is skipped
# (presumably a non-data banner line -- confirm against the raw file).
df_train = pd.read_csv('adult/adult.data', **_read_opts)
df_test = pd.read_csv('adult/adult.test', skiprows=1, **_read_opts)

# Test-set income labels carry a period; remove it so they match the
# training-set labels.
test_income = df_test['income']
df_test['income'] = test_income.str.replace('.', '', regex=False)

# Stack train and test into one frame with a fresh 0..n-1 index.
df = pd.concat([df_train, df_test], ignore_index=True)

# Normalise the text columns: trim surrounding whitespace and lowercase the
# values in a single column-wise pass. The vectorised `.str` accessors replace
# the element-wise `DataFrame.applymap`, which pandas has deprecated (it emits
# a FutureWarning), and they leave missing values untouched.
categorical_cols = df.select_dtypes(include='object').columns
df[categorical_cols] = df[categorical_cols].apply(
    lambda col: col.str.strip().str.lower()
)

# '?' marks an unknown value in this dataset: convert it to a real missing
# value, then discard every row that contains one.
df = df.replace('?', pd.NA).dropna()

# Encode the target as 0/1. The labels were lowercased earlier, hence the
# lowercase keys.
income_codes = {'<=50k': 0, '>50k': 1}
encoded_income = df['income'].map(income_codes)

# `Series.map` turns any label that is not a key of the dict into NaN without
# complaint; fail loudly here instead of writing NaN targets to disk.
unexpected_labels = df.loc[encoded_income.isna(), 'income'].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f"Unexpected income labels: {unexpected_labels.tolist()}"
    )
df['income'] = encoded_income

# Separate features and target
X = df.drop('income', axis=1)
y = df['income']

# Feature columns that hold text categories and get integer codes below.
categorical_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Feature columns that hold numbers and get rescaled below.
numeric_columns = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# One fitted LabelEncoder per categorical column, kept by column name so the
# integer codes can be decoded later with `inverse_transform`.
label_encoders = {
    col: LabelEncoder().fit(X[col]) for col in categorical_columns
}
for col, le in label_encoders.items():
    X[col] = le.transform(X[col])

# Min-max scale the numeric columns (fit first, then transform the same data).
scaler = MinMaxScaler().fit(X[numeric_columns])
X[numeric_columns] = scaler.transform(X[numeric_columns])

# Put the target back next to the transformed features, as the last column.
df_final = pd.concat(objs=[X, y], axis='columns')

# Write the preprocessed data to disk; the row index is not written.
output_path = 'Processed_Adult.csv'
df_final.to_csv(output_path, index=False)

# Bare trailing expression so the notebook displays the frame.
df_final

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.301370,5,0.043350,9,0.800000,4,0,1,4,1,0.021740,0.0,0.397959,38,0
1,0.452055,4,0.047274,9,0.800000,2,3,0,4,1,0.000000,0.0,0.122449,38,0
2,0.287671,2,0.136877,11,0.533333,0,5,1,4,1,0.000000,0.0,0.397959,38,0
3,0.493151,2,0.149792,1,0.400000,2,5,0,2,1,0.000000,0.0,0.397959,38,0
4,0.150685,2,0.219998,9,0.800000,2,9,5,2,0,0.000000,0.0,0.397959,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,0.219178,2,0.156895,9,0.800000,4,9,3,4,1,0.000000,0.0,0.397959,38,0
48837,0.301370,2,0.136723,9,0.800000,0,9,1,4,0,0.000000,0.0,0.357143,38,0
48839,0.287671,2,0.244762,9,0.800000,2,9,0,4,1,0.000000,0.0,0.500000,38,0
48840,0.369863,2,0.047666,9,0.800000,0,0,3,1,1,0.054551,0.0,0.397959,38,0
