# Wide and Deep Neural Network for Adult Census Income Prediction
This notebook implements a Wide and Deep Neural Network using **Keras (with PyTorch backend)** to predict income classes from the Adult dataset.

In [None]:
import os
# Set backend to PyTorch before importing Keras
os.environ["KERAS_BACKEND"] = "torch"

import pandas as pd
import numpy as np
import keras
from keras import layers, Model, Input, ops
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Fix the random seeds (NumPy directly, then Keras' own seeding helper) so
# repeated Restart & Run All executions start from the same RNG state.
SEED = 42
np.random.seed(SEED)
keras.utils.set_random_seed(SEED)

# Report the library version and the active backend for provenance.
for label, value in (
    ("Keras version", keras.__version__),
    ("Backend", keras.config.backend()),
):
    print(f"{label}: {value}")


In [None]:
# Load the dataset
# The CSV has no header row, so the column names are supplied explicitly.
column_names = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"
]

# Read straight from the zipped CSV in datasets/; skipinitialspace strips the
# blank that follows each comma so values such as '?' compare cleanly later.
print("Loading data...")
try:
    df = pd.read_csv('datasets/adult_test.csv.zip', names=column_names, skipinitialspace=True)
except Exception as e:
    # There is no alternative data source to fall back to. Report the problem
    # and re-raise so the notebook stops here, instead of continuing and
    # failing on the next line with a misleading NameError for `df`.
    print(f"Error loading zip directly: {e}")
    raise

# Inspect the stored dataframe
print(f"Dataset Shape: {df.shape}")
df.head()


In [None]:
# Data Cleaning

# '?' marks a missing value. The file might also contain rows that are not
# actual data (like the 1x3 cross validator comment). Any row with a NaN in
# some column after the replacement is dropped.
df = df.replace('?', np.nan).dropna()

# The target 'income' column in the test set usually ends with a dot
# (e.g., '<=50K.', '>50K.'); normalise surrounding whitespace and that dot.
income_labels = df['income'].astype(str).str.strip().str.rstrip('.')

# Verify target values
print("Target value counts:")
print(income_labels.value_counts())

# Map to binary with an explicit table. Unlike a substring test, an unexpected
# label is reported instead of being silently counted as class 0. This also
# catches an accidental second run of this cell, where the column already holds
# 0/1 and a substring test would relabel every row as 0.
INCOME_TO_BINARY = {'<=50K': 0, '>50K': 1}
income_binary = income_labels.map(INCOME_TO_BINARY)
unexpected_labels = income_labels[income_binary.isna()].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f"Unexpected income labels: {list(unexpected_labels)}. "
        "If this cell was already executed, re-run the loading cell first."
    )
df = df.assign(income=income_binary.astype('int64'))

print(f"Cleaned Dataset Shape: {df.shape}")


In [None]:
# Feature Engineering

# Define feature groups
CATEGORICAL_COLS = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "gender", "native_country"]
CONTINUOUS_COLS = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]

# Target and split indices come first, so that the statistics used for
# preprocessing can be computed from the training rows only.
y = df['income'].values.astype('float32')

indices = np.arange(len(y))
train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)

# Prepare Input Data

# 1. Continuous Features: Standard Scaling
# Fit mean/std on the training rows only (positional .iloc, because the
# DataFrame index has gaps after dropping rows), then apply the same transform
# to every row. Fitting on all rows would leak test-set statistics into training.
scaler = StandardScaler()
scaler.fit(df[CONTINUOUS_COLS].iloc[train_idx])
X_continuous = scaler.transform(df[CONTINUOUS_COLS])

# 2. Categorical Features for Deep Part: Label Encoding
# The encoders see all rows so every category present in the data gets an
# index; this fixes the vocabulary only and computes no target-related statistic.
categorical_frame = df[CATEGORICAL_COLS].astype(str)
X_categorical_indices = []
vocab_sizes = {}

for col in CATEGORICAL_COLS:
    le = LabelEncoder()
    col_indices = le.fit_transform(categorical_frame[col])
    X_categorical_indices.append(col_indices)
    vocab_sizes[col] = len(le.classes_)

# Shape (n_rows, n_categorical_columns): one integer index column per feature.
X_categorical_indices = np.stack(X_categorical_indices, axis=1)

# 3. Categorical Features for Wide Part: One-Hot Encoding
df_onehot = pd.get_dummies(categorical_frame)
X_wide = df_onehot.values.astype('float32') # Ensure float32 for Keras

# Split Data (same row positions for every input array and the target)
X_wide_train, X_wide_test = X_wide[train_idx], X_wide[test_idx]
X_deep_cat_train, X_deep_cat_test = X_categorical_indices[train_idx], X_categorical_indices[test_idx]
X_deep_cont_train, X_deep_cont_test = X_continuous[train_idx], X_continuous[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print(f"Train samples: {len(y_train)}")
print(f"Test samples: {len(y_test)}")


In [None]:
# Build Wide and Deep Model

# Input Layers: one-hot block for the wide path, scaled numeric block for the
# deep path, and one single-index input per categorical column.
input_wide = Input(shape=(X_wide.shape[1],), name='wide_input')
input_deep_cont = Input(shape=(X_continuous.shape[1],), name='deep_continuous_input')
input_deep_cats = [Input(shape=(1,), name=f'deep_cat_{col}') for col in CATEGORICAL_COLS]

# Deep Part: Embeddings + Continuous
embeddings = []
# Ensure continuous input is float32. keras.ops.cast accepts symbolic tensors,
# so it can be used directly while wiring the functional graph.
deep_cont_cast = ops.cast(input_deep_cont, 'float32')

for col, cat_input in zip(CATEGORICAL_COLS, input_deep_cats):
    voc_size = vocab_sizes[col]
    # Embedding width: half the vocabulary size (rounded up), capped at 50.
    emb_dim = min(50, (voc_size + 1) // 2)
    # Embedding expects integer indices
    emb = layers.Embedding(input_dim=voc_size, output_dim=emb_dim)(cat_input)
    # (batch, 1, emb_dim) -> (batch, emb_dim) so it can be concatenated.
    emb = layers.Flatten()(emb)
    embeddings.append(emb)

deep_features = layers.concatenate(embeddings + [deep_cont_cast])
deep_hidden = layers.Dense(128, activation='relu')(deep_features)
deep_hidden = layers.Dropout(0.3)(deep_hidden)
deep_hidden = layers.Dense(64, activation='relu')(deep_hidden)
deep_hidden = layers.Dropout(0.3)(deep_hidden)

# Wide Part
# The one-hot features have no hidden layers of their own: they are joined with
# the deep representation and feed the output unit directly.
combined = layers.concatenate([input_wide, deep_hidden])

# Final Output
output = layers.Dense(1, activation='sigmoid')(combined)

# Create Model
# Input order: wide, continuous, then one input per categorical column.
model_inputs = [input_wide] + [input_deep_cont] + input_deep_cats
model = Model(inputs=model_inputs, outputs=output)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])
model.summary()


In [None]:
# Assemble the list of arrays passed to the model
def prepare_inputs(X_wide_d, X_deep_cont_d, X_deep_cat_d):
    """Build the model's input list from the three feature arrays.

    Args:
        X_wide_d: Array for the wide input; passed through unchanged.
        X_deep_cont_d: Array for the continuous deep input; passed through unchanged.
        X_deep_cat_d: 2-D array whose columns are split into separate inputs.

    Returns:
        A list ``[X_wide_d, X_deep_cont_d, col_0, col_1, ...]`` where each
        ``col_j`` is the 1-D slice ``X_deep_cat_d[:, j]``, in column order.
    """
    per_column = []
    for j in range(X_deep_cat_d.shape[1]):
        per_column.append(X_deep_cat_d[:, j])
    return [X_wide_d, X_deep_cont_d, *per_column]

# Build the per-split input lists in the order expected by prepare_inputs.
test_inputs = prepare_inputs(X_wide_test, X_deep_cont_test, X_deep_cat_test)
train_inputs = prepare_inputs(X_wide_train, X_deep_cont_train, X_deep_cat_train)

# Train
# Fit for 15 epochs in mini-batches of 64; the test split is passed as
# validation data, so the val_* history entries are computed on it.
history = model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(test_inputs, y_test),
    batch_size=64,
    epochs=15,
)


In [None]:
# Evaluate
# model.evaluate is called without return_dict, and the raw result is printed as-is.
metrics = model.evaluate(test_inputs, y_test)
print(f"Metrics: {metrics}")

# Plot History: loss on the left, accuracy on the right
train_log = history.history
fig_hist, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 4))

ax_loss.plot(train_log['loss'], label='Train Loss')
ax_loss.plot(train_log['val_loss'], label='Val Loss')
ax_loss.legend()
ax_loss.set_title('Loss')

ax_acc.plot(train_log['accuracy'], label='Train Acc')
ax_acc.plot(train_log['val_accuracy'], label='Val Acc')
ax_acc.legend()
ax_acc.set_title('Accuracy')
plt.show()

# Predictions for Confusion Matrix: threshold the sigmoid output at 0.5
y_pred_prob = model.predict(test_inputs)
y_pred = (y_pred_prob > 0.5).astype(int)

cm = confusion_matrix(y_test, y_pred)
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_ylabel('True Label')
ax_cm.set_xlabel('Predicted Label')
plt.show()

print(classification_report(y_test, y_pred))
