# Data Exploration: UCI Adult Income Dataset

## Overview

The Adult Income dataset is used to predict whether a person's income exceeds $50K/year based on census data. This is a binary classification problem with mixed (numeric and categorical) data types, which makes it well suited for demonstrating feature engineering techniques.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG and pick the plotting theme for the notebook.
np.random.seed(42)
plt.style.use('seaborn-v0_8-darkgrid')

# Load dataset
# `names=` supplies the column labels, so read_csv treats the file as
# header-less and the first line is read as data.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
           'marital-status', 'occupation', 'relationship', 'race', 'sex',
           'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

# Missing entries are written as "?" in the raw file. Because
# skipinitialspace=True strips the blank that follows each comma *before*
# values are compared against na_values, the marker has to be given without
# the leading space: ' ?' never matches and the placeholders would be loaded
# as ordinary strings (so isnull()/dropna() would see no missing data).
df = pd.read_csv(url, names=columns, na_values='?', skipinitialspace=True)
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())


In [None]:
# Dataset info
# Print the schema/dtype report, numeric summary statistics, and the list of
# object-typed (categorical) columns of `df`.
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nBasic Statistics:")
print(df.describe())
print("\nCategorical columns:")
print(df.select_dtypes(include=['object']).columns.tolist())


In [None]:
# Target distribution
# Tabulate the raw income labels, both as absolute counts and as percentages.
income_labels = df['income']
class_counts = income_labels.value_counts()
class_percent = income_labels.value_counts(normalize=True) * 100

print("Target distribution:")
print(class_counts)
print("\nTarget distribution (%):")
print(class_percent)

# Visualize
# One bar per income label, drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 5))
class_counts.plot(kind='bar', color=['steelblue', 'coral'], ax=ax)
ax.set_xlabel('Income')
ax.set_ylabel('Count')
ax.set_title('Target Distribution')
# Keep the category labels horizontal (bar plots rotate them by default).
ax.tick_params(axis='x', labelrotation=0)
fig.tight_layout()
plt.show()


In [None]:
# Prepare target variable
# This cell: (1) derives a 0/1 target from the raw income label, (2) drops rows
# with missing values, (3) makes a stratified 80/20 train/test split, and
# (4) pickles the four resulting objects to '../adult_data.pkl'.
import pickle

# Check unique income values first
print("Unique income values:", df['income'].unique())
print("Income value counts:\n", df['income'].value_counts())

# Create binary target: 1 if income > 50K, 0 otherwise
# Handle both '>50K' and ' >50K' formats (with or without leading space)
df['income_binary'] = df['income'].str.strip().str.contains('>50K', regex=False).astype(int)

# Verify target distribution
print("\nTarget variable distribution:")
print(df['income_binary'].value_counts())
print("\nTarget distribution (%):")
print(df['income_binary'].value_counts(normalize=True) * 100)

# Handle missing values (simple strategy: drop for now, can be improved)
# The raw file marks unknown entries with "?". If the loader left those as
# literal strings, dropna() alone would silently keep every such row, so any
# cell consisting only of "?" (optionally padded with whitespace) is turned
# into NaN first. This is a no-op when the markers were already parsed as NaN.
df_clean = df.replace(r'^\s*\?\s*$', np.nan, regex=True).dropna()
print(f"\nRows dropped for missing values: {len(df) - len(df_clean)}")

# Split data
# Both the raw label and the derived target are removed from the feature frame.
X = df_clean.drop(['income', 'income_binary'], axis=1)
y = df_clean['income_binary']

# Verify y has both classes before splitting
print(f"\nBefore split - y unique values: {y.unique()}")
print(f"Before split - y value counts:\n{y.value_counts()}")

# stratify=y keeps the class proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Verify after split
print(f"\nAfter split - y_train unique values: {y_train.unique()}")
print(f"After split - y_train value counts:\n{y_train.value_counts()}")
print(f"\nAfter split - y_test unique values: {y_test.unique()}")
print(f"After split - y_test value counts:\n{y_test.value_counts()}")

# Save preprocessed data
data_dict = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test
}

with open('../adult_data.pkl', 'wb') as f:
    pickle.dump(data_dict, f)

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print("Data saved to '../adult_data.pkl'")


In [None]:
# Check for missing values
# Count NaN entries column by column once, then reuse that Series for the
# grand total.
null_counts = df.isna().sum()
print("Missing values per column:")
print(null_counts)
print(f"\nTotal missing values: {null_counts.sum()}")
