# Adult Income Prediction - Data Exploration

This notebook explores the Adult (Census Income) dataset, which is used to predict whether a person's annual income is >50K or <=50K.

In [None]:
# Import libraries
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '..'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_loader import load_raw_data
from src.config import config

# Set style
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'.
# Pick whichever name this install provides so the cell does not raise on an
# older matplotlib; if neither exists, matplotlib's default style is kept.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette('husl')

In [None]:
# Load the raw data and report its dimensions and column names.
df = load_raw_data()
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)
# Last expression of the cell: rendered as a preview of the first rows.
df.head()

In [None]:
# Basic statistics
print("Basic info:")
df.info()

print("\nNumerical features statistics:")
# A bare expression is only rendered when it is the last statement of a cell,
# so the summary table has to be printed explicitly here or it is discarded.
print(df.describe())

print("\nCategorical features value counts:")
for col in config.CATEGORICAL_FEATURES:
    # Show only the most frequent values of each categorical column.
    print(f"\n{col}:")
    print(df[col].value_counts().head())

In [None]:
# Count null entries per column and show only the columns that have any.
# NOTE(review): this only detects real nulls; placeholder strings would not be
# counted unless load_raw_data() already converts them — confirm.
print("Missing values per column:")
missing = df.isna().sum()
missing.loc[missing.gt(0)]

In [None]:
# Target distribution: bar chart of class counts, then class proportions.
fig, ax = plt.subplots(figsize=(8, 6))
df['income'].value_counts().plot(kind='bar', ax=ax, rot=0)
ax.set(title='Income Distribution', xlabel='Income Category', ylabel='Count')
plt.show()

print("Income distribution:")
print(df['income'].value_counts(normalize=True))

In [None]:
# Numerical features distributions
# Size the grid from the feature list so that no feature is silently dropped
# when there are more than six, and no blank panel is left when there are fewer.
numerical_cols = list(config.NUMERICAL_FEATURES)
n_cols = 3
n_rows = max(1, -(-len(numerical_cols) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows))
axes = axes.ravel()

# One histogram with a KDE overlay per numerical feature.
for ax, col in zip(axes, numerical_cols):
    sns.histplot(data=df, x=col, ax=ax, kde=True)
    ax.set_title(f'Distribution of {col}')

# Hide any trailing panels that have no feature to show.
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation of the numerical features, drawn as an annotated heatmap.
corr_matrix = df[config.NUMERICAL_FEATURES].corr()

plt.figure(figsize=(10, 8))
# center=0 anchors the diverging colormap at zero correlation.
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
heatmap_ax.set_title('Correlation Matrix - Numerical Features')
plt.show()