# Adult Income Analysis & Prediction

This notebook provides a comprehensive analysis of the Adult Income dataset. It mirrors the functionality of the Streamlit app (`app.py`) and includes:
1.  **Data Cleaning**: Handling missing values and duplicates.
2.  **Exploratory Data Analysis (EDA)**: Interactive visualizations using **Plotly**.
3.  **Supervised Machine Learning**: Predicting income class (>50K vs <=50K) using Random Forest and Logistic Regression.
4.  **Unsupervised Machine Learning**: K-Means Clustering and PCA dimensionality reduction.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# warnings
# NOTE: the filter below is installed without a category or module restriction,
# so from this cell onward every warning is suppressed for the whole kernel
# session. That includes deprecation notices and model-fitting diagnostics
# (such as convergence warnings), which will therefore never be shown.
import warnings
warnings.filterwarnings('ignore')

## 1. Data Loading and Cleaning

In [None]:
# Column names supplied to read_csv via `names=` (the file is read without a
# header row), listed in the order the fields appear.
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
           'hours-per-week', 'native-country', 'income']

# Load Data
# '?' is parsed as NaN, and spaces following the delimiter are skipped so the
# category labels (and the '?' marker) are read without leading whitespace.
data_path = 'adultData/adult.data'
try:
    df = pd.read_csv(data_path, names=columns, na_values='?', skipinitialspace=True)
except FileNotFoundError as exc:
    # Stop here rather than printing and continuing: every statement below needs
    # `df`, so carrying on would either raise a confusing NameError or silently
    # reuse a stale `df` left over from an earlier run of the kernel.
    raise FileNotFoundError(
        f"Error: File not found at '{data_path}'. Please check the path."
    ) from exc
print("Dataset Loaded Successfully. Shape:", df.shape)

# Impute Missing Values (Mode)
# Each of these categorical columns has its NaNs replaced by its most frequent value.
for col in ['workclass', 'occupation', 'native-country']:
    df[col] = df[col].fillna(df[col].mode()[0])

# Remove Duplicates
# Rebind instead of mutating in place; the surviving rows keep their original index labels.
df = df.drop_duplicates()
print("Shape after cleaning:", df.shape)

# Basic Stats (transposed so each numeric column is a row)
df.describe().T

## 2. Exploratory Data Analysis (Plotly)

In [None]:
# --- 1. Income class balance (donut chart) ---
counts = df['income'].value_counts()
fig_pie = px.pie(
    names=counts.index,
    values=counts.values,
    hole=0.4,
    title="Income Class Distribution",
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig_pie.show()

# --- 2. Age spread within each income class (box plot) ---
fig_box = px.box(
    df,
    x='income',
    y='age',
    color='income',
    title="Age Distribution by Income",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig_box.show()

# --- 3. Weekly working hours, bars grouped side by side per income class ---
fig_hist = px.histogram(
    df,
    x='hours-per-week',
    color='income',
    barmode='group',
    title="Hours Worked per Week by Income",
    color_discrete_sequence=px.colors.qualitative.Safe,
)
fig_hist.show()

# --- 4. Pairwise correlation of the numeric columns (heatmap) ---
corr = df.select_dtypes(include='number').corr()
fig_corr = px.imshow(
    corr,
    text_auto=True,
    aspect="auto",
    color_continuous_scale='RdBu_r',
    title="Correlation Matrix (Numerical Features)",
)
fig_corr.show()

## 3. Supervised Machine Learning

In [None]:
# Preprocessing
# Encode the target as integers, and swap the raw 'income' column for the
# encoded one (appended as the last column of the modelling frame).
le = LabelEncoder()
df_model = (
    df
    .drop(columns='income')
    .assign(income_encoded=le.fit_transform(df['income']))
)

# One-Hot Encoding (first level of each categorical column is dropped)
df_encoded = pd.get_dummies(df_model, drop_first=True)

# Separate target from features
y = df_encoded['income_encoded']
X = df_encoded.drop(columns='income_encoded')

# Splitting: 80% train / 20% test, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training Set: {X_train.shape}")
print(f"Test Set: {X_test.shape}")

In [None]:
# Model 1: Random Forest
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Hold-out metrics
acc_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {acc_rf:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

# Confusion Matrix Plot (rows = actual class, columns = predicted class)
income_labels = ['<=50K', '>50K']
cm_rf = confusion_matrix(y_test, y_pred_rf)
fig_cm = px.imshow(
    cm_rf,
    text_auto=True,
    color_continuous_scale='Blues',
    x=income_labels,
    y=income_labels,
    labels=dict(x="Predicted", y="Actual", color="Count"),
    title="Confusion Matrix - Random Forest",
)
fig_cm.show()

In [None]:
# Model 2: Logistic Regression
# The feature matrix mixes raw numeric columns with 0/1 dummy columns, so the
# features are standardized before fitting. Unscaled inputs make the solver
# slow or unable to converge, and the notebook's global warnings filter would
# hide any convergence warning. The scaler is fitted on the training split only
# so that no statistics from the test rows leak into the model.
lr_scaler = StandardScaler()
X_train_lr = lr_scaler.fit_transform(X_train)
X_test_lr = lr_scaler.transform(X_test)

# NOTE: lr_model is trained on standardized features; pass any new data through
# lr_scaler.transform(...) before calling lr_model.predict(...).
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_lr, y_train)
y_pred_lr = lr_model.predict(X_test_lr)

acc_lr = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {acc_lr:.4f}")
# Same per-class report as the Random Forest cell, so the two models can be compared.
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))

## 4. Unsupervised Machine Learning
Here we explore patterns in the data without labels using K-Means and visualize them using PCA.

In [None]:
# Scaling the Data
# Standardize every feature column of X before the distance-based clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 1. K-Means Clustering - Elbow Method
# Fit one model per candidate K and record its within-cluster sum of squares.
k_range = range(1, 11)
wcss = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    wcss.append(kmeans.inertia_)

fig_elbow = px.line(
    x=k_range,
    y=wcss,
    markers=True,
    labels={'x':'Number of Clusters (K)', 'y':'WCSS'},
    title="Elbow Method for Optimal K",
)
fig_elbow.show()

In [None]:
# 2. PCA Visualization
# Using K=3 (example optimal from elbow, or arbitrary choice for demo)
optimal_k = 3
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = kmeans_final.fit_predict(X_scaled)

# PCA reduction to 2D
# Seeded like every other estimator in this notebook: depending on the data
# shape and scikit-learn version, the default svd_solver='auto' can select the
# randomized solver, which would otherwise make the projection vary between runs.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Create DataFrame for Plotting
pca_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
# Cast cluster ids to strings so Plotly colors them as discrete categories
# rather than on a continuous scale.
pca_df['Cluster'] = clusters.astype(str)

fig_pca = px.scatter(pca_df, x='PC1', y='PC2', color='Cluster',
                     title=f"PCA Cluster Visualization (K={optimal_k})",
                     opacity=0.7,
                     color_discrete_sequence=px.colors.qualitative.Prism)
fig_pca.show()

print(f"Explained Variance Ratio by 2 Components: {pca.explained_variance_ratio_}")
# The combined share shows how much of the scaled data's variance the 2-D plot retains.
print(f"Total Variance Explained: {pca.explained_variance_ratio_.sum():.2%}")