# Population-Income Prediction Model

## 1.Import Package

In [1]:
import pandas as pd # read .data / .test plain-text files and convert them into a table.
import numpy as np # using NumPy arrays
import time # time the training and prediction speeds
from sklearn.compose import ColumnTransformer # process the categorical and numeric columns separately, then combine them
from sklearn.preprocessing import OneHotEncoder  # the model can only compare number，this package converts text into one-hot encoded variables.
from sklearn.pipeline import Pipeline # build a single, fixed pipeline that bundles every preprocessing step together with the estimator
from sklearn.tree import DecisionTreeClassifier # provide a decision-tree classifier.
from sklearn.ensemble import RandomForestClassifier # provide a random-forest classifier.
from sklearn.model_selection import train_test_split # split the data into training and test sets.
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report) # import evaluation metrics.
from tqdm import tqdm # progress bar

## 2.Read Data

In [2]:
COL_NAMES = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
] # adult.data and adult.test have no header row
train_df = pd.read_csv("Census Income Data Set/adult.data",
                       names=COL_NAMES, skipinitialspace=True) # read the dataset file, assign column names, and automatically skip any whitespace.
test_df  = pd.read_csv("Census Income Data Set/adult.test",
                       names=COL_NAMES, skipinitialspace=True, skiprows=1) # skip the first line '|1x3 Cross validator' in thetest set.
df = pd.concat([train_df, test_df], ignore_index=True) # It's like combining two sheets from the same Excel file into one, 
                                                       # with the index reset to make data cleaning easier later on

## 3.Clean Data

In [3]:
df.replace(" ?", pd.NA, inplace=True)   # In the dataset, some missing values are denoted by “?”, replace them with the official missing value “pd.NA”.
df.dropna(inplace=True)                # drop rows with any missing values, without creating a new table.
df["income"] = df["income"].str.strip().apply(
    lambda x: 1 if x in {">50K", ">50K."} else 0) # Change ">50k" to 1 and "<=50k" to 0.

## 4.Split into independent and dependent variables

In [4]:
X = df.drop("income", axis=1) # drop the entire income column to obtain the training features.
y = df["income"] # obtain the prediction target.

## 5.Identify column types

In [5]:
# used to split the dataset into text columns and numeric columns
cat_cols = X.select_dtypes(include="object").columns # for subsequent one-hot encoding
num_cols = X.select_dtypes(exclude="object").columns
# outcome：
# cat_cols  # Index(['workclass', 'education'], dtype='object')
# num_cols  # Index(['age', 'fnlwgt', 'education-num'], dtype='object')

## 6.Transform the dataset into one-hot encoding

In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), cat_cols), # convert this column into a one-hot sparse matrix (0/1) 
                                            # and name the operation "cat" for easy debugging later;
                                            # ignore: when encountering unseen categories during inference, 
                                             # the encoder won't crash; instead, it sets the entire row to 0.
        ("num", "passthrough", num_cols) # passthrough means numeric columns remain unchanged
    ])

## 7.Split the dataset into training and test sets

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# split 80 % train and 20 % test, stratify=y to keep the high-income and low-income ratio identical in both parts, 
# and fix a random seed so the train and test assignment can be reproduced

## 8.Define two models

In [8]:
models = {
    "DecisionTree": DecisionTreeClassifier(max_depth=10, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
}

## 9.Train + Predict + Evaluate

In [9]:
results = {}
for name, clf in models.items():
    pipe = Pipeline(steps=[("prep", preprocessor), ("clf",   clf)])

    # calculate the training time
    t0 = time.time()
    # pipe.fit(X_train, y_train)
    with tqdm(total=1, desc=f"{name} training") as pbar:
        pipe.fit(X_train, y_train)
        pbar.update(1)
    train_time = time.time() - t0

    # calculate the predicting time
    t0 = time.time()
    # y_pred = pipe.predict(X_test)
    y_pred = []
    with tqdm(total=X_test.shape[0], desc=f"{name} predicting") as pbar:
        for idx in range(X_test.shape[0]):
            y_pred.append(pipe.predict(X_test.iloc[[idx]])[0])
            pbar.update(1)
    y_pred = np.array(y_pred)
    pred_time = time.time() - t0

    # make the training model output the class-probability for every test sample: 
    # a two-column array (col 0: P(<=50K), col 1: P(>50K)) and take only column 1
    y_prob = pipe.predict_proba(X_test)[:, 1]

    results[name] = {
        "accuracy":  accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall":    recall_score(y_test, y_pred),
        "f1":        f1_score(y_test, y_pred),
        "roc_auc":   roc_auc_score(y_test, y_prob),
        "train_time": train_time,
        "pred_time": pred_time
    }

    print(f"\n----------------------------------- {name} -----------------------------------")
    print(classification_report(y_test, y_pred))


DecisionTree training: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.33it/s]
DecisionTree predicting: 100%|████████████████████████████████████████████████████| 9769/9769 [00:42<00:00, 228.86it/s]



----------------------------------- DecisionTree -----------------------------------
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      7431
           1       0.78      0.58      0.67      2338

    accuracy                           0.86      9769
   macro avg       0.83      0.76      0.79      9769
weighted avg       0.85      0.86      0.85      9769



RandomForest training: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.35it/s]
RandomForest predicting: 100%|█████████████████████████████████████████████████████| 9769/9769 [05:04<00:00, 32.13it/s]



----------------------------------- RandomForest -----------------------------------
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      7431
           1       0.80      0.55      0.65      2338

    accuracy                           0.86      9769
   macro avg       0.83      0.75      0.78      9769
weighted avg       0.85      0.86      0.85      9769

