In [10]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
  
# metadata 
print(adult.metadata) 
  
# variable information 
print(adult.variables) 


# Combine features and target into one DataFrame
df = X.copy()
df['income'] = y['income']

# Filter for each of given races
df_white = df[df['race'].astype(str).str.strip().eq('White')].reset_index(drop=True)
df_black = df[df['race'].astype(str).str.strip().eq('Black')].reset_index(drop=True)
df_asian = df[df['race'].astype(str).str.strip().eq('Asian-Pac-Islander')].reset_index(drop=True)
df_native = df[df['race'].astype(str).str.strip().eq('Amer-Indian-Eskimo')].reset_index(drop=True)
df_other = df[df['race'].astype(str).str.strip().eq('Other')].reset_index(drop=True)
# Quick check
print(df_white.shape)
print(df_white.head())

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [8]:
import numpy as np
import pandas as pd

# changes all of the income into 1 and 0 based on if over 50k or if less than
m_gt = df_white['income'].astype(str).str.contains(r'>\s*50\s*K?', case=False, na=False)
m_le = df_white['income'].astype(str).str.contains(r'<=\s*50\s*K?', case=False, na=False)
df_white['income_binary'] = np.select([m_gt, m_le], [1, 0], default=np.nan)  # or default=0 if you prefer
print(df_white['income_binary'].head())
print(df_white['income_binary'].value_counts(dropna=False))
df_white.head()
df_white.drop(columns=['income'], inplace=True)

0    0.0
1    0.0
2    0.0
3    0.0
4    1.0
Name: income_binary, dtype: float64
income_binary
0.0    31155
1.0    10607
Name: count, dtype: int64


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# --- start from your dataframe: df_white ---
# Ensure that is either 1 or 0 
if df_white['income_binary'].dtype == 'O':
    y_white = (df_white['income_binary'].astype(str)
               .str.strip()
               .replace({'>50K': 1, '<=50K': 0, '> 50K': 1, '<= 50K': 0})
               .astype(int))
else:
    y_white = df_white['income_binary'].astype(int)

# drops unwanted columns from X df
X_white = df_white.drop(columns=['income_binary', 'fnlwgt', 'race'], errors='ignore').copy()

# Identify numeric vs categorical
num_cols_white = X_white.select_dtypes(include=['number', 'bool']).columns.tolist()
cat_cols_white = [c for c in X_white.columns if c not in num_cols_white]

# Clean categorical text and mark nans 
for c in cat_cols_white:
    X_white[c] = X_white[c].astype(str).str.strip().replace({'?': np.nan})
    X_white[c] = X_white[c].replace(r'(?i)^nan$', np.nan, regex=True)


# split up and ensure proportion 
X_white_train, X_white_test, y_white_train, y_white_test = train_test_split(
    X_white, y_white, test_size=0.2, random_state=0, stratify=y_white
)

# Preprocessor: simple impute fills missing values with the most frequent category in that column + OHE
preprocess_white = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), num_cols_white),
        ('cat', Pipeline([
            ('imp', SimpleImputer(strategy='most_frequent')),
            ('ohe', OneHotEncoder(handle_unknown='ignore'))
        ]), cat_cols_white),
    ],
    remainder='drop'
)

# Fit on train; transform both splits (optional if you’ll use the pipeline below)
preprocess_white.fit(X_white_train)
X_white_train_enc = preprocess_white.transform(X_white_train)
X_white_test_enc  = preprocess_white.transform(X_white_test)
print("Encoded shapes:", X_white_train_enc.shape, X_white_test_enc.shape)
feature_names = preprocess_white.get_feature_names_out()
print("First 10 features:", feature_names[:50])





Encoded shapes: (33409, 97) (8353, 97)
First 10 features: ['num__age' 'num__education-num' 'num__capital-gain' 'num__capital-loss'
 'num__hours-per-week' 'cat__workclass_Federal-gov'
 'cat__workclass_Local-gov' 'cat__workclass_Never-worked'
 'cat__workclass_Private' 'cat__workclass_Self-emp-inc'
 'cat__workclass_Self-emp-not-inc' 'cat__workclass_State-gov'
 'cat__workclass_Without-pay' 'cat__education_10th' 'cat__education_11th'
 'cat__education_12th' 'cat__education_1st-4th' 'cat__education_5th-6th'
 'cat__education_7th-8th' 'cat__education_9th' 'cat__education_Assoc-acdm'
 'cat__education_Assoc-voc' 'cat__education_Bachelors'
 'cat__education_Doctorate' 'cat__education_HS-grad'
 'cat__education_Masters' 'cat__education_Preschool'
 'cat__education_Prof-school' 'cat__education_Some-college'
 'cat__marital-status_Divorced' 'cat__marital-status_Married-AF-spouse'
 'cat__marital-status_Married-civ-spouse'
 'cat__marital-status_Married-spouse-absent'
 'cat__marital-status_Never-married' 'c

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Pipeline: reuse your fitted preprocessor object 1000 decision trees 
rf_white = Pipeline([
    ('prep', preprocess_white),
    ('rf', RandomForestClassifier(
        n_estimators=1000,
        random_state=0,
        n_jobs=-1,
        max_depth=None,
        class_weight='balanced_subsample'  # makes balanced
    ))
])

# Train
rf_white.fit(X_white_train, y_white_train)

# Evaluate
y_white_pred = rf_white.predict(X_white_test)
print("Accuracy:", accuracy_score(y_white_test, y_white_pred))
print("\nClassification report:\n", classification_report(y_white_test, y_white_pred, digits=3))

# Top features
# importances = rf_white.named_steps['rf'].feature_importances_
# feat_names = rf_white.named_steps['prep'].get_feature_names_out()
# order = np.argsort(importances)[::-1][:15]
# print("\nTop 15 features:")
# for i in order:
#     print(f"{feat_names[i]:40s} {importances[i]:.4f}")


Accuracy: 0.8311983718424518

Classification report:
               precision    recall  f1-score   support

           0      0.882     0.893     0.888      6231
           1      0.674     0.650     0.662      2122

    accuracy                          0.831      8353
   macro avg      0.778     0.772     0.775      8353
weighted avg      0.829     0.831     0.830      8353



In [13]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

def evaluate_rf_depths(
    df,
    target_col='income_binary',
    drop_cols=('fnlwgt', 'race'),
    depths=range(1, 21),           # try depths 1..20 (change as you like)
    test_size=0.2,
    random_state=0,
    rf_kwargs=None                 # extra RF kwargs if you want to tweak
):
    # pair column to 1 or 0 based on income values 
    y_raw = df[target_col]
    if y_raw.dtype == 'O':
        s = (y_raw.astype(str).str.strip().str.upper()
                       .str.replace(r'\.$', '', regex=True)
                       .str.replace(r'\s+', '', regex=True))
        y = s.map({'>50K': 1, '<=50K': 0}).astype(int)
    else:
        y = y_raw.astype(int)

    # drop cols 
    X = df.drop(columns=[target_col, *drop_cols], errors='ignore').copy()

    # split numeric and categorical columns 
    num_cols = X.select_dtypes(include=['number', 'bool']).columns.tolist()
    cat_cols = [c for c in X.columns if c not in num_cols]

    #  cleaning for categoricals: trim; convert '?' and literal 'nan' to NaN
    for c in cat_cols:
        X[c] = X[c].astype(str).str.strip()
        X[c] = X[c].replace({'?': np.nan})
        X[c] = X[c].replace(r'(?i)^nan$', np.nan, regex=True)

    # Single split reused for every model
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Preprocessor: impute numerics + categoricals; OHE categoricals
    preproc = ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(strategy='median'), num_cols),
            ('cat', Pipeline([
                ('imp', SimpleImputer(strategy='most_frequent')),
                ('ohe', OneHotEncoder(handle_unknown='ignore'))
            ]), cat_cols),
        ],
        remainder='drop'
    )

    # Base RF settings 1000 decision trees
    base_rf = dict(
        n_estimators=1000,
        n_jobs=-1,
        random_state=random_state,
        class_weight='balanced_subsample'
    )
    if rf_kwargs:
        base_rf.update(rf_kwargs)

    #loop over depths 
    rows = []
    for d in depths:
        # builds a pipeline preprocessing RandomForest(max_depth=d).
        clf = Pipeline([
            ('prep', preproc),
            ('rf', RandomForestClassifier(max_depth=d, **base_rf))
        ])
        # fits the model on the training set 
        clf.fit(X_tr, y_tr)

        # model precision and accuracy scoring 
        y_pred = clf.predict(X_te)
        acc = accuracy_score(y_te, y_pred)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_te, y_pred, average=None, labels=[0,1], zero_division=0
        )
        # ROC-AUC (needs probabilities; if a class is missing in train it may error)
        try:
            y_proba = clf.predict_proba(X_te)[:, 1]
            auc = roc_auc_score(y_te, y_proba)
        except Exception:
            auc = np.nan

        rows.append({
            'max_depth': d,
            'accuracy': acc,
            'precision_0': prec[0], 'recall_0': rec[0], 'f1_0': f1[0],
            'precision_1': prec[1], 'recall_1': rec[1], 'f1_1': f1[1],
            'roc_auc': auc
        })

    results = pd.DataFrame(rows).sort_values('max_depth').reset_index(drop=True)
    return results


In [11]:

import numpy as np
import pandas as pd
df_white.head()

m_gt = df_white['income'].astype(str).str.contains(r'>\s*50\s*K?', case=False, na=False)
m_le = df_white['income'].astype(str).str.contains(r'<=\s*50\s*K?', case=False, na=False)
df_white['income_binary'] = np.select([m_gt, m_le], [1, 0], default=np.nan)  # or default=0 if you prefer
print(df_white['income_binary'].head())
print(df_white['income_binary'].value_counts(dropna=False))
df_white.head()
df_white.drop(columns=['income'], inplace=True)



0    0.0
1    0.0
2    0.0
3    0.0
4    1.0
Name: income_binary, dtype: float64
income_binary
0.0    31155
1.0    10607
Name: count, dtype: int64


In [14]:
evaluate_rf_depths(df_white)


Unnamed: 0,max_depth,accuracy,precision_0,recall_0,f1_0,precision_1,recall_1,f1_1,roc_auc
0,1,0.72046,0.920916,0.683999,0.784971,0.471409,0.827521,0.60065,0.861696
1,2,0.723093,0.927543,0.682074,0.786091,0.474675,0.843544,0.6075,0.868124
2,3,0.731713,0.932943,0.689937,0.793247,0.484112,0.854383,0.618033,0.876324
3,4,0.736741,0.942299,0.689295,0.796181,0.489855,0.87606,0.628359,0.882444
4,5,0.74141,0.945113,0.693629,0.800074,0.494974,0.881715,0.634022,0.888457
5,6,0.746917,0.947014,0.699888,0.80491,0.501067,0.885014,0.639864,0.893546
6,7,0.759607,0.949925,0.715455,0.816185,0.515574,0.889255,0.652715,0.898207
7,8,0.774452,0.951402,0.735195,0.829441,0.533635,0.889727,0.667138,0.901306
8,9,0.781994,0.950368,0.74675,0.836344,0.543535,0.885485,0.673597,0.904241
9,10,0.78834,0.94999,0.756058,0.842002,0.552151,0.883129,0.679478,0.906134


In [42]:

import numpy as np
import pandas as pd
df_black.head()

m_gt = df_black['income'].astype(str).str.contains(r'>\s*50\s*K?', case=False, na=False)
m_le = df_black['income'].astype(str).str.contains(r'<=\s*50\s*K?', case=False, na=False)
df_black['income_binary'] = np.select([m_gt, m_le], [1, 0], default=np.nan)  # or default=0 if you prefer
print(df_black['income_binary'].head())
print(df_black['income_binary'].value_counts(dropna=False))
df_black.head()
df_black.drop(columns=['income'], inplace=True)


0    0.0
1    0.0
2    0.0
3    1.0
4    0.0
Name: income_binary, dtype: float64
income_binary
0.0    4119
1.0     566
Name: count, dtype: int64


In [43]:
evaluate_rf_depths(df_black)


Unnamed: 0,max_depth,accuracy,precision_0,recall_0,f1_0,precision_1,recall_1,f1_1,roc_auc
0,1,0.81857,0.969828,0.819175,0.888158,0.381743,0.814159,0.519774,0.905646
1,2,0.822839,0.972701,0.821602,0.890789,0.390041,0.831858,0.531073,0.915118
2,3,0.826041,0.97554,0.822816,0.892693,0.396694,0.849558,0.540845,0.919404
3,4,0.828175,0.978355,0.822816,0.893869,0.401639,0.867257,0.54902,0.923485
4,5,0.830309,0.982583,0.821602,0.894911,0.407258,0.893805,0.559557,0.926937
5,6,0.833511,0.984058,0.824029,0.896962,0.412955,0.902655,0.566667,0.928806
6,7,0.83778,0.978632,0.833738,0.900393,0.417021,0.867257,0.563218,0.929735
7,8,0.846318,0.978873,0.843447,0.906128,0.431718,0.867257,0.576471,0.930702
8,9,0.851654,0.977685,0.850728,0.909799,0.440909,0.858407,0.582583,0.931733
9,10,0.863394,0.974114,0.867718,0.917843,0.463054,0.831858,0.594937,0.931647


In [35]:

import numpy as np
import pandas as pd
df_asian.head()

m_gt = df_asian['income'].astype(str).str.contains(r'>\s*50\s*K?', case=False, na=False)
m_le = df_asian['income'].astype(str).str.contains(r'<=\s*50\s*K?', case=False, na=False)
df_asian['income_binary'] = np.select([m_gt, m_le], [1, 0], default=np.nan)  # or default=0 if you prefer
print(df_asian['income_binary'].head())
print(df_asian['income_binary'].value_counts(dropna=False))
df_asian.head()
df_asian.drop(columns=['income'], inplace=True)

KeyError: 'income'

In [36]:
evaluate_rf_depths(df_asian)


Unnamed: 0,max_depth,accuracy,precision_0,recall_0,f1_0,precision_1,recall_1,f1_1,roc_auc
0,1,0.703947,0.902439,0.666667,0.766839,0.471429,0.804878,0.594595,0.808449
1,2,0.707237,0.907975,0.666667,0.768831,0.475177,0.817073,0.600897,0.812074
2,3,0.723684,0.920732,0.68018,0.782383,0.492857,0.841463,0.621622,0.817925
3,4,0.723684,0.920732,0.68018,0.782383,0.492857,0.841463,0.621622,0.823308
4,5,0.740132,0.918129,0.707207,0.798982,0.511278,0.829268,0.632558,0.827428
5,6,0.75,0.910112,0.72973,0.81,0.52381,0.804878,0.634615,0.832921
6,7,0.766447,0.908108,0.756757,0.825553,0.546218,0.792683,0.646766,0.835009
7,8,0.763158,0.903226,0.756757,0.823529,0.542373,0.780488,0.64,0.836327
8,9,0.766447,0.895288,0.77027,0.828087,0.548673,0.756098,0.635897,0.83814
9,10,0.773026,0.896373,0.779279,0.833735,0.558559,0.756098,0.642487,0.838909


In [None]:

import numpy as np
import pandas as pd
df_native.head()

m_gt = df_native['income'].astype(str).str.contains(r'>\s*50\s*K?', case=False, na=False)
m_le = df_native['income'].astype(str).str.contains(r'<=\s*50\s*K?', case=False, na=False)
df_native['income_binary'] = np.select([m_gt, m_le], [1, 0], default=np.nan)  # or default=0 if you prefer
print(df_native['income_binary'].head())
print(df_native['income_binary'].value_counts(dropna=False))
df_native.head()
df_native.drop(columns=['income'], inplace=True)

In [None]:
evaluate_rf_depths(df_native)
