In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
  
# metadata 
print(adult.metadata) 
  
# variable information 
print(adult.variables) 


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [2]:
# Combine features and target into one DataFrame
df = X.copy()
df['income'] = y['income']

# Filter to White individuals and reset the index
df_white = df[df['race'].astype(str).str.strip().eq('White')].reset_index(drop=True)

# Quick check
print(df_white.shape)
print(df_white.head())

(41762, 15)
   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   37           Private  284582    Masters             14   
4   52  Self-emp-not-inc  209642    HS-grad              9   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse    Exec-managerial           Wife  White  Female   
4  Married-civ-spouse    Exec-managerial        Husband  White    Male   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0 

In [17]:
import numpy as np
import pandas as pd
df_white.head()
# m_gt = df_white['income'].astype(str).str.contains(r'>\s*50\s*K?', case=False, na=False)
# m_le = df_white['income'].astype(str).str.contains(r'<=\s*50\s*K?', case=False, na=False)

df_white['income_binary'] = np.select([m_gt, m_le], [1, 0], default=np.nan)  # or default=0 if you prefer
print(df_white['income_binary'].head())
print(df_white['income_binary'].value_counts(dropna=False))
df_white.head()



0    0.0
1    0.0
2    0.0
3    0.0
4    1.0
Name: income_binary, dtype: float64
income_binary
0.0    31155
1.0    10607
Name: count, dtype: int64


Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,income_binary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,0.0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,0.0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,0.0
3,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,Female,0,0,40,United-States,0.0
4,52,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,45,United-States,1.0


In [21]:
X_white = df_white.iloc[:, :12]
X_white.head()
y_white = df_white['income_binary']
y_white.head()


0    0.0
1    0.0
2    0.0
3    0.0
4    1.0
Name: income_binary, dtype: float64

In [22]:
from sklearn.model_selection import train_test_split

X_white_train, X_white_test, y_white_train, y_white_test = train_test_split(X_white, y_white, test_size=0.2, random_state=42)


In [23]:
from sklearn.ensemble import RandomForestClassifier

rf_white = RandomForestClassifier()

rf_white.fit(X_white_train, y_white_train)

ValueError: could not convert string to float: 'Private'

In [None]:
y_white_pred = rf_white.predict(X_white_test)

In [None]:
rf_white.score(X_white_test, y_white_test)

In [4]:
# 1) Prepare data
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# df_white already exists with all features + 'income'
df_model = df_white.copy()
df_model['income_bin'] = (df_model['income'].astype(str).str.strip().eq('>50K')).astype(int)
X = df_model.drop(columns=['income', 'income_bin'])
y = df_model['income_bin']

# Identify column types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(exclude=['object', 'category']).columns.tolist()

# 2) Preprocess + model pipeline
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_cols),
        ('cat', categorical_pipeline, categorical_cols),
    ],
    remainder='drop'
)

model = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=42
)

clf = Pipeline(steps=[('prep', preprocess), ('rf', model)])

# 3) Train/test split and fit
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

clf.fit(X_train, y_train)

# 4) Evaluate
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acc:.3f}')
print(classification_report(y_test, y_pred, digits=3))

Accuracy: 0.838
              precision    recall  f1-score   support

           0      0.878     0.934     0.905      6929
           1      0.536     0.369     0.437      1424

    accuracy                          0.838      8353
   macro avg      0.707     0.652     0.671      8353
weighted avg      0.820     0.838     0.826      8353



In [6]:
# Function to test tree depth vs accuracy for each race
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def test_tree_depth_for_race(df, race_name, max_depth_range=range(1, 21)):
    """
    Test different tree depths to find minimum depth for 90% accuracy
    """
    # Filter to specific race
    df_race = df[df['race'].astype(str).str.strip().eq(race_name)].reset_index(drop=True)
    
    if len(df_race) < 100:  # Skip if too few samples
        print(f"Not enough samples for {race_name}: {len(df_race)}")
        return None
    
    print(f"\n=== Testing {race_name} (n={len(df_race)}) ===")
    
    # Prepare data
    df_model = df_race.copy()
    df_model['income_bin'] = (df_model['income'].astype(str).str.strip().eq('>50K')).astype(int)
    X = df_model.drop(columns=['income', 'income_bin'])
    y = df_model['income_bin']
    
    # Check class distribution
    print(f"Class distribution: {y.value_counts().to_dict()}")
    
    # Identify column types
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric_cols = X.select_dtypes(exclude=['object', 'category']).columns.tolist()
    
    # Preprocessing pipeline
    numeric_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
    ])
    
    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    
    preprocess = ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, numeric_cols),
            ('cat', categorical_pipeline, categorical_cols),
        ],
        remainder='drop'
    )
    
    # Test different depths
    results = []
    
    for depth in max_depth_range:
        # Create model with specific depth
        model = RandomForestClassifier(
            n_estimators=100,  # Fewer trees for faster testing
            max_depth=depth,
            min_samples_split=2,
            min_samples_leaf=1,
            n_jobs=-1,
            random_state=42
        )
        
        clf = Pipeline(steps=[('prep', preprocess), ('rf', model)])
        
        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, random_state=42, stratify=y
        )
        
        # Fit and predict
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        
        results.append({'depth': depth, 'accuracy': acc})
        
        print(f"Depth {depth:2d}: Accuracy = {acc:.3f}")
        
        # Stop if we reach 90% accuracy
        if acc >= 0.90:
            print(f"*** 90% accuracy reached at depth {depth} for {race_name} ***")
            break
    
    return pd.DataFrame(results)

# Test for White individuals first
print("Testing tree depth requirements for different races...")
white_results = test_tree_depth_for_race(df, 'White')


Testing tree depth requirements for different races...

=== Testing White (n=41762) ===
Class distribution: {0: 34645, 1: 7117}
Depth  1: Accuracy = 0.830
Depth  2: Accuracy = 0.830
Depth  3: Accuracy = 0.830
Depth  4: Accuracy = 0.833
Depth  5: Accuracy = 0.837
Depth  6: Accuracy = 0.836
Depth  7: Accuracy = 0.839
Depth  8: Accuracy = 0.841
Depth  9: Accuracy = 0.843
Depth 10: Accuracy = 0.844
Depth 11: Accuracy = 0.844
Depth 12: Accuracy = 0.845
Depth 13: Accuracy = 0.847
Depth 14: Accuracy = 0.845
Depth 15: Accuracy = 0.847
Depth 16: Accuracy = 0.845
Depth 17: Accuracy = 0.846
Depth 18: Accuracy = 0.846
Depth 19: Accuracy = 0.844
Depth 20: Accuracy = 0.844


In [7]:
# Test all races and compare results
print("Checking available races in dataset:")
print(df['race'].value_counts())

# Test each race
races_to_test = ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other']
all_results = {}

for race in races_to_test:
    results = test_tree_depth_for_race(df, race)
    if results is not None:
        all_results[race] = results

# Summary of results
print("\n" + "="*60)
print("SUMMARY: Minimum tree depth for 90% accuracy")
print("="*60)

for race, results_df in all_results.items():
    if len(results_df) > 0:
        max_acc = results_df['accuracy'].max()
        depth_at_90 = results_df[results_df['accuracy'] >= 0.90]['depth'].min()
        
        if pd.isna(depth_at_90):
            print(f"{race:20s}: Max accuracy {max_acc:.3f} (never reached 90%)")
        else:
            print(f"{race:20s}: Depth {depth_at_90:2d} needed for 90% accuracy")
    else:
        print(f"{race:20s}: No results available")


Checking available races in dataset:
race
White                 41762
Black                  4685
Asian-Pac-Islander     1519
Amer-Indian-Eskimo      470
Other                   406
Name: count, dtype: int64

=== Testing White (n=41762) ===
Class distribution: {0: 34645, 1: 7117}
Depth  1: Accuracy = 0.830
Depth  2: Accuracy = 0.830
Depth  3: Accuracy = 0.830
Depth  4: Accuracy = 0.833
Depth  5: Accuracy = 0.837
Depth  6: Accuracy = 0.836
Depth  7: Accuracy = 0.839
Depth  8: Accuracy = 0.841
Depth  9: Accuracy = 0.843
Depth 10: Accuracy = 0.844
Depth 11: Accuracy = 0.844
Depth 12: Accuracy = 0.845
Depth 13: Accuracy = 0.847
Depth 14: Accuracy = 0.845
Depth 15: Accuracy = 0.847
Depth 16: Accuracy = 0.845
Depth 17: Accuracy = 0.846
Depth 18: Accuracy = 0.846
Depth 19: Accuracy = 0.844
Depth 20: Accuracy = 0.844

=== Testing Black (n=4685) ===
Class distribution: {0: 4298, 1: 387}
Depth  1: Accuracy = 0.918
*** 90% accuracy reached at depth 1 for Black ***

=== Testing Asian-Pac-Islander 