In [None]:
import pandas as pd

# Column names applied to both CSV files, in file order (names= is passed to
# read_csv, so the files are read without a header row).
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]

# Fields are separated by a comma followed by optional whitespace. The pattern
# is a raw string so that `\s` reaches the regex engine unchanged instead of
# being parsed as an (invalid) Python string escape. A regex separator requires
# engine='python'.
FIELD_SEP = r',\s*'

# Load training data
train = pd.read_csv("train.csv", names=column_names, sep=FIELD_SEP, engine='python')

# Load test data (skiprows=1 discards the first line of the file)
test = pd.read_csv("test.csv", names=column_names, skiprows=1, sep=FIELD_SEP, engine='python')

# Check the first few rows
print(train.head())


   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

In [4]:
import numpy as np
from sklearn.impute import SimpleImputer

# Remove every '.' from the test labels (literal match, not a regex).
# NOTE(review): presumably the test file spells the income classes with a
# trailing '.', unlike train - confirm against the raw file.
# assign() returns a new frame, so the frame built by the loading cell is not
# modified in place.
test = test.assign(income=test['income'].str.replace('.', '', regex=False))

# Replace '?' with np.nan and drop every row that contains a missing value.
# New frames are bound to `train` / `test` instead of using inplace=True, so
# the originally loaded objects stay untouched.
train = train.replace('?', np.nan).dropna()
test = test.replace('?', np.nan).dropna()

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Combine datasets for consistent label encoding. The number of training rows
# is recorded up front so the split below does not rely on `train` keeping the
# same length after it is reassigned.
n_train = len(train)
combined = pd.concat([train, test])

# Encode categorical variables: every object-dtype column gets its own
# LabelEncoder fitted on train + test together. Each fitted encoder is kept in
# `label_encoders` so the integer codes can be mapped back later with
# label_encoders[col].inverse_transform(...). As before, `le` still refers to
# the encoder of the last encoded column once the loop finishes.
label_encoders = {}
for col in combined.select_dtypes(include='object').columns:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col])
    label_encoders[col] = le

# Split back by position (the concatenated index contains repeated labels, so
# iloc is used). .copy() yields independent frames, which keeps later column
# assignments on `train` / `test` from raising SettingWithCopyWarning.
train = combined.iloc[:n_train].copy()
test = combined.iloc[n_train:].copy()

# Features and target
X_train = train.drop('income', axis=1)
y_train = train['income']
X_test = test.drop('income', axis=1)
y_test = test['income']


In [5]:
# Fit a 100-tree random forest on the training split. random_state is fixed so
# repeated runs build the same forest; fit() returns the estimator itself, so
# `model` is the fitted classifier.
model = RandomForestClassifier(random_state=0, n_estimators=100).fit(X_train, y_train)

# Predict the test split and print per-class precision, recall and F1.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.88      0.92      0.90     11360
           1       0.73      0.62      0.67      3700

    accuracy                           0.85     15060
   macro avg       0.80      0.77      0.79     15060
weighted avg       0.84      0.85      0.85     15060

