The Adult Income dataset is a binary classification problem where the goal is to predict whether an individual's annual income exceeds 50,000 dollars (>50K) or not (<=50K) based on various personal attributes. This dataset is provided by the UCI Machine Learning Repository and is commonly used in machine learning tasks.

1. Loading and Preprocessing the Adult Income Dataset
The dataset includes various features such as age, occupation, education level, gender, and others. Since it contains categorical data, these need to be converted into numerical representations for use in machine learning models.

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import os
import numpy as np

PATH_DATA = 'Income_Dataset'
PATH_DATA_FILE = os.path.join(PATH_DATA, 'Dataset')

columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

if not os.path.exists(PATH_DATA):
    os.makedirs(PATH_DATA)
else:
    pass
if not os.path.isfile(PATH_DATA_FILE):
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    income_data = pd.read_csv(url, names=columns, na_values=' ?', skipinitialspace=True)
    income_data.to_csv(PATH_DATA_FILE, index=False)
else:
    income_data = pd.read_csv(PATH_DATA_FILE)

print(income_data.head())

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

2. Data preprocessing
Remove rows with missing values

In [49]:
income_data.dropna(inplace=True)
print(income_data.head())

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

Convert categorical features to numerical using LabelEncoder

In [50]:
label_encoder = LabelEncoder()
income_data['workclass'] = label_encoder.fit_transform(income_data['workclass'])
income_data['education'] = label_encoder.fit_transform(income_data['education'])
income_data['marital-status'] = label_encoder.fit_transform(income_data['marital-status'])
income_data['occupation'] = label_encoder.fit_transform(income_data['occupation'])
income_data['relationship'] = label_encoder.fit_transform(income_data['relationship'])
income_data['race'] = label_encoder.fit_transform(income_data['race'])
income_data['sex'] = label_encoder.fit_transform(income_data['sex'])
income_data['native-country'] = label_encoder.fit_transform(income_data['native-country'])
income_data['income'] = label_encoder.fit_transform(income_data['income'])

print(f"workclass range:{min(income_data['workclass'])}~{max(income_data['workclass'])}")
print(f"education range:{min(income_data['education'])}~{max(income_data['education'])}")
print(f"marital-status range:{min(income_data['marital-status'])}~{max(income_data['marital-status'])}")
print(f"occupation range:{min(income_data['occupation'])}~{max(income_data['occupation'])}")
print(f"relationship range:{min(income_data['relationship'])}~{max(income_data['relationship'])}")
print(f"race range:{min(income_data['race'])}~{max(income_data['race'])}")
print(f"sex range:{min(income_data['sex'])}~{max(income_data['sex'])}")
print(f"native-country range:{min(income_data['native-country'])}~{max(income_data['native-country'])}")
print(f"income range:{min(income_data['income'])}~{max(income_data['income'])}")

print("Memory usage before conversion:")
# Check memory usage before conversion
print(f'{round(income_data.memory_usage(deep=True).sum() / 1024, 3)} KB')

# Convert multiple columns to smaller integer types
income_data['workclass'] = income_data['workclass'].astype('int8')
income_data['education'] = income_data['education'].astype('int8')
income_data['marital-status'] = income_data['marital-status'].astype('int8')
income_data['occupation'] = income_data['occupation'].astype('int8')
income_data['relationship'] = income_data['relationship'].astype('float16')
income_data['race'] = income_data['race'].astype('int8')
income_data['sex'] = income_data['sex'].astype('int8')
income_data['native-country'] = income_data['native-country'].astype('int8')
income_data['income'] = income_data['income'].astype('int8')

print("Memory usage after conversion to int8:")
# Check memory usage after conversion
print(f'{round(income_data.memory_usage(deep=True).sum() / 1024, 3)} KB')

print(income_data.head())

workclass range:0~8
education range:0~15
marital-status range:0~6
occupation range:0~14
relationship range:0~5
race range:0~4
sex range:0~1
native-country range:0~41
income range:0~1
Memory usage before conversion:
3815.867 KB
Memory usage after conversion to int8:
1844.4 KB
   age  workclass  fnlwgt  education  education-num  marital-status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  sex  capital-gain  capital-loss  \
0           1           1.0     4    1          2174             0   
1           4           0.0     4    1             0             0   
2           6           1.0     4    1             0             0   
3           6    

3. Split the data into features (X) and target (y)

In [51]:
X = income_data.drop('income', axis=1)  # Features
y = income_data['income']  # Target: income (0 for <=50K, 1 for >50K)

4. Split the dataset into training and testing sets (70% train, 30% test)

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

5. Create and train the Random Forest model

In [53]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

6. Make predictions on the test set

In [54]:
y_pred = model.predict(X_test)


7. Evaluate the model & Print the evaluation metrics

In [55]:
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.8585320913092436
F1 Score: 0.6777052238805971
Precision: 0.7360688956433638
Recall: 0.6279170267934313


8. Generate random data for prediction
We will generate random values for each feature

In [57]:
random_data = {
    'age': np.random.randint(18, 90),
    'workclass': np.random.choice(income_data['workclass'].unique()),
    'fnlwgt': np.random.randint(10000, 1000000),
    'education': np.random.choice(income_data['education'].unique()),
    'education-num': np.random.randint(1, 16),
    'marital-status': np.random.choice(income_data['marital-status'].unique()),
    'occupation': np.random.choice(income_data['occupation'].unique()),
    'relationship': np.random.choice(income_data['relationship'].unique()),
    'race': np.random.choice(income_data['race'].unique()),
    'sex': np.random.choice(income_data['sex'].unique()),
    'capital-gain': np.random.randint(0, 10000),
    'capital-loss': np.random.randint(0, 5000),
    'hours-per-week': np.random.randint(1, 99),
    'native-country': np.random.choice(income_data['native-country'].unique())
}

# Convert the random data into a DataFrame
random_df = pd.DataFrame([random_data])

# 7. Predict using the trained model
# Ensure the random data is in the same format as the training data
random_prediction = model.predict(random_df)

# 8. Output the results
print("Randomly generated data:")
print(random_df)
print("\nPredicted income class (0: <=50K, 1: >50K):", random_prediction[0])

Randomly generated data:
   age  workclass  fnlwgt  education  education-num  marital-status  \
0   57          5  425387          4             15               6   

   occupation  relationship  race  sex  capital-gain  capital-loss  \
0          11           1.0     1    0          8332          2768   

   hours-per-week  native-country  
0              16              16  

Predicted income class (0: <=50K, 1: >50K): 1
