In [4]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb

In [5]:
# fetch dataset 
adult = fetch_ucirepo(id=2) 

# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 

df = X.join(y)

# metadata 
print(adult.metadata) 
  
# variable information 
print(adult.variables)

# drop rows with missing values
df = df.dropna()

# drop duplicates
df = df.drop_duplicates()

# drop redundant columns
df = df.drop('education', axis=1)

# Format column, get rid of variations
df['income'] = df['income'].replace('>50K.', '>50K')
df['income'] = df['income'].replace('<=50K.', '<=50K')

# Binarize column
df['income'] = np.where(df['income'] == '>50K', 1, 0)
df.rename(columns={'income':'income>50K'}, inplace=True)

# Extract features and target
df3 = df.copy()
y3 = df3.iloc[:,-1:]
X3 = df3.iloc[:,1:-1]

# Create encoder
encoder = LabelEncoder()

# Encode categorical data
X3['sex'] = encoder.fit_transform(X3['sex'])

# One hot encode categorical data - marital-status
X3['marital-status'].replace(['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'], 'Married', inplace=True)
X3['marital-status'].replace(['Divorced', 'Separated', 'Widowed'], 'Divorced', inplace=True)
X3 = pd.get_dummies(X3, columns=['marital-status'], dtype=int)

# One hot encode categorical data - workclass
X3['workclass'].replace(['Self-emp-not-inc', 'Self-emp-inc'], 'self-emp', inplace=True)
X3['workclass'].replace(['Federal-gov', 'Local-gov', 'State-gov'], 'government', inplace=True)
X3['workclass'].replace(['Never-worked', 'Without-pay'], 'jobless', inplace=True)
X3 = pd.get_dummies(X3, columns=['workclass'], dtype=int)

# One hot encode categorical data - occupation
X3['occupation'].replace(['Tech-support', 'Craft-repair', 'Machine-op-inspct'], 'Technical/Support', inplace=True)
X3['occupation'].replace(['Other-service', 'Priv-house-serv', 'Protective-serv'], 'Service', inplace=True)
X3['occupation'].replace(['Exec-managerial', 'Adm-clerical'], 'Management/Administration', inplace=True)
X3['occupation'].replace(['Sales'], 'Sales', inplace=True)
X3['occupation'].replace(['Prof-specialty'], 'Professional', inplace=True)
X3['occupation'].replace(['Handlers-cleaners', 'Farming-fishing', 'Transport-moving'], 'Manual Labor', inplace=True)
X3['occupation'].replace(['Armed-Forces'], 'Specialized', inplace=True)
X3 = pd.get_dummies(X3, columns=['occupation'], dtype=int)

# One hot encode categorical data - relationship
X3['relationship'].replace(['Wife', 'Husband'], 'Spouse', inplace=True)
X3['relationship'].replace(['Own-child'], 'Child', inplace=True)
X3['relationship'].replace(['Other-relative'], 'Other Relatives', inplace=True)
X3['relationship'].replace(['Not-in-family', 'Unmarried'], 'Non-Family', inplace=True)
X3 = pd.get_dummies(X3, columns=['relationship'], dtype=int)

# One hot encode categorical data - relationship
X3['race'].replace(['White'], 'White', inplace=True)
X3['race'].replace(['Asian-Pac-Islander'], 'Asian/Pacific Islander', inplace=True)
X3['race'].replace(['Amer-Indian-Eskimo'], 'American Indian/Eskimo', inplace=True)
X3['race'].replace(['Black'], 'Black', inplace=True)
X3['race'].replace(['Other'], 'Other', inplace=True)
X3 = pd.get_dummies(X3, columns=['race'], dtype=int)

X3['native-country'].replace(['United-States', 'Canada', 'Outlying-US(Guam-USVI-etc)', 'Puerto-Rico', 'Mexico', 'Cuba', 'Jamaica', 'Dominican-Republic', 'Haiti', 'Guatemala', 'Honduras', 'El-Salvador', 'Nicaragua', 'Trinadad&Tobago', 'Panama'], 'North America', inplace=True)
X3['native-country'].replace(['England', 'Germany', 'Greece', 'Italy', 'Poland', 'Portugal', 'Ireland', 'France', 'Scotland', 'Yugoslavia', 'Hungary', 'Holand-Netherlands'], 'Europe', inplace=True)
X3['native-country'].replace(['Cambodia', 'India', 'Japan', 'China', 'Philippines', 'Vietnam', 'Taiwan', 'Laos', 'Iran', 'Thailand', 'Hong'], 'Asia', inplace=True)
X3['native-country'].replace(['Ecuador', 'Columbia', 'Peru'], 'Other', inplace=True)

X3 = pd.get_dummies(X3, columns=['native-country'], dtype=int)

display(X3.iloc[:4, :])

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

Unnamed: 0,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,marital-status_Divorced,marital-status_Married,marital-status_Never-married,workclass_?,...,race_Asian/Pacific Islander,race_Black,race_Other,race_White,native-country_?,native-country_Asia,native-country_Europe,native-country_North America,native-country_Other,native-country_South
0,77516,13,1,2174,0,40,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
1,83311,13,1,0,0,13,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
2,215646,9,1,0,0,40,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,234721,7,1,0,0,40,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0


In [7]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X3, y3, test_size=0.2, random_state=42)

# Convert the datasets to DMatrix, which is a high-performance XGBoost data structure
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Specify training parameters
params = {
    'max_depth': 3,  # Maximum depth of a tree
    'eta': 0.3,  # Learning rate
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',  # Evaluation metric for validation data
}
num_rounds = 100  # Number of boosting rounds

# Train the model
bst = xgb.train(params, dtrain, num_rounds)

# Predict on the testing set
y_pred_prob = bst.predict(dtest)
y_pred = np.where(y_pred_prob > 0.5, 1, 0)  # Convert probabilities to binary output

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8635360857232902
Confusion Matrix:
[[6798  376]
 [ 923 1422]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      7174
           1       0.79      0.61      0.69      2345

    accuracy                           0.86      9519
   macro avg       0.84      0.78      0.80      9519
weighted avg       0.86      0.86      0.86      9519

