In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

In [8]:
# Download Data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status","occupation", "relationship",
           "race", "sex", "capital-gain", "capital-loss","hours-per-week", "native-country", "income"]
data = pd.read_csv(url, names=columns, na_values=' ?', skipinitialspace=True)

In [9]:
# Preprocess the Data
# Drop rows with missing values
data = data.dropna()
# Encode categorical variables
data = pd.get_dummies(data, columns=["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"])
# Map income to binary values (0 for <=50K, 1 for >50K)
data['income'] = data['income'].map({'<=50K': 0, '>50K': 1})
# Split data into features (X) and target variable (y)
X = data.drop('income', axis=1)
y = data['income']

In [29]:
# Train Naive Bayes Classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=53)
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

In [30]:
# Predict and Compute Sensitivity and Specificity
y_pred = nb_classifier.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

Sensitivity: 0.30111524163568776
Specificity: 0.9518269034496836


In [32]:
# Compute Posterior Probability
posterior_probs = nb_classifier.predict_proba(X_test)
positive_class_probs = posterior_probs[:, 1]  # Probability of income >50K
print("Posterior Probability of earning over 50K a year:", positive_class_probs)

Posterior Probability of earning over 50K a year: [0.00205187 0.01043429 0.0013923  ... 0.0018677  0.02411883 0.01130518]
