In [1]:
import pandas as pd

# Read dataset
df = pd.read_csv("adult_data.csv")

# Printing head of the data to visually view it
df.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [2]:
df.dropna(inplace=True)

def convert_income(x):
    return 1 if x.strip() == '>50K' else 0

df['income'] = df['income'].apply(convert_income)

# Convert categorical columns to one-hot encoding
categorical_columns = ['workclass', 'education', 'relationship', 'race', 'marital-status', 'occupation', 'sex', 'native-country']
df = pd.get_dummies(df, columns=categorical_columns)

print(df.head())


   age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0   39   77516             13          2174             0              40   
1   50   83311             13             0             0              13   
2   38  215646              9             0             0              40   
3   53  234721              7             0             0              40   
4   28  338409             13             0             0              40   

   income  workclass_?  workclass_Federal-gov  workclass_Local-gov  ...  \
0       0        False                  False                False  ...   
1       0        False                  False                False  ...   
2       0        False                  False                False  ...   
3       0        False                  False                False  ...   
4       0        False                  False                False  ...   

   native-country_Portugal  native-country_Puerto-Rico  \
0                    False  

In [3]:
from sklearn.model_selection import train_test_split

# Define the features and target variable
X = df.drop('income', axis=1)  # All columns except 'income'
y = df['income']  # The target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")


Training set size: 38096
Testing set size: 9525


In [4]:
from sklearn.naive_bayes import GaussianNB

# Create a Multinomial Naive Bayes model
model = GaussianNB()

# Train the model on the training data
model.fit(X_train, y_train)

In [5]:
from sklearn.metrics import accuracy_score

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy score, formatted to two decimal places
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.79


In [6]:
from sklearn.metrics import confusion_matrix

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[6806  364]
 [1648  707]]


In [7]:
unique_classes_y_test = pd.Series(y_test).unique()
print("Unique classes in y_test:", unique_classes_y_test)

unique_classes_y_pred = pd.Series(y_pred).unique()
print("Unique classes in y_pred:", unique_classes_y_pred)


Unique classes in y_test: [0 1]
Unique classes in y_pred: [0 1]


In [8]:
class_distribution_y_test = pd.Series(y_test).value_counts()
print("Class distribution in y_test:")
print(class_distribution_y_test)

class_distribution_y_pred = pd.Series(y_pred).value_counts()
print("Class distribution in y_pred:")
print(class_distribution_y_pred)


Class distribution in y_test:
income
0    7170
1    2355
Name: count, dtype: int64
Class distribution in y_pred:
0    8454
1    1071
Name: count, dtype: int64


In [9]:
from sklearn.metrics import confusion_matrix, classification_report

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

print(classification_report(y_test, y_pred))

Confusion Matrix:
[[6806  364]
 [1648  707]]
              precision    recall  f1-score   support

           0       0.81      0.95      0.87      7170
           1       0.66      0.30      0.41      2355

    accuracy                           0.79      9525
   macro avg       0.73      0.62      0.64      9525
weighted avg       0.77      0.79      0.76      9525



In [10]:
cm = confusion_matrix(y_test, y_pred)

# Initialize lists to store sensitivity and specificity for each class
sensitivities = []
specificities = []

# Calculate sensitivity and specificity for each class
for i in range(cm.shape[0]):
    TP = cm[i, i]
    FN = cm[i, :].sum() - TP
    FP = cm[:, i].sum() - TP
    TN = cm.sum() - (TP + FP + FN)
    
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)
    
    sensitivities.append(sensitivity)
    specificities.append(specificity)

# Print the sensitivity and specificity for each class
for i in range(len(sensitivities)):
    print(f"Class {i}:")
    print(f"Sensitivity (Recall): {sensitivities[i]:.2f}")
    print(f"Specificity: {specificities[i]:.2f}")


Class 0:
Sensitivity (Recall): 0.95
Specificity: 0.30
Class 1:
Sensitivity (Recall): 0.30
Specificity: 0.95


In [11]:
# Predict the probabilities for each class
proba = model.predict_proba(X_test)[:, 1]

# Print the posterior probabilities
print("Posterior probabilities of making over $50,000 a year:")
print(proba)


Posterior probabilities of making over $50,000 a year:
[0.00928059 0.00264511 0.0116557  ... 0.00603148 1.         0.01547093]
