In [62]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd 
from sklearn.model_selection import train_test_split

In [63]:
# Read dataset
df = pd.read_csv("adult_data.csv", delimiter=",")

# Printing head of the data to visually view it
print(df.head)

<bound method NDFrame.head of        age         workclass  fnlwgt  education  education-num  \
0       39         State-gov   77516  Bachelors             13   
1       50  Self-emp-not-inc   83311  Bachelors             13   
2       38           Private  215646    HS-grad              9   
3       53           Private  234721       11th              7   
4       28           Private  338409  Bachelors             13   
...    ...               ...     ...        ...            ...   
48837   39           Private  215419  Bachelors             13   
48838   64               NaN  321403    HS-grad              9   
48839   38           Private  374983  Bachelors             13   
48840   44           Private   83891  Bachelors             13   
48841   35      Self-emp-inc  182148  Bachelors             13   

           marital-status         occupation    relationship  \
0           Never-married       Adm-clerical   Not-in-family   
1      Married-civ-spouse    Exec-managerial     

In [64]:
# Strip whitespaces and convert to lowercase
df.columns = df.columns.str.strip().str.lower()

# Define categorical columns
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

# Check if all specified categorical columns exist in the DataFrame
for col in categorical_columns:
    if col not in df.columns:
        print(f"Column '{col}' not found in DataFrame.")


# Convert categorical columns to one-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Check the transformed DataFrame
print(df_encoded.head())


   age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0   39   77516             13          2174             0              40   
1   50   83311             13             0             0              13   
2   38  215646              9             0             0              40   
3   53  234721              7             0             0              40   
4   28  338409             13             0             0              40   

  income  workclass_?  workclass_Federal-gov  workclass_Local-gov  ...  \
0  <=50K        False                  False                False  ...   
1  <=50K        False                  False                False  ...   
2  <=50K        False                  False                False  ...   
3  <=50K        False                  False                False  ...   
4  <=50K        False                  False                False  ...   

   native-country_Portugal  native-country_Puerto-Rico  \
0                    False        

In [65]:
# Define the features and target variable
X = df_encoded.drop('income', axis=1)  # All columns except 'income'
y = df_encoded['income']  # The target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")


Training set size: 39073
Testing set size: 9769


In [66]:
from sklearn.naive_bayes import MultinomialNB

# Create a Multinomial Naive Bayes model
model = MultinomialNB()

# Train the model on the training data
model.fit(X_train, y_train)

In [67]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy score, formatted to two decimal places
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.79


In [68]:
from sklearn.metrics import confusion_matrix

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[7140  274]
 [1812  543]]


In [69]:
unique_classes_y_test = pd.Series(y_test).unique()
print("Unique classes in y_test:", unique_classes_y_test)

unique_classes_y_pred = pd.Series(y_pred).unique()
print("Unique classes in y_pred:", unique_classes_y_pred)


Unique classes in y_test: ['<=50K' '>50K']
Unique classes in y_pred: ['<=50K' '>50K']


In [70]:
class_distribution_y_test = pd.Series(y_test).value_counts()
print("Class distribution in y_test:")
print(class_distribution_y_test)

class_distribution_y_pred = pd.Series(y_pred).value_counts()
print("Class distribution in y_pred:")
print(class_distribution_y_pred)


Class distribution in y_test:
income
<=50K    7414
>50K     2355
Name: count, dtype: int64
Class distribution in y_pred:
<=50K    8952
>50K      817
Name: count, dtype: int64


In [71]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

# Calculate evaluation metrics for the multiclass classification problem
precision = precision_score(y_test, y_pred, pos_label='>50K')
recall = recall_score(y_test, y_pred, pos_label='>50K')
f1 = f1_score(y_test, y_pred, pos_label='>50K')

# Print the evaluation metrics
print(f"Precision (weighted): {precision:.2f}")
print(f"Recall (weighted): {recall:.2f}")
print(f"F1 Score (weighted): {f1:.2f}")

Confusion Matrix:
[[7140  274]
 [1812  543]]
Precision (weighted): 0.66
Recall (weighted): 0.23
F1 Score (weighted): 0.34


In [72]:
import numpy as np

cm = confusion_matrix(y_test, y_pred)

# Initialize lists to store sensitivity and specificity for each class
sensitivities = []
specificities = []

# Calculate sensitivity and specificity for each class
for i in range(cm.shape[0]):
    TP = cm[i, i]
    FN = cm[i, :].sum() - TP
    FP = cm[:, i].sum() - TP
    TN = cm.sum() - (TP + FP + FN)
    
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)
    
    sensitivities.append(sensitivity)
    specificities.append(specificity)

# Print the sensitivity and specificity for each class
for i in range(len(sensitivities)):
    print(f"Class {i}:")
    print(f"Sensitivity (Recall): {sensitivities[i]:.2f}")
    print(f"Specificity: {specificities[i]:.2f}")


Class 0:
Sensitivity (Recall): 0.96
Specificity: 0.23
Class 1:
Sensitivity (Recall): 0.23
Specificity: 0.96


In [73]:
# Predict the probabilities for each class
proba = model.predict_proba(X_test)

# Extract the probability of making over $50,000 a year
# The second column (index 1) contains the probability of the 'over $50K' class
posterior_probabilities = proba[:, 1]

# Print the posterior probabilities
print("Posterior probabilities of making over $50,000 a year:")
print(posterior_probabilities)


Posterior probabilities of making over $50,000 a year:
[0. 0. 0. ... 0. 0. 0.]
