In [78]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB

from mixed_naive_bayes import MixedNB # https://pypi.org/project/mixed-naive-bayes/#api-documentation

### Loading the cleaned dataset

In [79]:
data = load_dataset('../data/assignment2_income_cleaned.xlsx')

### Feature Engineering (encoding) & Train-Test Split

In [80]:
# Splitting the data into features (X) and target (y)
X, y = get_features_and_target(data, 'income')
# Encoding the features and target, and excluding some columns
X_encoded, y_encoded = encode_all_features(X, y, [])
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [81]:
X_train.head()

Unnamed: 0,age,education,workinghours,ability to speak english,occupation_Construction/Extraction,occupation_Counseling/Mental Health Services,occupation_Education,occupation_Entertainment,"occupation_Farming, Fishing, Forestry",occupation_Finance/Accounting,...,sex_Female,sex_Male,marital status_Divorced,marital status_Husband,marital status_Never married,marital status_Separated,marital status_Widowed,marital status_Wife,gave birth this year_No,gave birth this year_Yes
6317,22,16,36,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
740,61,22,40,1,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,0
3781,48,16,40,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,1,0
7850,62,18,65,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
2963,53,19,44,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,1,0


### Model

Here, we quickly train and evaluate a Gaussian Naive Bayes model for demonstration.

In [82]:
# Naive Bayes model (Gaussian)
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_preds)

print(classification_report(y_test, nb_preds))
print("Naive Bayes Accuracy:", nb_accuracy)

              precision    recall  f1-score   support

           0       0.82      0.73      0.78      1175
           1       0.58      0.71      0.64       625

    accuracy                           0.72      1800
   macro avg       0.70      0.72      0.71      1800
weighted avg       0.74      0.72      0.73      1800

Naive Bayes Accuracy: 0.7227777777777777


### Combined Naive Bayes Model: Custom Implementation

Below we implement and train a Combined Naive Bayes model for mixed data types (categorical and numerical).

In [83]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB, CategoricalNB

class CombinedNB:
    """
    Combined Naive Bayes model for mixed data types (categorical and numerical)
    This looks a bit like a sklearn classifier class, but it's not (it doesn't inherit from anything). It's just a simple class that implements the fit and predict methods only, for two combined Naive Bayes models.
    """
    def __init__(self, num_feat: List[str], cat_model: CategoricalNB = CategoricalNB(), num_model: GaussianNB = GaussianNB()):
        self.cat_model: CategoricalNB = cat_model
        self.num_model: GaussianNB = num_model
        self.num_feat: List[str] = num_feat # numerical features

    def fit(self, X_train, y_train):
        """
        Fit the combined model to the training data. We fit the categorical model to the categorical features and the numerical model to the numerical features.
        :param X_train: training dataset with mixed features
        :param y_train: training dataset with target variable
        :return: 
        """
        # Splitting data into categorical and numerical features
        X_train_cat = X_train.drop(columns=self.num_feat)
        X_train_num = X_train[self.num_feat]

        self.cat_model.fit(X_train_cat, y_train)
        self.num_model.fit(X_train_num, y_train)

    def predict(self, X_test):
        """
        Predict the target variable for the test data.
        
        We use the formula: 
        P(Class | C1, ..., Cn, N1, ..., Nm) ~= Mult{1, n}(P(Ci | Class)) * Mult{1, m}(P(Nj | Class)) * P(Class)
        
        where Ci are the categorical features, Nj are the numerical features, and Class is the target variable. 
        Concerning the scores, we get the following:
        Cat * Num * Prior = (Score_cat * Score_num) / Prior

        Note: CategoricalNB uses Laplace smoothing by default.
        
        :param X_test: test dataset with mixed features
        :return: predicted target variable
        """
        cat_probs = self.cat_model.predict_proba(X_test.drop(columns=self.num_feat))
        num_probs = self.num_model.predict_proba(X_test[self.num_feat])
        combined_probs = (cat_probs * num_probs) / self.num_model.class_prior_
        # combined_probs /= combined_probs.sum(axis=1, keepdims=True) # normalize the probabilities
        return np.argmax(combined_probs, axis=1)

#### Testing the Combined Naive Bayes Model

Below is a small test function for the Combined Naive Bayes model. We will test the model on a small dataset to check if it's generally working correctly. The test is based on the example from the lecture slides by Toon Calders.

In [84]:
def test_combined_nb_model(print_mapping=False):
    data_ = {
    'Refund': ['Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No'],
    'Marital Status': ['Single', 'Married', 'Single', 'Married', 'Divorced', 'Married', 'Divorced', 'Single', 'Married', 'Single', 'Divorced'],
    'Taxable Income': [125, 100, 70, 120, 95, 60, 220, 85, 75, 90, 120],
    'Evade': ['No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'No']
    }
    
    df = pd.DataFrame(data_)
    
    # Encoding features and target
    le_ = LabelEncoder()
    for col in df.columns:
        if col not in ['Taxable Income']:
            enc = le_.fit_transform(df[col])
            df[col] = enc
            # print encoding mapping
            if print_mapping:
                print(f"Mapping for {col}:")
                for original, encoded in zip(le_.classes_, range(len(le_.classes_))):
                    print(f"{original} -> {encoded}")
    
    # Separate features (X) and target variable (y)
    X_ = df.iloc[:, :-1]
    y_ = df.iloc[:, -1]
    
    # Splitting data into training and test sets
    X_train_ = X_.iloc[:-1]
    X_test_ = X_.iloc[-1:]
    y_train_ = y_.iloc[:-1]
    y_test_ = y_.iloc[-1:]
    
    comb_nb = CombinedNB(num_feat=['Taxable Income'])
    comb_nb.fit(X_train_, y_train_)
    comb_nb_preds = comb_nb.predict(X_test_)
        
    assert (comb_nb.cat_model.category_count_[0][0] == [4., 3.]).all()
    assert (comb_nb.cat_model.category_count_[0][1] == [3., 0.]).all()
    assert (comb_nb.cat_model.category_count_[1][0] == [1., 4., 2.]).all()
    assert (comb_nb.cat_model.category_count_[1][1] == [1., 0., 2.]).all()
    assert (comb_nb.num_model.class_count_ == np.array([7., 3.])).all()
    assert (comb_nb.num_model.class_prior_ == np.array([0.7, 0.3])).all()
    assert (comb_nb.num_model.theta_ == np.array([[110.], [ 90.]])).all()
    assert (np.round(comb_nb.num_model.var_) == np.round(np.array([[2550.00000187], [16.66666854]]))).all()
    assert comb_nb_preds == 0
    
    # convert log probabilities to probabilities
    feature_probs_0 = np.exp(comb_nb.cat_model.feature_log_prob_[0])
    feature_probs_1 = np.exp(comb_nb.cat_model.feature_log_prob_[1])
    
    assert (np.round(feature_probs_0[0], 2) == np.round(np.array([0.55555556, 0.44444444]), 2)).all()
    assert (np.round(feature_probs_0[1], 2) == np.round(np.array([0.8, 0.2]), 2)).all()
    assert (np.round(feature_probs_1[0], 2) == np.round(np.array([0.2, 0.5, 0.3]), 2)).all()
    assert (np.round(feature_probs_1[1], 2) == np.round(np.array([0.33333333, 0.16666667, 0.5]), 2)).all()
    
    print("All tests passed successfully!")

In [85]:
test_combined_nb_model()

All tests passed successfully!


#### Training and Evaluating the Combined Naive Bayes Model

Since Naive Bayes uses the independence assumption, one-hot encoding features is not a smart idea. We will use label encoding for all categorical features. 

In [86]:
X_encoded = X.copy()
# encode all categorical features using label encoding
le = LabelEncoder()
for col in X_encoded.columns:
    if col not in ['education', 'ability to speak english', 'age', 'workinghours']:
        X_encoded[col] = le.fit_transform(X[col])
                
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

comb_nb = CombinedNB(num_feat=['age', 'workinghours'])
comb_nb.fit(X_train, y_train)
comb_nb_preds = comb_nb.predict(X_test)
comb_nb_accuracy = accuracy_score(y_test, comb_nb_preds)

print("Combined Naive Bayes Accuracy:", comb_nb_accuracy)
print(classification_report(y_test, comb_nb_preds))

Combined Naive Bayes Accuracy: 0.7738888888888888
              precision    recall  f1-score   support

           0       0.84      0.81      0.82      1175
           1       0.66      0.72      0.69       625

    accuracy                           0.77      1800
   macro avg       0.75      0.76      0.76      1800
weighted avg       0.78      0.77      0.78      1800


Try a different subset of features for the Combined Naive Bayes model.

In [87]:
X_encoded = X.copy()
columns_to_exclude = ['age', 'ability to speak english', 'workclass']
X_encoded = X_encoded.drop(columns=columns_to_exclude)
# encode all categorical features using label encoding
le = LabelEncoder()
for col in X_encoded.columns:
    if col not in ['education', 'ability to speak english', 'age', 'workinghours']:
        X_encoded[col] = le.fit_transform(X[col])
                
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

comb_nb = CombinedNB(num_feat=['workinghours'])
comb_nb.fit(X_train, y_train)
comb_nb_preds = comb_nb.predict(X_test)
comb_nb_accuracy = accuracy_score(y_test, comb_nb_preds)

print("Combined Naive Bayes Accuracy:", comb_nb_accuracy)
print(classification_report(y_test, comb_nb_preds))

Combined Naive Bayes Accuracy: 0.7716666666666666
              precision    recall  f1-score   support

           0       0.82      0.83      0.83      1175
           1       0.67      0.67      0.67       625

    accuracy                           0.77      1800
   macro avg       0.75      0.75      0.75      1800
weighted avg       0.77      0.77      0.77      1800


In [88]:
X_encoded = X.copy()
columns_to_exclude = ['gave birth this year', 'marital status']
X_encoded = X_encoded.drop(columns=columns_to_exclude)
# encode all categorical features using label encoding
le = LabelEncoder()
for col in X_encoded.columns:
    if col not in ['education', 'ability to speak english', 'age', 'workinghours']:
        X_encoded[col] = le.fit_transform(X[col])
                
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

comb_nb = CombinedNB(num_feat=['age', 'workinghours'])
comb_nb.fit(X_train, y_train)
comb_nb_preds = comb_nb.predict(X_test)
comb_nb_accuracy = accuracy_score(y_test, comb_nb_preds)

print("Combined Naive Bayes Accuracy:", comb_nb_accuracy)
print(classification_report(y_test, comb_nb_preds))

Combined Naive Bayes Accuracy: 0.7838888888888889
              precision    recall  f1-score   support

           0       0.82      0.85      0.84      1175
           1       0.70      0.65      0.68       625

    accuracy                           0.78      1800
   macro avg       0.76      0.75      0.76      1800
weighted avg       0.78      0.78      0.78      1800


In [89]:
X_encoded = X.copy()
columns_to_exclude = ['sex', 'gave birth this year', 'marital status']
X_encoded = X_encoded.drop(columns=columns_to_exclude)
# encode all categorical features using label encoding
le = LabelEncoder()
for col in X_encoded.columns:
    if col not in ['education', 'ability to speak english', 'age', 'workinghours']:
        X_encoded[col] = le.fit_transform(X[col])
                
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

comb_nb = CombinedNB(num_feat=['age', 'workinghours'])
comb_nb.fit(X_train, y_train)
comb_nb_preds = comb_nb.predict(X_test)
comb_nb_accuracy = accuracy_score(y_test, comb_nb_preds)

print("Combined Naive Bayes Accuracy:", comb_nb_accuracy)
print(classification_report(y_test, comb_nb_preds))

Combined Naive Bayes Accuracy: 0.7538888888888889
              precision    recall  f1-score   support

           0       0.80      0.84      0.82      1175
           1       0.66      0.60      0.63       625

    accuracy                           0.75      1800
   macro avg       0.73      0.72      0.72      1800
weighted avg       0.75      0.75      0.75      1800


#### Mixed Naive Bayes Model: library implementation

I only later found out this library implementation of Mixed Naive Bayes, although I don't know its internal wokrings. We can use it too to compare the results.

In [90]:
X_encoded = X.copy()
# encode all categorical features using label encoding
le = LabelEncoder()
for col in X_encoded.columns:
    if col not in ['education', 'ability to speak english', 'age', 'workinghours']:
        X_encoded[col] = le.fit_transform(X[col])
                
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# the library implementation of MixedNB requires the categorical features to be encoded as integers starting from 0, so we need to encode the education feature separately.
le = LabelEncoder()
X_train['education'] = le.fit_transform(X_train['education'])
X_test['education'] = le.transform(X_test['education'])

comb_nb = MixedNB(categorical_features=[1,2,3,4,6,7,8]) # indices of categorical features
comb_nb.fit(X_train, y_train)
comb_nb_preds = comb_nb.predict(X_test)
comb_nb_accuracy = accuracy_score(y_test, comb_nb_preds)

print("Mixed Naive Bayes Accuracy:", comb_nb_accuracy)
print(classification_report(y_test, comb_nb_preds))

Mixed Naive Bayes Accuracy: 0.7755555555555556
              precision    recall  f1-score   support

           0       0.84      0.81      0.82      1175
           1       0.66      0.72      0.69       625

    accuracy                           0.78      1800
   macro avg       0.75      0.76      0.76      1800
weighted avg       0.78      0.78      0.78      1800


In [91]:
X_encoded = X.copy()
columns_to_exclude = ['age', 'ability to speak english', 'workclass']
X_encoded = X_encoded.drop(columns=columns_to_exclude)
# encode all categorical features using label encoding
le = LabelEncoder()
for col in X_encoded.columns:
    if col not in ['education', 'ability to speak english', 'age', 'workinghours']:
        X_encoded[col] = le.fit_transform(X[col])
                
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# the library implementation of MixedNB requires the categorical features to be encoded as integers starting from 0, so we need to encode the education feature separately.
le = LabelEncoder()
X_train['education'] = le.fit_transform(X_train['education'])
X_test['education'] = le.transform(X_test['education'])
comb_nb = MixedNB(categorical_features=[0,1,2,4,5]) # indices of categorical features
comb_nb.fit(X_train, y_train)
comb_nb_preds = comb_nb.predict(X_test)
comb_nb_accuracy = accuracy_score(y_test, comb_nb_preds)

print("Mixed Naive Bayes Accuracy:", comb_nb_accuracy)
print(classification_report(y_test, comb_nb_preds))

Mixed Naive Bayes Accuracy: 0.7711111111111111
              precision    recall  f1-score   support

           0       0.82      0.83      0.82      1175
           1       0.67      0.67      0.67       625

    accuracy                           0.77      1800
   macro avg       0.75      0.75      0.75      1800
weighted avg       0.77      0.77      0.77      1800


In [92]:
X_encoded = X.copy()
columns_to_exclude = ['sex', 'gave birth this year', 'marital status']
X_encoded = X_encoded.drop(columns=columns_to_exclude)
# encode all categorical features using label encoding
le = LabelEncoder()
for col in X_encoded.columns:
    if col not in ['education', 'ability to speak english', 'age', 'workinghours']:
        X_encoded[col] = le.fit_transform(X[col])
                
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# the library implementation of MixedNB requires the categorical features to be encoded as integers starting from 0, so we need to encode the education feature separately.
le = LabelEncoder()
X_train['education'] = le.fit_transform(X_train['education'])
X_test['education'] = le.transform(X_test['education'])

comb_nb = MixedNB(categorical_features=[1,2,3,5]) # indices of categorical features
comb_nb.fit(X_train, y_train)
comb_nb_preds = comb_nb.predict(X_test)
comb_nb_accuracy = accuracy_score(y_test, comb_nb_preds)

print("Mixed Naive Bayes Accuracy:", comb_nb_accuracy)
print(classification_report(y_test, comb_nb_preds))

Mixed Naive Bayes Accuracy: 0.7533333333333333
              precision    recall  f1-score   support

           0       0.80      0.84      0.82      1175
           1       0.66      0.60      0.63       625

    accuracy                           0.75      1800
   macro avg       0.73      0.72      0.72      1800
weighted avg       0.75      0.75      0.75      1800


In [93]:
X_encoded = X.copy()
columns_to_exclude = ['gave birth this year', 'marital status']
X_encoded = X_encoded.drop(columns=columns_to_exclude)
# encode all categorical features using label encoding
le = LabelEncoder()
for col in X_encoded.columns:
    if col not in ['education', 'ability to speak english', 'age', 'workinghours']:
        X_encoded[col] = le.fit_transform(X[col])
                
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# the library implementation of MixedNB requires the categorical features to be encoded as integers starting from 0, so we need to encode the education feature separately.
le = LabelEncoder()
X_train['education'] = le.fit_transform(X_train['education'])
X_test['education'] = le.transform(X_test['education'])

comb_nb = MixedNB(categorical_features=[1,2,3,5,6]) # indices of categorical features
comb_nb.fit(X_train, y_train)
comb_nb_preds = comb_nb.predict(X_test)
comb_nb_accuracy = accuracy_score(y_test, comb_nb_preds)

print("Mixed Naive Bayes Accuracy:", comb_nb_accuracy)
print(classification_report(y_test, comb_nb_preds))

Mixed Naive Bayes Accuracy: 0.7844444444444445
              precision    recall  f1-score   support

           0       0.82      0.85      0.84      1175
           1       0.70      0.65      0.68       625

    accuracy                           0.78      1800
   macro avg       0.76      0.75      0.76      1800
weighted avg       0.78      0.78      0.78      1800


### Categorical Naive Bayes Model

Finally, we can try categorizing the numerical features and training a Categorical Naive Bayes model on a dataset that now has only categorical features.

In [94]:
X_encoded = X.copy()
# we categorize the age and workinghours features, so that all features are categorical
X_encoded = X_encoded.drop(columns=['age', 'workinghours'])
X_encoded['age'] = pd.cut(X['age'], bins=[0,28,38,49,65,93], labels=['(17-28]', '(28-38]', '(38-49]', '(49-65]', '(65-93]'])
X_encoded['workinghours'] = pd.cut(X['workinghours'], bins=[0, 30, 40, 99], labels=['Part-time', 'Full-time', 'Overtime'])

# encode all categorical features using label encoding
le = LabelEncoder()
for col in X_encoded.columns:
    if col not in ['education', 'ability to speak english']:
        X_encoded[col] = le.fit_transform(X[col])

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

nb_cat = CategoricalNB()
nb_cat.fit(X_train, y_train)
nb_preds = nb_cat.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_preds)

print(classification_report(y_test, nb_preds))
print("Naive Bayes Accuracy:", nb_accuracy)

              precision    recall  f1-score   support

           0       0.85      0.80      0.83      1175
           1       0.67      0.74      0.70       625

    accuracy                           0.78      1800
   macro avg       0.76      0.77      0.76      1800
weighted avg       0.79      0.78      0.78      1800

Naive Bayes Accuracy: 0.7811111111111111


In [95]:
X_encoded = X.copy()
columns_to_exclude = ['age', 'ability to speak english', 'workclass']
X_encoded = X_encoded.drop(columns=columns_to_exclude)
# we categorize the age and workinghours features, so that all features are categorical
X_encoded = X_encoded.drop(columns=['workinghours'])
X_encoded['workinghours'] = pd.cut(X['workinghours'], bins=[0, 30, 40, 99], labels=['Part-time', 'Full-time', 'Overtime'])

# encode all categorical features using label encoding
le = LabelEncoder()
for col in X_encoded.columns:
    if col not in ['education', 'ability to speak english']:
        X_encoded[col] = le.fit_transform(X[col])

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

nb_cat = CategoricalNB()
nb_cat.fit(X_train, y_train)
nb_preds = nb_cat.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_preds)

print(classification_report(y_test, nb_preds))
print("Naive Bayes Accuracy:", nb_accuracy)

              precision    recall  f1-score   support

           0       0.83      0.83      0.83      1175
           1       0.68      0.68      0.68       625

    accuracy                           0.78      1800
   macro avg       0.76      0.76      0.76      1800
weighted avg       0.78      0.78      0.78      1800

Naive Bayes Accuracy: 0.7788888888888889


In [96]:
X_encoded = X.copy()
columns_to_exclude = ['sex', 'gave birth this year', 'marital status']
X_encoded = X_encoded.drop(columns=columns_to_exclude)
# we categorize the age and workinghours features, so that all features are categorical
X_encoded = X_encoded.drop(columns=['age', 'workinghours'])
X_encoded['age'] = pd.cut(X['age'], bins=[0,28,38,49,65,93], labels=['(17-28]', '(28-38]', '(38-49]', '(49-65]', '(65-93]'])
X_encoded['workinghours'] = pd.cut(X['workinghours'], bins=[0, 30, 40, 99], labels=['Part-time', 'Full-time', 'Overtime'])

# encode all categorical features using label encoding
le = LabelEncoder()
for col in X_encoded.columns:
    if col not in ['education', 'ability to speak english']:
        X_encoded[col] = le.fit_transform(X[col])

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

nb_cat = CategoricalNB()
nb_cat.fit(X_train, y_train)
nb_preds = nb_cat.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_preds)

print(classification_report(y_test, nb_preds))
print("Naive Bayes Accuracy:", nb_accuracy)

              precision    recall  f1-score   support

           0       0.81      0.83      0.82      1175
           1       0.67      0.63      0.65       625

    accuracy                           0.76      1800
   macro avg       0.74      0.73      0.73      1800
weighted avg       0.76      0.76      0.76      1800

Naive Bayes Accuracy: 0.7622222222222222


In [97]:
X_encoded = X.copy()
columns_to_exclude = ['gave birth this year', 'marital status']
X_encoded = X_encoded.drop(columns=columns_to_exclude)
# we categorize the age and workinghours features, so that all features are categorical
X_encoded = X_encoded.drop(columns=['age', 'workinghours'])
X_encoded['age'] = pd.cut(X['age'], bins=[0,28,38,49,65,93], labels=['(17-28]', '(28-38]', '(38-49]', '(49-65]', '(65-93]'])
X_encoded['workinghours'] = pd.cut(X['workinghours'], bins=[0, 30, 40, 99], labels=['Part-time', 'Full-time', 'Overtime'])

# encode all categorical features using label encoding
le = LabelEncoder()
for col in X_encoded.columns:
    if col not in ['education', 'ability to speak english']:
        X_encoded[col] = le.fit_transform(X[col])

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

nb_cat = CategoricalNB()
nb_cat.fit(X_train, y_train)
nb_preds = nb_cat.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_preds)

print(classification_report(y_test, nb_preds))
print("Naive Bayes Accuracy:", nb_accuracy)

              precision    recall  f1-score   support

           0       0.82      0.84      0.83      1175
           1       0.68      0.66      0.67       625

    accuracy                           0.78      1800
   macro avg       0.75      0.75      0.75      1800
weighted avg       0.77      0.78      0.77      1800

Naive Bayes Accuracy: 0.7755555555555556


### Hyperparameter Tuning

Naive Bayes models do not have many hyperparameters to tune. We won't do any hyperparameter tuning for any models here.

### Saving the Model(s)

In [98]:
# save model
save_model(nb_model, '../output/saved_models/naive_bayes_model.joblib')