In [1]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


# Preprocessing

In [2]:
df = pd.read_csv('adult.csv')
df = df.replace('?', pd.NA) # Replace '?' with NaN values
print(df.head())

   39          State-gov   77516   Bachelors   13        Never-married  \
0  50   Self-emp-not-inc   83311   Bachelors   13   Married-civ-spouse   
1  38            Private  215646     HS-grad    9             Divorced   
2  53            Private  234721        11th    7   Married-civ-spouse   
3  28            Private  338409   Bachelors   13   Married-civ-spouse   
4  37            Private  284582     Masters   14   Married-civ-spouse   

         Adm-clerical   Not-in-family   White     Male   2174   0   40  \
0     Exec-managerial         Husband   White     Male      0   0   13   
1   Handlers-cleaners   Not-in-family   White     Male      0   0   40   
2   Handlers-cleaners         Husband   Black     Male      0   0   40   
3      Prof-specialty            Wife   Black   Female      0   0   40   
4     Exec-managerial            Wife   White   Female      0   0   40   

    United-States   <=50K  
0   United-States   <=50K  
1   United-States   <=50K  
2   United-States   <=50K 

In [3]:
names = list(df.columns)
names

['39',
 ' State-gov',
 ' 77516',
 ' Bachelors',
 ' 13',
 ' Never-married',
 ' Adm-clerical',
 ' Not-in-family',
 ' White',
 ' Male',
 ' 2174',
 ' 0',
 ' 40',
 ' United-States',
 ' <=50K']

In [4]:
types = df.dtypes
types

39                 int64
 State-gov        object
 77516             int64
 Bachelors        object
 13                int64
 Never-married    object
 Adm-clerical     object
 Not-in-family    object
 White            object
 Male             object
 2174              int64
 0                 int64
 40                int64
 United-States    object
 <=50K            object
dtype: object

In [6]:
le = LabelEncoder()
for i in range(len(types)):
    if types[i]=='object':
        le.fit_transform(df[names[i]])
        df[names[i]] = le.transform(df[names[i]])

print(df.head())

   39   State-gov   77516   Bachelors   13   Never-married   Adm-clerical  \
0  50           6   83311           9   13               2              4   
1  38           4  215646          11    9               0              6   
2  53           4  234721           1    7               2              6   
3  28           4  338409           9   13               2             10   
4  37           4  284582          12   14               2              4   

    Not-in-family   White   Male   2174   0   40   United-States   <=50K  
0               0       4      1      0   0   13              39       0  
1               1       4      1      0   0   40              39       0  
2               0       2      1      0   0   40              39       0  
3               5       2      0      0   0   40               5       0  
4               5       4      0      0   0   40              39       0  


  if types[i]=='object':
  if types[i]=='object':
  if types[i]=='object':
  if types[i]=='object':
  if types[i]=='object':
  if types[i]=='object':
  if types[i]=='object':
  if types[i]=='object':
  if types[i]=='object':


In [7]:
data = df.values
x = data[:,:-1]
y = data[:,-1]
x.shape, y.shape

((32560, 14), (32560,))

In [8]:
scaler = MinMaxScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)
x_scaled[0]

array([0.45205479, 0.75      , 0.0482376 , 0.6       , 0.8       ,
       0.33333333, 0.28571429, 0.        , 1.        , 1.        ,
       0.        , 0.        , 0.12244898, 0.95121951])

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(x_scaled, y, random_state = 42, test_size=0.25)

# Gaussian Naive Bayes

In [10]:
gnb = GaussianNB()
gnb.fit(X_train, Y_train)

In [11]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

gnb_pred = gnb.predict(X_test)

print(confusion_matrix(Y_test, gnb_pred))
print(classification_report(Y_test, gnb_pred))

[[5887  270]
 [1286  697]]
              precision    recall  f1-score   support

           0       0.82      0.96      0.88      6157
           1       0.72      0.35      0.47      1983

    accuracy                           0.81      8140
   macro avg       0.77      0.65      0.68      8140
weighted avg       0.80      0.81      0.78      8140



# Multinomial Naive Bayes

In [12]:
mnb = MultinomialNB()
mnb.fit(X_train, Y_train)

In [13]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

mnb_pred = mnb.predict(X_test)

print(confusion_matrix(Y_test, mnb_pred))
print(classification_report(Y_test, mnb_pred))

[[6156    1]
 [1941   42]]
              precision    recall  f1-score   support

           0       0.76      1.00      0.86      6157
           1       0.98      0.02      0.04      1983

    accuracy                           0.76      8140
   macro avg       0.87      0.51      0.45      8140
weighted avg       0.81      0.76      0.66      8140



# Bernoulli Naive Bayes

In [14]:
bnb = BernoulliNB()
bnb.fit(X_train, Y_train)

In [15]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

bnb_pred = mnb.predict(X_test)

print(confusion_matrix(Y_test, bnb_pred))
print(classification_report(Y_test, bnb_pred))

[[6156    1]
 [1941   42]]
              precision    recall  f1-score   support

           0       0.76      1.00      0.86      6157
           1       0.98      0.02      0.04      1983

    accuracy                           0.76      8140
   macro avg       0.87      0.51      0.45      8140
weighted avg       0.81      0.76      0.66      8140

