In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

data = pd.read_csv('./breast-cancer-wisconsin.csv')
print(data.info())
print(data.head())

# Replace missing with the median 
data = data.replace('?', np.NaN)
data = data.fillna(data.median()).astype(np.int64)
# Drop 'Sample code number'
data = data.drop('Sample code number', axis=1)

# Split into training and testing sets
X = data.iloc[:, : -1]
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Sample code number           699 non-null    int64 
 1   Clump Thickness              699 non-null    int64 
 2   Uniformity of Cell Size      699 non-null    int64 
 3   Uniformity of Cell Shape     699 non-null    int64 
 4   Marginal Adhesion            699 non-null    int64 
 5   Single Epithelial Cell Size  699 non-null    int64 
 6   Bare Nuclei                  699 non-null    object
 7   Bland Chromatin              699 non-null    int64 
 8   Normal Nucleoli              699 non-null    int64 
 9   Mitoses                      699 non-null    int64 
 10  Class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB
None
   Sample code number  Clump Thickness  Uniformity of Cell Size  \
0             1000025         

In [2]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

# Gaussian Naive Bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)

# Multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)

# Bernoulli Naive Bayes
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred_bnb = bnb.predict(X_test)

# Accuracy_score
print('Accuracy score: ')
print('Gaussian Naive Bayes: ', accuracy_score(y_test,y_pred_gnb))
print('Multinomial Naive Bayes: ', accuracy_score(y_test,y_pred_mnb))
print('Bernoulli Naive Bayes: ', accuracy_score(y_test,y_pred_bnb))

Accuracy score: 
Gaussian Naive Bayes:  0.9428571428571428
Multinomial Naive Bayes:  0.8571428571428571
Bernoulli Naive Bayes:  0.6285714285714286


In [3]:
from sklearn.model_selection import cross_val_score
import statistics as stat

# Cross validation
gnb_cross = cross_val_score(gnb, X, y, cv=10, scoring='accuracy')
mnb_cross = cross_val_score(mnb, X, y, cv=10, scoring='accuracy')
bnb_cross = cross_val_score(bnb, X, y, cv=10, scoring='accuracy')

# Mean scores
print('Mean scores through 10-fold cross validation: ')
print('Gaussian Naive Bayes: ', stat.mean(gnb_cross))
print('Multinomial Naive Bayes: ', stat.mean(mnb_cross))
print('Bernoulli Naive Bayes: ', stat.mean(bnb_cross))

Mean scores through 10-fold cross validation: 
Gaussian Naive Bayes:  0.9599585921325052
Multinomial Naive Bayes:  0.8970807453416149
Bernoulli Naive Bayes:  0.6552173913043479
