In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv("/nb.csv")

# Separate the features and the target variable
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

class NaiveBayes:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.prior = {}
        self.cond_prob = {}
    
    def fit(self, X_train, y_train):
        # Calculate the prior probability of each class
        self.prior["benign"] = np.sum(y_train == "B") / len(y_train)
        self.prior["malignant"] = np.sum(y_train == "M") / len(y_train)
        
        # Calculate the conditional probability of each feature given to each class
        for label in ["benign", "malignant"]:
            label_indices = np.where(y_train == label)[0]
            label_features = X_train.iloc[label_indices, :]
            self.cond_prob[label] = {}
            for feature in label_features.columns:
                unique_vals, counts = np.unique(label_features[feature], return_counts=True)
                prob_dict = dict(zip(unique_vals, (counts + self.alpha) / (np.sum(counts) + self.alpha * len(unique_vals))))
                self.cond_prob[label][feature] = prob_dict
    
    def predict(self, X_test):
        # Predict the class of a given instance using the Naive Bayes algorithm
        predictions = []
        for i in range(len(X_test)):
            x = X_test.iloc[i, :]
            benign_prob = self.prior["benign"]
            malignant_prob = self.prior["malignant"]
            for feature in X_test.columns:
                if x[feature] in self.cond_prob["benign"][feature]:
                    benign_prob *= self.cond_prob["benign"][feature][x[feature]]
                else:
                    benign_prob *= self.alpha / (np.sum(list(self.cond_prob["benign"][feature].values())) + self.alpha * len(self.cond_prob["benign"][feature]))
                if x[feature] in self.cond_prob["malignant"][feature]:
                    malignant_prob *= self.cond_prob["malignant"][feature][x[feature]]
                else:
                    malignant_prob *= self.alpha / (np.sum(list(self.cond_prob["malignant"][feature].values())) + self.alpha * len(self.cond_prob["malignant"][feature]))
            if benign_prob > malignant_prob:
                predictions.append("B")
            else:
                predictions.append("M")
        return predictions
    
    def score(self, X_test, y_test):
        # Calculate the accuracy of your Naive Bayes classifier on the testing set
        y_pred = self.predict(X_test)
        accuracy = np.mean(y_pred == y_test)
        cm = confusion_matrix(y_test, y_pred)
        precision = precision_score(y_test, y_pred, pos_label='M')
        recall = recall_score(y_test, y_pred, pos_label='M')
        f1 = f1_score(y_test, y_pred, pos_label='M')
        return accuracy, cm, precision, recall, f1

# Train the model
nb = NaiveBayes()
nb.fit(X_train, y_train)

# Test the model
accuracy, cm, precision, recall, f1 = nb.score(X_test, y_test)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("confusion matrix", cm)

  benign_prob *= self.alpha / (np.sum(list(self.cond_prob["benign"][feature].values())) + self.alpha * len(self.cond_prob["benign"][feature]))
  malignant_prob *= self.alpha / (np.sum(list(self.cond_prob["malignant"][feature].values())) + self.alpha * len(self.cond_prob["malignant"][feature]))


Accuracy: 0.7474382157926461
Precision: 0.7474382157926461
Recall: 1.0
F1 Score: 0.8554674025526042
confusion matrix [[   0 2514]
 [   0 7440]]


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv("/nb.csv")

accuracies = []
precisions = []
recalls = []
f1_scores = []

# Run the model 10 times with different train-test splits
for i in range(10):
    # Separate the features and the target variable
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Split the dataset into training and testing sets
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=i)

    # Train the model
    nb = NaiveBayes()
    nb.fit(X_train, y_train)

    # Test the model
    accuracy, cm, precision, recall, f1 = nb.score(X_test, y_test)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    
    print(f"Split {i+1}")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("confusion matrix", cm)

# Print the average metrics across all 10 splits
print("Average metrics across all 10 splits:")
print("Accuracy:", np.mean(accuracies))
print("Precision:", np.mean(precisions))
print("Recall:", np.mean(recalls))
print("F1 Score:", np.mean(f1_scores))


  benign_prob *= self.alpha / (np.sum(list(self.cond_prob["benign"][feature].values())) + self.alpha * len(self.cond_prob["benign"][feature]))
  malignant_prob *= self.alpha / (np.sum(list(self.cond_prob["malignant"][feature].values())) + self.alpha * len(self.cond_prob["malignant"][feature]))


Split 1
Accuracy: 0.7512557765722323
Precision: 0.7512557765722323
Recall: 1.0
F1 Score: 0.8579623680587426
confusion matrix [[   0 2476]
 [   0 7478]]


  benign_prob *= self.alpha / (np.sum(list(self.cond_prob["benign"][feature].values())) + self.alpha * len(self.cond_prob["benign"][feature]))
  malignant_prob *= self.alpha / (np.sum(list(self.cond_prob["malignant"][feature].values())) + self.alpha * len(self.cond_prob["malignant"][feature]))


Split 2
Accuracy: 0.7507534659433394
Precision: 0.7507534659433394
Recall: 1.0
F1 Score: 0.8576347047684627
confusion matrix [[   0 2481]
 [   0 7473]]


  benign_prob *= self.alpha / (np.sum(list(self.cond_prob["benign"][feature].values())) + self.alpha * len(self.cond_prob["benign"][feature]))
  malignant_prob *= self.alpha / (np.sum(list(self.cond_prob["malignant"][feature].values())) + self.alpha * len(self.cond_prob["malignant"][feature]))


Split 3
Accuracy: 0.7470363672895318
Precision: 0.7470363672895318
Recall: 1.0
F1 Score: 0.8552041403105233
confusion matrix [[   0 2518]
 [   0 7436]]


  benign_prob *= self.alpha / (np.sum(list(self.cond_prob["benign"][feature].values())) + self.alpha * len(self.cond_prob["benign"][feature]))
  malignant_prob *= self.alpha / (np.sum(list(self.cond_prob["malignant"][feature].values())) + self.alpha * len(self.cond_prob["malignant"][feature]))


Split 4
Accuracy: 0.746031746031746
Precision: 0.746031746031746
Recall: 1.0
F1 Score: 0.8545454545454546
confusion matrix [[   0 2528]
 [   0 7426]]


  benign_prob *= self.alpha / (np.sum(list(self.cond_prob["benign"][feature].values())) + self.alpha * len(self.cond_prob["benign"][feature]))
  malignant_prob *= self.alpha / (np.sum(list(self.cond_prob["malignant"][feature].values())) + self.alpha * len(self.cond_prob["malignant"][feature]))


Split 5
Accuracy: 0.7497488446855536
Precision: 0.7497488446855536
Recall: 1.0
F1 Score: 0.8569788138026067
confusion matrix [[   0 2491]
 [   0 7463]]


  benign_prob *= self.alpha / (np.sum(list(self.cond_prob["benign"][feature].values())) + self.alpha * len(self.cond_prob["benign"][feature]))
  malignant_prob *= self.alpha / (np.sum(list(self.cond_prob["malignant"][feature].values())) + self.alpha * len(self.cond_prob["malignant"][feature]))


Split 6
Accuracy: 0.7462326702833032
Precision: 0.7462326702833032
Recall: 1.0
F1 Score: 0.8546772523299966
confusion matrix [[   0 2526]
 [   0 7428]]


  benign_prob *= self.alpha / (np.sum(list(self.cond_prob["benign"][feature].values())) + self.alpha * len(self.cond_prob["benign"][feature]))
  malignant_prob *= self.alpha / (np.sum(list(self.cond_prob["malignant"][feature].values())) + self.alpha * len(self.cond_prob["malignant"][feature]))


Split 7
Accuracy: 0.7526622463331324
Precision: 0.7526622463331324
Recall: 1.0
F1 Score: 0.8588788260919408
confusion matrix [[   0 2462]
 [   0 7492]]


  benign_prob *= self.alpha / (np.sum(list(self.cond_prob["benign"][feature].values())) + self.alpha * len(self.cond_prob["benign"][feature]))
  malignant_prob *= self.alpha / (np.sum(list(self.cond_prob["malignant"][feature].values())) + self.alpha * len(self.cond_prob["malignant"][feature]))


Split 8
Accuracy: 0.7450271247739603
Precision: 0.7450271247739603
Recall: 1.0
F1 Score: 0.8538860103626943
confusion matrix [[   0 2538]
 [   0 7416]]


  benign_prob *= self.alpha / (np.sum(list(self.cond_prob["benign"][feature].values())) + self.alpha * len(self.cond_prob["benign"][feature]))
  malignant_prob *= self.alpha / (np.sum(list(self.cond_prob["malignant"][feature].values())) + self.alpha * len(self.cond_prob["malignant"][feature]))


Split 9
Accuracy: 0.7488446855535463
Precision: 0.7488446855535463
Recall: 1.0
F1 Score: 0.8563878676470588
confusion matrix [[   0 2500]
 [   0 7454]]


  benign_prob *= self.alpha / (np.sum(list(self.cond_prob["benign"][feature].values())) + self.alpha * len(self.cond_prob["benign"][feature]))
  malignant_prob *= self.alpha / (np.sum(list(self.cond_prob["malignant"][feature].values())) + self.alpha * len(self.cond_prob["malignant"][feature]))


Split 10
Accuracy: 0.750853928069118
Precision: 0.750853928069118
Recall: 1.0
F1 Score: 0.8577002524672941
confusion matrix [[   0 2480]
 [   0 7474]]
Average metrics across all 10 splits:
Accuracy: 0.7488446855535462
Precision: 0.7488446855535462
Recall: 1.0
F1 Score: 0.8563855690384774
