In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv("/nb.csv")

# Separate the features and the target variable
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

class NaiveBayes:
    def __init__(self):
        self.prior = {}
        self.cond_prob = {}
    
    def fit(self, X_train, y_train):
        # Calculate the prior probability of each class
        self.prior["benign"] = np.sum(y_train == "B") / len(y_train)
        self.prior["malignant"] = np.sum(y_train == "M") / len(y_train)
        
        # Calculate the conditional probability of each feature given to each class
        for label in ["benign", "malignant"]:
            label_indices = np.where(y_train == label)[0]
            label_features = X_train.iloc[label_indices, :]
            self.cond_prob[label] = {}
            for feature in label_features.columns:
                unique_vals, counts = np.unique(label_features[feature], return_counts=True)
                prob_dict = dict(zip(unique_vals, counts/np.sum(counts)))
                self.cond_prob[label][feature] = prob_dict
    
    def predict(self, X_test):
        # Predict the class of a given instance using the Naive Bayes algorithm
        predictions = []
        for i in range(len(X_test)):
            x = X_test.iloc[i, :]
            benign_prob = self.prior["benign"]
            malignant_prob = self.prior["malignant"]
            for feature in X_test.columns:
                if x[feature] in self.cond_prob["benign"][feature]:
                    benign_prob *= self.cond_prob["benign"][feature][x[feature]]
                else:
                    benign_prob *= 0
                if x[feature] in self.cond_prob["malignant"][feature]:
                    malignant_prob *= self.cond_prob["malignant"][feature][x[feature]]
                else:
                    malignant_prob *= 0
            if benign_prob > malignant_prob:
                predictions.append("B")
            else:
                predictions.append("M")
        return predictions
    
    def score(self, X_test, y_test):
        # Calculate the accuracy of your Naive Bayes classifier on the testing set
        y_pred = self.predict(X_test)
        accuracy = np.mean(y_pred == y_test)
        cm = confusion_matrix(y_test, y_pred)
        precision = precision_score(y_test, y_pred, pos_label='M')
        recall = recall_score(y_test, y_pred, pos_label='M')
        f1 = f1_score(y_test, y_pred, pos_label='M')
        return accuracy, cm, precision, recall, f1

# Train the model
nb = NaiveBayes()
nb.fit(X_train, y_train)

# Test the model
accuracy, cm, precision, recall, f1 = nb.score(X_test, y_test)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("confusion matrix", cm)

Accuracy: 0.7474382157926461
Precision: 0.7474382157926461
Recall: 1.0
F1 Score: 0.8554674025526042
confusion matrix [[   0 2514]
 [   0 7440]]


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv("/nb.csv")

# Separate the features and the target variable
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Initialize lists to store evaluation metrics for each split
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Make 10 different train-test splits
for i in range(10):
    print("Split", i+1)
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=i)
    
    # Train the model
    nb = NaiveBayes()
    nb.fit(X_train, y_train)

    # Test the model
    accuracy, cm, precision, recall, f1 = nb.score(X_test, y_test)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    
    # Store the evaluation metrics for this split
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate the mean and variance of the evaluation metrics over the 10 splits
mean_accuracy = np.mean(accuracies)
var_accuracy = np.var(accuracies)
mean_precision = np.mean(precisions)
var_precision = np.var(precisions)
mean_recall = np.mean(recalls)
var_recall = np.var(recalls)
mean_f1 = np.mean(f1_scores)
var_f1 = np.var(f1_scores)

# Print the mean and variance of the evaluation metrics
print("Mean accuracy:", mean_accuracy)
print("Variance of accuracy:", var_accuracy)
print("Mean precision:", mean_precision)
print("Variance of precision:", var_precision)
print("Mean recall:", mean_recall)
print("Variance of recall:", var_recall)
print("Mean F1 score:", mean_f1)
print("Variance of F1 score:", var_f1)

Split 1
Accuracy: 0.7512557765722323
Precision: 0.7512557765722323
Recall: 1.0
F1 Score: 0.8579623680587426
Split 2
Accuracy: 0.7507534659433394
Precision: 0.7507534659433394
Recall: 1.0
F1 Score: 0.8576347047684627
Split 3
Accuracy: 0.7470363672895318
Precision: 0.7470363672895318
Recall: 1.0
F1 Score: 0.8552041403105233
Split 4
Accuracy: 0.746031746031746
Precision: 0.746031746031746
Recall: 1.0
F1 Score: 0.8545454545454546
Split 5
Accuracy: 0.7497488446855536
Precision: 0.7497488446855536
Recall: 1.0
F1 Score: 0.8569788138026067
Split 6
Accuracy: 0.7462326702833032
Precision: 0.7462326702833032
Recall: 1.0
F1 Score: 0.8546772523299966
Split 7
Accuracy: 0.7526622463331324
Precision: 0.7526622463331324
Recall: 1.0
F1 Score: 0.8588788260919408
Split 8
Accuracy: 0.7450271247739603
Precision: 0.7450271247739603
Recall: 1.0
F1 Score: 0.8538860103626943
Split 9
Accuracy: 0.7488446855535463
Precision: 0.7488446855535463
Recall: 1.0
F1 Score: 0.8563878676470588
Split 10
Accuracy: 0.750853928