In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


Dataset:
- binary classification
- each tumor is described by a number of features
- target variable: malignant or benign

In [None]:
data=pd.read_csv("data (1).csv")

X = data.drop(columns=['id', 'diagnosis']) # Removing unused classes
y=data['diagnosis'] # Diagnosis class labels

# Split data in training set and testing set (80/20)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
class NaiveBayes:
    def fit(self,X_train,y_train): # Where we train the model
        self.classes=np.unique(y_train) # Returns sorted array of separate class values (B and M)
        # Loop over each class in self.classes
        self.priors=[len(y_train[y_train==c])/len(y_train) for c in self.classes] # Probability y = class label y

        self.means=[X_train[y_train==c].mean() for c in self.classes] # Output: list of panda series
        self.stds=[X_train[y_train==c].std() for c in self.classes]
    def compute_likelihood(self,row,class_idx): # Likelihood a row of input features belonds to a class
        # Takes in one row
        likelihood=1
        for feature in row.index:
            mean=self.means[class_idx][feature] # Mean for that class
            std=self.stds[class_idx][feature] # Standard deviation for that class
            likelihood *= (1 / (np.sqrt(2 * np.pi) * std)) * np.exp((-(row[feature] - mean)**2) / (2 * std**2)) # Assume eaxh feature is independent
        return likelihood
    def predict(self, X): # Take in examples and predict y
        y_pred = []
        for _, row in X.iterrows():
            posteriors = []
            for i in range(len(self.classes)):
                likelihood = self.compute_likelihood(row, i)
                posteriors.append(likelihood * self.priors[i])
            
            y_pred.append(self.classes[np.argmax(posteriors)])
        
        return np.array(y_pred)


In [16]:
nb=NaiveBayes()
nb.fit(X_train, y_train)

predictions=nb.predict(X_test)
accuracy = np.mean(predictions == y_test) * 100

print(f"Accuracy:{accuracy:.2f}%")

Accuracy:62.28%
