Naive Bayesian

Question 1

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

data = pd.read_csv("car_evaluation.csv")

data['decision'] = data['decision'].astype('category').cat.codes
X = data.drop(columns=['decision'])
y = data['decision']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


class NaiveBayesClassifier:
    def __init__(self):
        self.class_probabilities = {}
        self.feature_probabilities = {}

    def calculate_prior_probabilities(self, y_train):
        unique_classes, class_counts = np.unique(y_train, return_counts=True)
        total_samples = len(y_train)
        self.class_probabilities = dict(zip(unique_classes, class_counts / total_samples))
        print(self.class_probabilities)
        
  

    def calculate_feature_probabilities(self, X_train, y_train):
        num_samples, num_features = X_train.shape
        unique_classes = np.unique(y_train)
        self.feature_probabilities = {}

        for class_label in unique_classes:
            class_indices = np.where(y_train == class_label)
            class_samples = X_train.iloc[class_indices]
            self.feature_probabilities[class_label] = {}

            for feature in X_train.columns:
                feature_probabilities = {}
                for feature_value in X_train[feature].unique():
                    feature_indices = np.where(class_samples[feature] == feature_value)
                    feature_probabilities[feature_value] = len(feature_indices[0]) / len(class_indices[0])
                self.feature_probabilities[class_label][feature] = feature_probabilities
                

    def fit(self, X_train, y_train):
        self.calculate_prior_probabilities(y_train)
        self.calculate_feature_probabilities(X_train, y_train)
        # print(self.calculate_prior_probabilities(y_train))
        # print(self.calculate_feature_probabilities(X_train, y_train))

    def predict_sample(self, sample):
        class_scores = {}
        for class_label, class_prob in self.class_probabilities.items():
            score = np.log(class_prob)
            for feature, feature_value in sample.items():
                feature_probability = self.feature_probabilities[class_label][feature].get(feature_value, 1e-6)
                score += np.log(feature_probability)
            class_scores[class_label] = score
        return max(class_scores, key=class_scores.get)

    def predict(self, X_test):
        predictions = []

        for _, sample in X_test.iterrows():
            prediction = self.predict_sample(sample)
            predictions.append(prediction)

        return predictions


classifier = NaiveBayesClassifier()
classifier.fit(X_train, y_train)
# print(feature_probabilities)

predictions = classifier.predict(X_test)


accuracy = np.mean(predictions == y_test)
print(f"Accuracy: {accuracy:.3f}")


{0: 0.2178002894356006, 1: 0.041968162083936326, 2: 0.7054992764109985, 3: 0.03473227206946455}
Accuracy: 0.818


  score += np.log(feature_probability)


Question 2

In [26]:
import numpy as np
import pandas as pd

column_names = ["sepal_length", "sepal_width","petal_length", "petal_width", "class"]
data = pd.read_csv("iris.csv", names=column_names)

class_mapping = {"setosa": 0, "versicolor": 1, "virginica": 2}
data["class"] = data["class"].map(class_mapping)

data = data.sample(frac=1, random_state=1).reset_index(drop=True)

X = data.drop("class", axis=1).values
y = data["class"].values

split_ratio = 0.8
split_index = int(len(X) * split_ratio)

X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]



def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def initialize_parameters(num_features, num_classes):
    W = np.zeros((num_features, num_classes))
    b = np.zeros(num_classes)
    return W, b


def forward_propagation(X, W, b):
    Z = np.dot(X, W) + b
    A = sigmoid(Z)
    return A


def compute_cost(Y, A):
    m = Y.shape[0]
    cost = -np.sum(Y * np.log(A + 1e-10)) / m
    return cost


def backward_propagation(X, Y, A):
    m = X.shape[0]
    dZ = A - Y
    dW = np.dot(X.T, dZ) / m
    db = np.sum(dZ, axis=0) / m
    return dW, db


def update_parameters(W, b, dW, db, learning_rate):
    W -= learning_rate * dW
    b -= learning_rate * db
    return W, b


def one_hot_encode(y, num_classes):
    m = len(y)
    one_hot = np.zeros((m, num_classes))
    one_hot[np.arange(m), y] = 1
    return one_hot


def train_logistic_regression(X_train, y_train, num_classes, num_iterations, learning_rate):
    num_features = X_train.shape[1]
    W, b = initialize_parameters(num_features, num_classes)

    for i in range(num_iterations):
        A = forward_propagation(X_train, W, b)
        cost = compute_cost(one_hot_encode(y_train, num_classes), A)
        dW, db = backward_propagation(
            X_train, one_hot_encode(y_train, num_classes), A)
        W, b = update_parameters(W, b, dW, db, learning_rate)

    return W, b


def predict(X, W, b):
    A = forward_propagation(X, W, b)
    return np.argmax(A, axis=1)




def k_fold_cross_validation(X, y, k, num_iterations, learning_rate):
    num_samples = len(X)
    fold_size = num_samples // k
    best_accuracy = 0.0
    best_fold = None

    for fold in range(k):
        start = fold * fold_size
        end = (fold + 1) * fold_size
        X_valid_fold, y_valid_fold = X[start:end], y[start:end]
        X_train_fold = np.concatenate([X[:start], X[end:]])
        y_train_fold = np.concatenate([y[:start], y[end:]])

        W, b = train_logistic_regression(X_train_fold, y_train_fold, num_classes, num_iterations, learning_rate)
        y_pred_fold = predict(X_valid_fold, W, b)
        accuracy = np.mean(y_pred_fold == y_valid_fold)

        print(f"Fold {fold + 1}: Validation Accuracy = {accuracy:.4f}")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_fold = fold

    return best_fold, best_accuracy



num_classes = 3
num_iterations = 1000
learning_rate = 0.01
k_folds = 5

best_fold, best_accuracy = k_fold_cross_validation(X_train, y_train, k_folds, num_iterations, learning_rate)

print(f"\nBest Fold: {best_fold + 1}")
print(f"Best Validation Accuracy: {best_accuracy:.4f}")


Fold 1: Validation Accuracy = 0.5833
Fold 2: Validation Accuracy = 0.7917
Fold 3: Validation Accuracy = 0.8750
Fold 4: Validation Accuracy = 0.9167
Fold 5: Validation Accuracy = 0.8333

Best Fold: 4
Best Validation Accuracy: 0.9167
