In [16]:
import csv
import urllib.request
import random
import pandas as pd

In [26]:
data_file = "iris.tmls"
iris_df = pd.read_csv(data_file)
#  drop first row
iris_df = iris_df.drop(iris_df.index[0]).reset_index(drop=True)
# Convert all columns except the last one (class column) to float
iris_df.iloc[:, :-1] = iris_df.iloc[:, :-1].apply(lambda x: x.astype(float))
iris_df.head()

iris_data_v1 = iris_df[iris_df["class"] != 'Iris-setosa']  # Remove 'Iris-setosa' class
iris_data_v2 = iris_df[iris_df["class"] != 'Iris-versicolor']  # Remove 'Iris-versicolor' class
iris_data_v3 = iris_df[iris_df["class"] != 'Iris-virginica']  # Remove 'Iris-virginica' class



In [18]:
def train_test_split(data, test_ratio):
    random.shuffle(data)
    test_size = int(len(data) * test_ratio)
    test_set = data[:test_size]
    train_set = data[test_size:]
    return train_set, test_set


In [19]:
class NaiveBayesClassifier:
    def __init__(self):
        self.class_probs = {}
        self.feature_probs = {}

    def fit(self, X, y):
        # Calculate class probabilities
        classes = set(y)
        for cls in classes:
            self.class_probs[cls] = y.count(cls) / len(y)

        # Calculate feature probabilities
        for cls in classes:
            self.feature_probs[cls] = {}
            cls_indices = [i for i, label in enumerate(y) if label == cls]
            cls_data = [X[i] for i in cls_indices]

            for feature_idx in range(len(X[0])):
                feature_values = [row[feature_idx] for row in cls_data]
                mean = sum(feature_values) / len(feature_values)
                variance = sum([(val - mean) ** 2 for val in feature_values]) / len(feature_values)
                self.feature_probs[cls][feature_idx] = (mean, variance)

    def predict_proba(self, x):
        def gaussian_prob(val, mean, variance):
            return (1 / (variance ** 0.5 * (2 * 3.14159) ** 0.5)) * 2.71828 ** (-0.5 * ((val - mean) ** 2 / variance))

        probabilities = {}
        for cls, cls_prob in self.class_probs.items():
            prob = cls_prob
            for feature_idx, feature_val in enumerate(x):
                mean, variance = self.feature_probs[cls][feature_idx]
                prob *= gaussian_prob(feature_val, mean, variance)
            probabilities[cls] = prob
        return probabilities

    def predict(self, x):
        probabilities = self.predict_proba(x)
        return max(probabilities, key=probabilities.get)


In [20]:
def k_fold_cross_validation(data, k):
    fold_size = len(data) // k
    folds = [data[i * fold_size:(i + 1) * fold_size] for i in range(k)]
    return folds

def roc_auc(y_true, y_scores, positive_class):
    sorted_indices = sorted(range(len(y_scores)), key=lambda i: y_scores[i], reverse=True)
    y_sorted = [y_true[i] for i in sorted_indices]

    tpr = []
    fpr = []
    num_positive = sum([1 for val in y_true if val == positive_class])
    num_negative = len(y_true) - num_positive

    tp = 0
    fp = 0
    for val in y_sorted:
        if val == positive_class:
            tp += 1
        else:
            fp += 1
        tpr.append(tp / num_positive)
        fpr.append(fp / num_negative)

    # Calculate the AUC
    auc = 0
    for i in range(1, len(tpr)):
        auc += (fpr[i] - fpr[i - 1]) * (tpr[i] + tpr[i - 1]) / 2

    return fpr, tpr, auc


In [27]:
k = 10
folds = k_fold_cross_validation(iris_data_v1, k)

auc_scores = []
for i in range(k):
    test_set = folds[i]
    train_set = [row for j, fold in enumerate(folds) if j != i for row in fold]

    X_train = [row[:-1] for row in train_set]
    y_train = [row[-1] for row in train_set]
    X_test = [row[:-1] for row in test_set]
    y_test = [row[-1] for row in test_set]

    # Train and evaluate the classifier
    clf = NaiveBayesClassifier()
    clf.fit(X_train, y_train)
    y_scores = [clf.predict_proba(x)['Iris-versicolor'] for x in X_test]
    fpr, tpr, auc = roc_auc(y_test, y_scores, 'Iris-versicolor')
    auc_scores.append(auc)

average_auc = sum(auc_scores) / len(auc_scores)
print("10-fold cross-validation average AUC:", average_auc)

TypeError: unsupported operand type(s) for +: 'int' and 'str'