Krantas Konstantinos, 9975  
Strikos Konstantinos, 9517

In [60]:
import numpy as np
import pandas as pd
df = pd.read_csv("dataset.csv")
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC


# Every column except the last is used as a feature; the last column is the class label.
# (An earlier form used df.iloc[:, :2] / df.iloc[:, 2] -- presumably the CSV has two
# feature columns followed by the label; confirm against dataset.csv.)
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
# Hold out half of the rows as the test set; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# **1st Question - Bayes Classifier**

# Bayes Classifier for Different Covariance matrix

In [None]:
class DiffCovMatBayesClassifier:
    """Gaussian Bayes classifier with a separate covariance matrix per class.

    Every class is modelled as a multivariate normal distribution whose mean
    and covariance are estimated from the training samples of that class.
    A sample is assigned to the class maximising ``prior * likelihood``.

    After ``fit`` the following dictionaries are keyed by class label:
    ``class_priors`` (relative class frequency), ``class_means`` (per-feature
    mean vector) and ``class_covs`` (sample covariance matrix).
    """

    def __init__(self):
        self.class_priors = {}
        self.class_means = {}
        self.class_covs = {}

    def fit(self, X, y):
        """Estimate the prior, mean and covariance of every class.

        Args:
            X: Training samples, one row per sample and one column per feature.
            y: Class label of each row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        # Forget any previous fit so labels that no longer occur cannot be
        # predicted by accident.
        self.class_priors.clear()
        self.class_means.clear()
        self.class_covs.clear()

        for label in np.unique(y):
            members = X[y == label]

            # A priori probability of the class
            self.class_priors[label] = len(members) / len(y)

            # Mean vector of the class
            self.class_means[label] = np.mean(members, axis=0)

            # Covariance matrix of the class; np.cov returns a 0-d array for a
            # single feature, so force a (d, d) matrix.
            self.class_covs[label] = np.atleast_2d(np.cov(members, rowvar=False))

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Scores are compared in the log domain: multiplying the prior by the
        raw density underflows to 0 for samples far from every class mean,
        which would make all classes tie.

        Args:
            X: Samples to classify, one row per sample.

        Returns:
            A numpy array with one predicted label per row of ``X``.

        Raises:
            ValueError: If ``X`` is not two-dimensional or ``fit`` has not
                been called.
        """
        X = np.asarray(X, dtype=float)
        if len(X) == 0:
            return np.array([])
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")

        labels = list(self.class_priors)
        if not labels:
            raise ValueError("predict() called before fit()")

        # One column of log(prior * likelihood) per class, in label order
        log_posteriors = np.column_stack([
            np.log(self.class_priors[label])
            + self._log_density(X, self.class_means[label], self.class_covs[label])
            for label in labels
        ])

        # Choose the class with the highest posterior for every sample
        return np.asarray(labels)[np.argmax(log_posteriors, axis=1)]

    def multivariate_normal_pdf(self, x, mean, cov):
        """Return the multivariate normal density of a single sample ``x``."""
        x = np.asarray(x, dtype=float)
        mean = np.asarray(mean, dtype=float)
        return np.exp(self._log_density(x[np.newaxis, :], mean, cov)[0])

    @staticmethod
    def _log_density(X, mean, cov):
        """Return the log of the normal density at every row of ``X``."""
        cov = np.atleast_2d(cov)
        centered = X - mean
        # Solve cov @ z = centered instead of forming the explicit inverse.
        solved = np.linalg.solve(cov, centered.T).T
        mahalanobis = np.einsum("ij,ij->i", centered, solved)
        _, log_det = np.linalg.slogdet(cov)
        n_features = X.shape[1]
        return -0.5 * (n_features * np.log(2 * np.pi) + log_det + mahalanobis)

# Fit the per-class-covariance Bayes model on the training half
bayes_classifier = DiffCovMatBayesClassifier()
bayes_classifier.fit(X_train, y_train)

# Label the held-out half
y_pred = bayes_classifier.predict(X_test)

# Error rate = 1 - fraction of correctly labelled test samples
accuracy = (y_pred == y_test).mean()
print(f"error: {1 - accuracy}")

# Visualisation for Different Covariance matrix

In [None]:
# Classification regions of the classifier with the predicted test labels.
# Every plot opens its own figure explicitly: plt.show() does not close the
# current figure on non-interactive backends, so without this the next plot
# would be drawn (and saved) on top of the previous one.
plt.figure()
plot_decision_regions(X_test, y_pred, clf=bayes_classifier, legend=2)
plt.savefig("bayes_diffcovmat_regions.png")
plt.show()

# Test set points coloured by predicted class
plt.figure()
plt.scatter(
    X_test[:, 0],
    X_test[:, 1],
    c=y_pred,
    cmap=plt.cm.Paired,
    marker='o',
    edgecolors='k',
    label='Test Set',
)

# Test samples whose prediction differs from the true label
misclassified_points = X_test[y_pred != y_test]

# Mark the misclassified points with a red cross on top of the test set
plt.scatter(
    misclassified_points[:, 0],
    misclassified_points[:, 1],
    s=100,
    facecolors='r',
    marker='x',
    label='Misclassified Points',
)
plt.title('Test set visualisation with misclassified points for Bayes with different covariance matrix')
plt.legend()
plt.savefig("bayes_diffcovmat.png")
plt.show()

# Bayes Classifier for Shared Covariance Matrix

In [None]:
class SharedCovMatBayesClassifier:
    """Gaussian Bayes classifier that uses one covariance matrix for all classes.

    Every class is modelled as a multivariate normal distribution with its
    own mean but a common covariance matrix ``shared_cov``. A sample is
    assigned to the class maximising ``prior * likelihood``.

    Args:
        pooled: How ``shared_cov`` is estimated by ``fit``. With the default
            ``False`` it is the covariance of the whole training set (which
            also contains the spread between the class means). With ``True``
            it is the pooled within-class covariance, i.e. the scatter of the
            samples around their own class mean divided by
            ``n_samples - n_classes``.
    """

    def __init__(self, pooled=False):
        self.pooled = pooled
        self.class_priors = {}
        self.class_means = {}
        self.shared_cov = None

    def fit(self, X, y):
        """Estimate the class priors, class means and the shared covariance.

        Args:
            X: Training samples, one row per sample and one column per feature.
            y: Class label of each row of ``X``.

        Raises:
            ValueError: If ``pooled`` is set and there are not more samples
                than classes.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        classes = np.unique(y)

        # Forget any previous fit so labels that no longer occur cannot be
        # predicted by accident.
        self.class_priors.clear()
        self.class_means.clear()

        within_scatter = 0.0
        for label in classes:
            members = X[y == label]

            # Class prior
            self.class_priors[label] = len(members) / len(y)

            # Class mean
            class_mean = np.mean(members, axis=0)
            self.class_means[label] = class_mean

            if self.pooled:
                deviations = members - class_mean
                within_scatter = within_scatter + deviations.T @ deviations

        if self.pooled:
            dof = len(y) - len(classes)
            if dof <= 0:
                raise ValueError("pooled covariance needs more samples than classes")
            self.shared_cov = np.atleast_2d(within_scatter / dof)
        else:
            # Covariance of all training samples regardless of class; np.cov
            # returns a 0-d array for a single feature, so force (d, d).
            self.shared_cov = np.atleast_2d(np.cov(X, rowvar=False))

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Scores are compared in the log domain: multiplying the prior by the
        raw density underflows to 0 for samples far from every class mean,
        which would make all classes tie.

        Args:
            X: Samples to classify, one row per sample.

        Returns:
            A numpy array with one predicted label per row of ``X``.

        Raises:
            ValueError: If ``X`` is not two-dimensional or ``fit`` has not
                been called.
        """
        X = np.asarray(X, dtype=float)
        if len(X) == 0:
            return np.array([])
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")

        labels = list(self.class_priors)
        if not labels:
            raise ValueError("predict() called before fit()")

        # One column of log(prior * likelihood) per class, in label order
        log_posteriors = np.column_stack([
            np.log(self.class_priors[label])
            + self._log_density(X, self.class_means[label], self.shared_cov)
            for label in labels
        ])

        # Select the class with the highest posterior for every sample
        return np.asarray(labels)[np.argmax(log_posteriors, axis=1)]

    def multivariate_normal_pdf(self, x, mean, cov):
        """Return the multivariate normal density of a single sample ``x``."""
        x = np.asarray(x, dtype=float)
        mean = np.asarray(mean, dtype=float)
        return np.exp(self._log_density(x[np.newaxis, :], mean, cov)[0])

    @staticmethod
    def _log_density(X, mean, cov):
        """Return the log of the normal density at every row of ``X``."""
        cov = np.atleast_2d(cov)
        centered = X - mean
        # Solve cov @ z = centered instead of forming the explicit inverse.
        solved = np.linalg.solve(cov, centered.T).T
        mahalanobis = np.einsum("ij,ij->i", centered, solved)
        _, log_det = np.linalg.slogdet(cov)
        n_features = X.shape[1]
        return -0.5 * (n_features * np.log(2 * np.pi) + log_det + mahalanobis)

# Fit the shared-covariance Bayes model on the training half
bayes_classifier = SharedCovMatBayesClassifier()
bayes_classifier.fit(X_train, y_train)

# Label the held-out half
y_pred = bayes_classifier.predict(X_test)

# Error rate = 1 - fraction of correctly labelled test samples
accuracy = (y_pred == y_test).mean()
print(f"Error: {1 - accuracy}")

# Visualisation for shared covariance matrix

In [None]:
# Classification regions of the classifier with the predicted test labels.
# Every plot opens its own figure explicitly: plt.show() does not close the
# current figure on non-interactive backends, so without this the next plot
# would be drawn (and saved) on top of the previous one.
plt.figure()
plot_decision_regions(X_test, y_pred, clf=bayes_classifier, legend=2)
plt.savefig("bayes_sharedcovmat_regions.png")
plt.show()

# Test set points coloured by predicted class
plt.figure()
plt.scatter(
    X_test[:, 0],
    X_test[:, 1],
    c=y_pred,
    cmap=plt.cm.Paired,
    marker='o',
    edgecolors='k',
    label='Test Set',
)

# Test samples whose prediction differs from the true label
misclassified_points = X_test[y_pred != y_test]

# Mark the misclassified points with a red cross on top of the test set
plt.scatter(
    misclassified_points[:, 0],
    misclassified_points[:, 1],
    s=100,
    facecolors='r',
    marker='x',
    label='Misclassified Points',
)
plt.title('Test set visualisation with misclassified points for Bayes with shared covariance matrix')
plt.legend()
plt.savefig("bayes_sharedcovmat.png")
plt.show()

Experimenting with both cases, we conclude that the Bayes classifier is more effective when the covariance matrix is not shared by all classes: the error is 0.17142857142857137 with a different matrix per class and 0.26428571428571423 with a shared matrix. This is expected, because forcing a single covariance matrix on all classes assumes that every class has the same spread and feature correlations; when the classes actually differ in these respects, the assumption is too restrictive and leads to inaccuracies.

# **2nd Question - k-NN Classifier**

In [None]:
# Evaluate k-NN on the test set for every k and plot its decision regions.
k_values = range(1, 11)
error = []
for k in k_values:
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train, y_train)
    pred = neigh.predict(X_test)

    # Error rate = 1 - accuracy on the held-out half
    accuracy = accuracy_score(y_test, pred)
    current_error = 1 - accuracy
    print(f"The error for {k}-nn is: {current_error}\n")
    error.append(current_error)

    # Decision regions of this k together with the predicted test labels
    plt.figure()
    plot_decision_regions(X_test, pred, clf=neigh, legend=2)

    plt.title(f'Decision Regions for k={k}', loc='left')
    plt.legend()
    plt.savefig(f'{k}-NN.png')
    plt.show()

# Error rate as a function of k
plt.figure()
plt.plot(k_values, error, color='blue', linestyle="dotted", marker="o", markerfacecolor='red')
plt.ylabel('Error')
plt.title('Error for k=1 to 10')
plt.savefig("kNN_errors.png")
plt.show()


The error for k = 5, 6, 7 and 8 stays the same, even though the decision regions are different for each k.
We also see that the error for those k is lower than the error of both Bayes classifiers. This seems reasonable, because:  
1.   We used the library k-NN implementation, whereas the Bayes classifiers are our own implementation.
2.   Our dataset is also very small, so the difference that Bayes and k-NN would show on big data is not easily visible here.


# **3rd Question - SVM**

In [None]:
# Testing the values of C and gamma to find the optimal
Cs = [0.1, 1, 10]
gammas = [0.2, 2, 20]

# errors[i, j] holds the test error for gammas[i] and Cs[j]
errors = np.zeros((len(gammas), len(Cs)))

for i, gamma in enumerate(gammas):
    for j, C in enumerate(Cs):
        # Train an SVM with the RBF kernel, the current gamma, and C values.
        # SVC is imported directly from sklearn.svm; the module name `svm`
        # itself is never imported in this file.
        clf = SVC(kernel='rbf', gamma=gamma, C=C)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        error = 1 - accuracy_score(y_test, y_pred)
        errors[i, j] = error
        print(f'The error for gamma={gamma} and C={C} is: {error}\n')

fig, ax1 = plt.subplots(figsize=(10, 6))

# One error curve over gamma for every value of C
for j, C in enumerate(Cs):
    ax1.plot(gammas, errors[:, j], label=f'Error (C={C})', marker='x', linestyle='--')

ax1.set_xlabel('Gamma')
ax1.set_ylabel('Error')
ax1.legend(loc='upper left')
ax1.set_title('Error for Different Gamma and C Values with RBF Kernel')
plt.show()
# It is visible that the best accuracy is for C = 1 & gamma = 0.2

In [None]:
# Train the final SVM with the hyperparameters chosen above and visualise it.
C_test = 1
gamma_test = 0.2
svm_model = SVC(kernel='rbf', C=C_test, gamma=gamma_test)
svm_model.fit(X_train, y_train)
predictions = svm_model.predict(X_test)

# Plot decision regions over the whole dataset with its true labels
plt.figure()
plt.title(f'SVM Decision Regions for C={C_test}, gamma={gamma_test}')
plot_decision_regions(X, y, clf=svm_model, legend=2)
plt.savefig("svmregions.png")
plt.show()

# Plot train set (true labels) and test set (predicted labels)
plt.figure()
plt.scatter(
    X_train[:, 0],
    X_train[:, 1],
    c=y_train,
    cmap=plt.cm.Paired,
    marker='o',
    edgecolors='k',
    label='Train Set',
)
plt.scatter(
    X_test[:, 0],
    X_test[:, 1],
    c=predictions,
    cmap=plt.cm.Paired,
    marker='^',
    edgecolors='k',
    label='Test Set',
)
plt.legend()
plt.title("Train and test set")
plt.savefig("SVM_train_test.png")
plt.show()

# Plot support vectors on top of the train set
plt.figure()
plt.scatter(
    X_train[:, 0],
    X_train[:, 1],
    c=y_train,
    cmap=plt.cm.Paired,
    marker='o',
    edgecolors='k',
    label='Train Set',
)
plt.scatter(
    svm_model.support_vectors_[:, 0],
    svm_model.support_vectors_[:, 1],
    s=80,
    facecolors='k',
    marker='x',
    label='Support Vectors',
)
plt.legend()
plt.title("Support vectors and train set")
plt.savefig("SVM_train_SV.png")
plt.show()

# Plot test set with misclassified points

# Test samples whose prediction differs from the true label
misclassified_points = X_test[predictions != y_test]

# Open a new figure like the plots above, so this image never shares axes
# with the previous one when plt.show() leaves that figure open.
plt.figure()
plt.scatter(
    X_test[:, 0],
    X_test[:, 1],
    c=predictions,
    cmap=plt.cm.Paired,
    marker='o',
    edgecolors='k',
    label='Test Set',
)
# Mark the misclassified points with a red cross
plt.scatter(
    misclassified_points[:, 0],
    misclassified_points[:, 1],
    s=80,
    facecolors='r',
    marker='x',
    label='Misclassified Points',
)
plt.title('Test set visualisation with misclassified points for SVM')
plt.legend()
plt.savefig("SVM_test_misclassified.png")
plt.show()

The results vary considerably when we modify the hyperparameters C and gamma, and their optimal combination depends on the dataset. In our example, we experimented with different values and concluded that C=1 and gamma=0.2 are the best choices for achieving the highest possible accuracy.  
Compared with the previous classifiers, it is safe to say that a properly tuned SVM classifier performs much better than a Bayes classifier. The comparison with k-NN is less clear, since the difference between their accuracies is small; in our case the two are very close, with the SVM being slightly less accurate.