In [29]:
# Import Library and load data
import sklearn
import pandas as pd
import numpy as np
import warnings

from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
# ignore all warnings
warnings.filterwarnings('ignore')

iris = sklearn.datasets.load_iris()
x, y = iris.data, iris.target

In [30]:
# Question 2. Split the dataset into a training set and a test
x_train, x_test, y_train, y_test=train_test_split(x,y, test_size=0.2,random_state=12345)

In [23]:
# Question 3.1.Run K-NN on the Iris dataset with K=3 with 
for x in ['manhattan', 'euclidean', 'cosine']:
    knn = KNeighborsClassifier(n_neighbors=3, metric=x)
    knn.fit(x_train, y_train)
    y_pred=knn.predict(x_test)
    print("Confusion Matrix with {} distance metric: \n{}".format(x,confusion_matrix(y_test, y_pred)))
    print("Accuracy with {} distance metric: {:.2f}%\n".format(x,accuracy_score(y_test, y_pred)))

Confusion Matrix with manhattan distance metric: 
[[11  0  0]
 [ 0 11  1]
 [ 0  0  7]]
Accuracy with manhattan distance metric: 0.97%

Confusion Matrix with euclidean distance metric: 
[[11  0  0]
 [ 0 11  1]
 [ 0  0  7]]
Accuracy with euclidean distance metric: 0.97%

Confusion Matrix with cosine distance metric: 
[[11  0  0]
 [ 0  9  3]
 [ 0  0  7]]
Accuracy with cosine distance metric: 0.90%



In [24]:
# Question 3.1.Run K-NN on the Iris dataset with K=3 with KFold
kf = KFold(n_splits=5, shuffle=True, random_state=12345)
for m, n in kf.split(iris.data):
    x_train, x_test = iris.data[m], iris.data[n]
    y_train, y_test = iris.target[m], iris.target[n]
for x in ['manhattan', 'euclidean', 'cosine']:
    knn = KNeighborsClassifier(n_neighbors=3, metric=x)
    knn.fit(x_train, y_train)
    y_pred=knn.predict(x_test)
    print("Confusion Matrix with {} distance metric: \n{}".format(x,confusion_matrix(y_test, y_pred)))
    print("Accuracy with {} distance metric: {:.2f}%\n".format(x,accuracy_score(y_test, y_pred)))

Confusion Matrix with manhattan distance metric: 
[[10  0  0]
 [ 0  8  0]
 [ 0  2 10]]
Accuracy with manhattan distance metric: 0.93%

Confusion Matrix with euclidean distance metric: 
[[10  0  0]
 [ 0  8  0]
 [ 0  1 11]]
Accuracy with euclidean distance metric: 0.97%

Confusion Matrix with cosine distance metric: 
[[10  0  0]
 [ 0  8  0]
 [ 0  0 12]]
Accuracy with cosine distance metric: 1.00%



In [25]:
# Question 3.2. Find the best number of neighbours using k-fold cross validation
k_values = [1, 2, 3,4,5,6,7,8,9,10]
best_acc = 0
best_k = 0
for k in [1,2,3,4,5,6,7,8,9,10]:
    knn = KNeighborsClassifier(n_neighbors=k, metric='cosine')
    scores = cross_val_score(knn, iris.data, iris.target, cv=5)
    accuracy = scores.mean()
    print("K = %d, accuracy = %.3f" % (k, accuracy))
    if accuracy > best_acc:
        best_acc = accuracy
        best_k = k

print("Best number of neighbors: {}".format(best_k))
print("Final accuracy: {:.2f}%".format(best_acc * 100)) 

K = 1, accuracy = 0.960
K = 2, accuracy = 0.953
K = 3, accuracy = 0.980
K = 4, accuracy = 0.980
K = 5, accuracy = 0.980
K = 6, accuracy = 0.973
K = 7, accuracy = 0.960
K = 8, accuracy = 0.973
K = 9, accuracy = 0.967
K = 10, accuracy = 0.967
Best number of neighbors: 3
Final accuracy: 98.00%


In [32]:
# Question 4.1.train SVM's with different hyperparameter
# Split the dataset into a training set and a test
#x_train, x_test, y_train, y_test=train_test_split(x,y, test_size=0.2,random_state=12345)
#for x in [0.001, 0.1, 1]     # for Linear Kernel
for x in [0.001, 0.1]:        # for Poly Kernel
    #model = SVC(C=x, kernel='linear')    # for Linear Kernel
    model = SVC(C=x, kernel='poly', degree=2)    # for Poly Kernel
    kf = KFold(n_splits=10, shuffle=True, random_state=12345)
    cv_scores = []
    cv_train_losses = []
    cv_train2_losses = []
    cv_val_losses = []
    cv_test_losses = []
    for m, n in kf.split(x_train):
        x_train_fold, x_val_fold = x_train[m], x_train[n]
        y_train_fold, y_val_fold = y_train[m], y_train[n]
        model.fit(x_train_fold, y_train_fold)
        # predict and loss for train and validation in train data
        y_pred_train = model.predict(x_train_fold)
        loss_train = mean_squared_error(y_train_fold, y_pred_train)
        cv_train_losses.append(loss_train)
        y_pred_val = model.predict(x_val_fold)
        loss_val = mean_squared_error(y_val_fold, y_pred_val)
        cv_val_losses.append(loss_val)
        # predict and loss for train -The sum of Train and validation on the Kfold
        y_pred_train2 = model.predict(x_train)
        loss_train2 = mean_squared_error(y_train, y_pred_train2)
        cv_train2_losses.append(loss_train2)
        # Accuracy and MSE for test data set
        y_pred_test = model.predict(x_test)
        loss_test = mean_squared_error(y_test, y_pred_test)
        cv_test_losses.append(loss_test)
        acc = accuracy_score(y_test, y_pred_test)
        cv_scores.append(acc)
    # Compute and print the mean and the cross-validation scores
    mean_train_loss = sum(cv_train_losses) / len(cv_train_losses)
    mean_val_loss = sum(cv_val_losses) / len(cv_val_losses)
    mean_train2_loss = sum(cv_train2_losses) / len(cv_train2_losses)
    mean_test_loss = sum(cv_test_losses) / len(cv_test_losses)
    mean_cv_score = sum(cv_scores) / len(cv_scores)
    print("with polynomial kernel C={}, Mean Sub-Train Loss on Train Data is: {} ".format(x, mean_train_loss))
    print("with polynomial kernel C={}, Mean Validation Loss on Train Data is: {}".format(x, mean_val_loss))
    print("with polynomial kernel C={}, Mean Train Loss on Train Data is: {}".format(x, mean_train2_loss))
    print("with polynomial kernel C={}, Mean Test Loss on Test Data is: {}".format(x, mean_test_loss))
    print("with polynomial kernel C={}, Mean accuracy on Test Data is: {}".format(x, mean_cv_score))

with polynomial kernel C=0.001, Mean Sub-Train Loss on Train Data is: 1.3629629629629627 
with polynomial kernel C=0.001, Mean Validation Loss on Train Data is: 1.5
with polynomial kernel C=0.001, Mean Train Loss on Train Data is: 1.376666666666667
with polynomial kernel C=0.001, Mean Test Loss on Test Data is: 1.56
with polynomial kernel C=0.001, Mean accuracy on Test Data is: 0.32
with polynomial kernel C=0.1, Mean Sub-Train Loss on Train Data is: 0.05185185185185185 
with polynomial kernel C=0.1, Mean Validation Loss on Train Data is: 0.041666666666666664
with polynomial kernel C=0.1, Mean Train Loss on Train Data is: 0.05083333333333334
with polynomial kernel C=0.1, Mean Test Loss on Test Data is: 0.006666666666666666
with polynomial kernel C=0.1, Mean accuracy on Test Data is: 0.9933333333333334


In [27]:
#Question 5.1.train simplest baselines, i.e., random classification and majority classification
# Random classification with K-fold
dummy_clf_random = DummyClassifier(strategy="uniform")
scores_random = cross_val_score(dummy_clf_random, iris.data, iris.target, cv=5)
accuracy_random = np.mean(scores_random)

# Majority classification with K-fold
dummy_clf_majority = DummyClassifier(strategy="most_frequent")
scores_majority = cross_val_score(dummy_clf_majority, iris.data, iris.target, cv=5)
accuracy_majority = np.mean(scores_majority)

print("Accuracy of random classification: {}".format(accuracy_random))
print("Accuracy of majority classification: {}".format(accuracy_majority))

Accuracy of random classification: 0.36
Accuracy of majority classification: 0.3333333333333333
