In [0]:
import csv
import pandas as pd
import numpy as np
import nltk
from matplotlib import pyplot as plt
from mlxtend.plotting import plot_decision_regions
from prettytable import PrettyTable
import seaborn as sns

from sklearn.datasets import load_wine
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Utilities


In [0]:
def plot_2D_graph(X, y, x_label='x', y_label='y', title='title'):
  """Scatter-plot the first two columns of X, one colour per class.

  Args:
    X: 2-D numpy array; column 0 is drawn on the x axis, column 1 on the y axis.
    y: numpy array of class labels, one per row of X.
    x_label: text for the x axis.
    y_label: text for the y axis.
    title: figure title.

  Returns:
    None. The figure is created on the current matplotlib backend.
  """
  classes = np.unique(y)
  # One palette entry per class actually present, addressed by position so
  # that labels which are not exactly 0..n-1 still get a valid colour.
  palette = np.array(sns.color_palette("hls", len(classes)))
  fig = plt.figure(figsize=(5, 5))
  ax = fig.add_subplot(111)
  for idx, c in enumerate(classes):
    x_c = X[y == c]
    ax.scatter(x_c[:, 0], x_c[:, 1], lw=0,
               c=[palette[idx]] * len(x_c), label=f'class {c}')
  # The per-class labels are only visible once a legend is drawn.
  ax.legend()

  plt.xlabel(x_label)
  plt.ylabel(y_label)
  fig.suptitle(title)
  return

In [0]:
def plot_boundaries(X, y, clf, x_label='x', y_label='y', title='title', name=0):
  """Draw the decision regions of a fitted classifier over the samples (X, y).

  Args:
    X: 2-feature sample matrix passed to mlxtend's plot_decision_regions.
    y: class labels for X.
    clf: fitted classifier whose regions are drawn.
    x_label: text for the x axis.
    y_label: text for the y axis.
    title: plot title.
    name: currently unused; kept so existing callers that pass it keep working.

  Returns:
    None. The figure is shown with plt.show().
  """
  plt.figure(figsize=(5, 5))
  ax = plot_decision_regions(X, y, clf=clf)
  plt.xlabel(x_label)
  plt.ylabel(y_label)
  plt.title(title)
  # Build the legend text from the labels the axes actually carry instead of
  # assuming exactly three classes.
  handles, labels = ax.get_legend_handles_labels()
  plt.legend(handles, [f'class {lab}' for lab in labels])
  plt.show()
  return

In [0]:
def plot_graph_line(X, y, x_label='x', y_label='y', title='title'):
  """Show a 5x5 line plot of y against X with the given axis labels and title.

  Args:
    X: x coordinates.
    y: y coordinates.
    x_label: text for the x axis.
    y_label: text for the y axis.
    title: plot title.

  Returns:
    None. The figure is displayed with plt.show().
  """
  plt.figure(figsize=(5, 5))
  plt.plot(X, y)
  # Apply the three text decorations in the same order: x label, y label, title.
  decorations = ((plt.xlabel, x_label), (plt.ylabel, y_label), (plt.title, title))
  for set_text, text in decorations:
    set_text(text)
  plt.show()

In [0]:
#find the best K
def K_Nearest_Neghbors(X_train, X_val, y_train, y_val, X_label="x", y_label="y"):
  """Fit k-NN for K in [1, 3, 5, 7] and return the best one on validation.

  For each K the classifier is fitted on the training split, its decision
  boundaries are plotted, and its accuracy on the validation split is stored.
  A line plot of accuracy against K is shown at the end.

  Args:
    X_train, y_train: training samples and labels.
    X_val, y_val: validation samples and labels.
    X_label, y_label: axis labels for the boundary plots.

  Returns:
    The fitted classifier with the highest validation accuracy. On a tie the
    later (larger) K wins because the comparison is `>=`.
  """
  neighbor_counts = [1, 3, 5, 7]
  val_accuracies = []
  winner, winner_acc = '', 0
  for n_neighbors in neighbor_counts:
    model = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    plot_boundaries(X_train, y_train, model, X_label, y_label, f'K = {n_neighbors}')

    acc = accuracy_score(y_val, model.predict(X_val))
    val_accuracies.append(acc)
    if acc >= winner_acc:
      winner, winner_acc = model, acc

  plot_graph_line(neighbor_counts, val_accuracies,
                  "N° of Nearest Neighbors", "Accuracy", "Accuracy on Validation")
  return winner

In [0]:
#find the best C
#use kernel linear 
def SVM_linear(X_train, X_val, y_train, y_val, X_label="x", y_label="y"):
  """Fit linear-kernel SVMs over a grid of C values and return the best one.

  For each C the model is fitted on the training split, its decision
  boundaries are plotted, and its validation accuracy is stored. A line plot
  of accuracy against C is shown at the end.

  Args:
    X_train, y_train: training samples and labels.
    X_val, y_val: validation samples and labels.
    X_label, y_label: axis labels for the boundary plots.

  Returns:
    The fitted SVC with the highest validation accuracy (the first one on a
    tie), or '' if no model scores above 0.
  """
  penalties = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
  val_accuracies = []
  chosen, chosen_acc = '', 0
  for penalty in penalties:
    svm = SVC(C=penalty, kernel="linear").fit(X_train, y_train)
    plot_boundaries(X_train, y_train, svm, X_label, y_label, f'C = {penalty}', penalty)

    acc = accuracy_score(y_val, svm.predict(X_val))
    val_accuracies.append(acc)
    if acc > chosen_acc:
      chosen, chosen_acc = svm, acc

  plot_graph_line(penalties, val_accuracies, "Value of C", "Accuracy", "Accuracy on Validation")
  return chosen

In [0]:
#find the best C
#use RBF kernel
def SVM_RBF(X_train, X_val, y_train, y_val, X_label="x", y_label="y"):
  """Fit RBF-kernel SVMs over a grid of C values and return the best one.

  Each model is fitted on the training split, its decision boundaries are
  plotted, and its validation accuracy is recorded. A line plot of accuracy
  against C is shown at the end.

  Args:
    X_train, y_train: training samples and labels.
    X_val, y_val: validation samples and labels.
    X_label, y_label: axis labels for the boundary plots.

  Returns:
    The fitted SVC with the highest validation accuracy (the first one on a
    tie), or '' if no model scores above 0.
  """
  C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
  fitted = []
  accuracies = []
  for c in C:
    model = SVC(kernel="rbf", C=c)
    model = model.fit(X_train, y_train)
    plot_boundaries(X_train, y_train, model, X_label, y_label, f'C = {c}', c)
    fitted.append(model)
    accuracies.append(accuracy_score(y_val, model.predict(X_val)))

  plot_graph_line(C, accuracies, "Value of C", "Accuracy", "Accuracy on Validation")

  # max() returns the first index holding the highest accuracy, so ties go to
  # the smallest C; a best accuracy of 0 yields the empty-string sentinel.
  top = max(range(len(C)), key=accuracies.__getitem__)
  return fitted[top] if accuracies[top] > 0 else ''

In [0]:
def SVM_RBF_gamma(X_train, X_val, y_train, y_val, X_label="x", y_label="y"):
  """Grid-search C and gamma for an RBF SVM using a held-out validation split.

  Every (C, gamma) pair is fitted on the training split and scored on the
  validation split. The accuracies are printed as a table with one row per C
  and one column per gamma, in percent.

  Args:
    X_train, y_train: training samples and labels.
    X_val, y_val: validation samples and labels.
    X_label, y_label: unused; kept for signature parity with the other helpers.

  Returns:
    The fitted SVC with the highest validation accuracy (the first one
    encountered on a tie, iterating C in the outer loop and gamma in the inner).
  """
  c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
  gamma_values = [100, 10, 1, 0.1, 1e-2, 1e-3]

  # The header is derived from the grid so the columns cannot drift from it.
  # The backslash is escaped: '\g' is an invalid escape sequence.
  table = PrettyTable(['C\\gamma'] + gamma_values)

  best_clf = None
  best_score = -1.0  # below any possible accuracy, so a model is always selected
  for c in c_values:
    row = [c]
    for gamma in gamma_values:
      clf = SVC(kernel='rbf', C=c, gamma=gamma)
      clf = clf.fit(X_train, y_train)
      score = accuracy_score(y_val, clf.predict(X_val))
      # Scale first, then round: rounding before multiplying by 100 leaves
      # floating-point noise in the printed value.
      row.append(round(score * 100, 2))

      if score > best_score:
        best_score = score
        best_clf = clf
    table.add_row(row)

  print(table)
  return best_clf

In [0]:
def SVM_KFold(X, y, X_label="x", y_label="y"):
  """Grid-search C and gamma for an RBF SVM using 5-fold cross-validation.

  Every (C, gamma) pair is scored with cross_val_score(cv=5) on (X, y) and the
  mean accuracy is kept. The means are printed as a table with one row per C
  and one column per gamma, in percent.

  Args:
    X, y: samples and labels used for cross-validation.
    X_label, y_label: unused; kept for signature parity with the other helpers.

  Returns:
    The SVC configured with the best (C, gamma) pair (the first one on a tie).
    This function never calls fit on it, so the caller is expected to fit it
    before predicting.
  """
  c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
  gamma_values = [100, 10, 1, 0.1, 1e-2, 1e-3]

  # The header is derived from the grid so the columns cannot drift from it.
  # The backslash is escaped: '\g' is an invalid escape sequence.
  table = PrettyTable(['C\\gamma'] + gamma_values)

  best_clf = None
  best_score = -1.0  # below any possible accuracy, so a model is always selected
  for c in c_values:
    row = [c]
    for gamma in gamma_values:
      clf = SVC(kernel='rbf', C=c, gamma=gamma)
      score = cross_val_score(clf, X, y, cv=5).mean()
      # Scale first, then round: rounding before multiplying by 100 leaves
      # floating-point noise in the printed value.
      row.append(round(score * 100, 2))

      if score > best_score:
        best_score = score
        best_clf = clf
    table.add_row(row)

  print(table)
  return best_clf

In [0]:
def plot_feature_pairs(data, feature_names, target=None):
  """Scatter-plot every unordered pair of features with plot_2D_graph.

  Args:
    data: 2-D numpy array with one column per entry of feature_names.
    feature_names: sequence of column names, used as axis labels.
    target: class label per row of data. When omitted, the notebook-level
      variable `y` is used, as the original implementation did.

  Returns:
    None. One figure is produced per feature pair, titled "<i> <j>".
  """
  if target is None:
    target = y  # notebook-level label vector

  n_features = len(feature_names)
  for i in range(n_features):
    for j in range(i + 1, n_features):
      # Indexing with a list builds a new array. Writing into a slice of
      # `data` instead would overwrite the dataset's first two columns.
      pair = data[:, [i, j]]
      plot_2D_graph(pair, target, feature_names[i], feature_names[j], f"{i} {j}")

  return

# Execution

In [0]:
# Load the wine dataset and pull out the three pieces used below.
dataset = load_wine()
# data[0] holds the 13 feature values of wine 0.
data, y, feature_names = (dataset[key] for key in ("data", "target", "feature_names"))

In [0]:
# Dataset statistics: number of samples per class (labels 0, 1, 2).
n = [0] * 3
for i in y:
  n[i] += 1
print("\n".join(f"samples on class {k}: {n[k]}" for k in range(3)))


samples on class 0: 59
samples on class 1: 71
samples on class 2: 48


### First 2 components

In [0]:
# Select the first two attributes.
# Indexing with a list returns a copy; the slice data[:, 0:2] is a view, so
# any later in-place write to X would silently overwrite the dataset itself.
X = data[:, [0, 1]]
X1 = feature_names[0]
X2 = feature_names[1]
plot_2D_graph(X, y, X1, X2, "first 2 features")

In [0]:
 # Stratified two-stage split: 30 % is held out for testing, then the
 # remainder is divided into training and validation.
 X_train_val, X_test, y_train_val, y_test = train_test_split(
     X, y, test_size=0.3, stratify=y, random_state=269317)

 # Original note: 50/70 * 100 = 71.43, i.e. training is meant to be about
 # 71.43 % of train+val.
 # NOTE(review): that implies a validation share near 0.29, but 0.27 is used
 # here and 0.3 in the later split on features 5 and 9 - confirm the intent.
 X_train, X_val, y_train, y_val = train_test_split(
     X_train_val, y_train_val, test_size=0.27, stratify=y_train_val, random_state=269317)

In [0]:
# Check the partition: print the per-class sample counts of each split
# (train, test, validation - same order as before).
for split_name, split_labels in (("train", y_train), ("test", y_test), ("validation", y_val)):
  n = [0, 0, 0]
  for i in split_labels:
    n[i] += 1
  print(f"{split_name} set distribution:")
  print(f"samples on class 0: {n[0]}")
  print(f"samples on class 1: {n[1]}")
  print(f"samples on class 2: {n[2]}")


In [0]:
# Find the best K on the validation split, refit that model on train+val,
# then measure accuracy on the held-out test split.
clf = K_Nearest_Neghbors(X_train, X_val, y_train, y_val, X1, X2)
clf = clf.fit(X_train_val, y_train_val)
y_pred_test = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred_test)
# accuracy_score returns a fraction in [0, 1]; scale it before adding '%'.
print(f"Accuracy score: {acc * 100:.2f} %")

In [0]:
# Find the best C for the linear-kernel SVM on the validation split, refit
# that model on train+val, then measure accuracy on the held-out test split.
clf = SVM_linear(X_train, X_val, y_train, y_val, X1, X2)
clf = clf.fit(X_train_val, y_train_val)
y_pred_test = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred_test)
# accuracy_score returns a fraction in [0, 1]; scale it before adding '%'.
print(f"Accuracy score: {acc * 100:.2f} %")

In [0]:
# Find the best C for the RBF-kernel SVM on the validation split, refit that
# model on train+val, then measure accuracy on the held-out test split.
clf = SVM_RBF(X_train, X_val, y_train, y_val, X1, X2)
clf = clf.fit(X_train_val, y_train_val)
y_pred_test = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred_test)
# accuracy_score returns a fraction in [0, 1]; scale it before adding '%'.
print(f"Accuracy score: {acc * 100:.2f} %")

In [0]:
# Tune both gamma and C on the validation split, refit the winner on
# train+val, then measure accuracy on the held-out test split.
clf = SVM_RBF_gamma(X_train, X_val, y_train, y_val, X1, X2)
clf = clf.fit(X_train_val, y_train_val)
y_pred_test = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred_test)
plot_boundaries(X_train, y_train, clf, X1, X2, '')
print(clf)
# accuracy_score returns a fraction in [0, 1]; scale it before adding '%'.
print(f"Accuracy score: {acc * 100:.2f} %")

In [0]:
# Tune gamma and C with 5-fold cross-validation on train+val, fit the selected
# configuration on train+val, then measure accuracy on the held-out test split.
clf = SVM_KFold(X_train_val, y_train_val, X1, X2)
clf = clf.fit(X_train_val, y_train_val)
y_pred_test = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred_test)
plot_boundaries(X_train_val, y_train_val, clf, X1, X2, '')
print(clf)
# accuracy_score returns a fraction in [0, 1]; scale it before adding '%'.
print(f"Accuracy score: {acc * 100:.2f} %")

### Features 5 and 9

In [0]:
# Select attributes 5 and 9.
# Build a fresh array rather than assigning into X column by column: when X is
# a slice view of `data`, in-place assignment overwrites columns 0 and 1 of
# the dataset.
X = data[:, [5, 9]]
X1 = feature_names[5]
X2 = feature_names[9]
plot_2D_graph(X, y, X1, X2, "")

In [0]:
# Stratified two-stage split of the current feature pair: 30 % is held out for
# testing, then 30 % of the remainder becomes the validation split.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=269317)

# Original note: 50/70 * 100 = 71.43, i.e. training is meant to be about
# 71.43 % of train+val.
# NOTE(review): test_size=0.3 leaves 70 % for training - confirm the intent.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.3, stratify=y_train_val, random_state=269317)

In [0]:
# Find the best K on the validation split, refit that model on train+val,
# then measure accuracy on the held-out test split.
clf = K_Nearest_Neghbors(X_train, X_val, y_train, y_val, X1, X2)
clf = clf.fit(X_train_val, y_train_val)
y_pred_test = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred_test)
# accuracy_score returns a fraction in [0, 1]; scale it before adding '%'.
print(f"Accuracy score: {acc * 100:.2f} %")
# Title the plot with the K that was actually selected, not a fixed value.
plot_boundaries(X_train, y_train, clf, X1, X2, f'K = {clf.n_neighbors}')

In [0]:
# Find the best C for the linear-kernel SVM on the validation split, refit
# that model on train+val, then measure accuracy on the held-out test split.
clf = SVM_linear(X_train, X_val, y_train, y_val, X1, X2)
clf = clf.fit(X_train_val, y_train_val)
y_pred_test = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred_test)
# accuracy_score returns a fraction in [0, 1]; scale it before adding '%'.
print(f"Accuracy score: {acc * 100:.2f} %")
# Title the plot with the C that was actually selected, not a fixed value.
plot_boundaries(X_train, y_train, clf, X1, X2, f'C = {clf.C}')

In [0]:
# Find the best C for the RBF-kernel SVM on the validation split, refit that
# model on train+val, then measure accuracy on the held-out test split.
clf = SVM_RBF(X_train, X_val, y_train, y_val, X1, X2)
clf = clf.fit(X_train_val, y_train_val)
y_pred_test = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred_test)
# accuracy_score returns a fraction in [0, 1]; scale it before adding '%'.
print(f"Accuracy score: {acc * 100:.2f} %")
# Title the plot with the C that was actually selected, not a fixed value.
plot_boundaries(X_train, y_train, clf, X1, X2, f'C = {clf.C}')

In [0]:
# Tune both gamma and C on the validation split, refit the winner on
# train+val, then measure accuracy on the held-out test split.
clf = SVM_RBF_gamma(X_train, X_val, y_train, y_val, X1, X2)
clf = clf.fit(X_train_val, y_train_val)
y_pred_test = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred_test)
# accuracy_score returns a fraction in [0, 1]; scale it before adding '%'.
print(f"Accuracy score: {acc * 100:.2f} %")
# Title the plot with the selected hyper-parameters, not fixed values.
plot_boundaries(X_train, y_train, clf, X1, X2, f'C={clf.C}   \u03B3={clf.gamma}')

In [0]:
# Tune gamma and C with 5-fold cross-validation on train+val, fit the selected
# configuration on train+val, then measure accuracy on the held-out test split.
clf = SVM_KFold(X_train_val, y_train_val, X1, X2)
clf = clf.fit(X_train_val, y_train_val)
y_pred_test = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred_test)
# accuracy_score returns a fraction in [0, 1]; scale it before adding '%'.
print(f"Accuracy score: {acc * 100:.2f} %")
# Pass an explicit title; without one the plot shows the placeholder 'title'.
plot_boundaries(X_train, y_train, clf, X1, X2, f'C={clf.C}   \u03B3={clf.gamma}')