# Support Vector Machine (SVM)

Experimenting with the SVM algorithm on a multidimensional dataset.

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# auxiliary functions
def load_data(file1: str, file2: str):
    """
    Read two header-less csv files and stack them into one labelled dataset.

    Rows read from `file1` are labelled 0 and rows read from `file2` are
    labelled 1.

    Parameters
    ----------
    file1 : str
        csv file (no header row) with the samples of the first class.
    file2 : str
        csv file (no header row) with the samples of the second class.

    Returns
    -------
    x : np.ndarray
        Input data: the rows of `file1` followed by the rows of `file2`.
    y : np.ndarray
        Output data: one float label (0.0 or 1.0) per row of `x`.

    """
    # Read both files in order; index 0 is the first class, index 1 the second
    frames = [pd.read_csv(path, header=None) for path in (file1, file2)]
    # Stack the samples of both classes into a single array
    x = pd.concat(frames).to_numpy()
    # Labels: zeros for every row of the first file, ones for the remainder
    n_first = len(frames[0])
    y = np.concatenate((np.zeros(n_first), np.ones(x.shape[0] - n_first)))

    return x, y

In [None]:
from os.path import join
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# folder holding the csv files, and the file pairs to process
# (each pair supplies the two classes of one dataset)
root_dir = "data"
data_files = [
    ["EarthSpace.csv", "MedSci.csv"],
    ["LifeSci.csv", "Agri.csv"],
]

# hyper-parameter grids
# values of the regularisation parameter C
c_values = [0.01, 0.1, 1.0, 10.0, 100.0]
# values of gamma
gamma_values = [0.01, 0.1, 1.0, 10.0, 100.0]
# degrees of the polynomial kernel
poly_degrees = list(range(2, 7))
# names of the scaling variants
scaling_methods = ["no scaling", "standard scaling", "min-max scaling"]
# which accuracy is reported
accuracy_types = ["train", "test"]

# results table: one row per (scaling, c), one column per
# (kernel, gamma or degree, train/test); the linear kernel has no extra
# setting, so its middle level is the empty string
index = pd.MultiIndex.from_product(
    [scaling_methods, c_values],
    names=["scaling", "c"],
)
column_index = pd.MultiIndex.from_tuples(
    [("linear", "", split) for split in accuracy_types]
    + [("rbf", g, split) for g in gamma_values for split in accuracy_types]
    + [("poly", d, split) for d in poly_degrees for split in accuracy_types],
    names=["kernel", "gamma/degree", "train/test"],
)
df = pd.DataFrame(index=index, columns=column_index)

# Scaler class used for each scaling method name; None means the classifier
# is trained on the raw features.
_SCALER_FACTORIES = {
    "no scaling": None,
    "standard scaling": StandardScaler,
    "min-max scaling": MinMaxScaler,
}


def _build_classifier(scaling, **svc_params):
    """
    Build an SVC, preceded by the scaler that `scaling` names (if any).

    Parameters
    ----------
    scaling : str
        One of the keys of `_SCALER_FACTORIES`; an unknown name raises
        KeyError instead of silently training without scaling.
    **svc_params
        Keyword arguments forwarded unchanged to `SVC`.

    Returns
    -------
    A bare `SVC` for "no scaling", otherwise a scaler + `SVC` pipeline.

    """
    scaler_factory = _SCALER_FACTORIES[scaling]
    classifier = SVC(**svc_params)
    if scaler_factory is None:
        return classifier
    return make_pipeline(scaler_factory(), classifier)


def _evaluate_kernel(kernel, setting, c_value, x_train, x_test, y_train, y_test, **svc_params):
    """
    Fit one SVC per scaling method and store its accuracies in `df`.

    Parameters
    ----------
    kernel : str
        Kernel name; also the first level of the result column.
    setting : str, float or int
        Second level of the result column (gamma, degree, or "" for linear).
    c_value : float
        Regularisation parameter C; also the second level of the result row.
    x_train, x_test, y_train, y_test : np.ndarray
        Training and test split used to fit and score the models.
    **svc_params
        Extra keyword arguments for `SVC` (e.g. `gamma`, `degree`).

    """
    # The 1-based position only numbers the progress message
    # (linear1, linear2, ...), following the order of `scaling_methods`.
    for position, scaling in enumerate(scaling_methods, start=1):
        model = _build_classifier(scaling, C=c_value, kernel=kernel, **svc_params)
        model.fit(x_train, y_train)
        print(f'{kernel}{position} fitted')
        df.loc[(scaling, c_value), (kernel, setting, "train")] = model.score(x_train, y_train)
        df.loc[(scaling, c_value), (kernel, setting, "test")] = model.score(x_test, y_test)


for data_file in data_files:
    # load data
    x, y = load_data(join(root_dir, data_file[0]), join(root_dir, data_file[1]))
    # split data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    # iterate the values directly: the row MultiIndex named `index` must not
    # be rebound to an unused loop counter
    for c_value in c_values:
        print('c =', c_value)
        # linear kernel: no extra setting, hence the empty column level
        _evaluate_kernel("linear", "", c_value, x_train, x_test, y_train, y_test)

        # gamma (rbf) and degree (poly) are unrelated settings that merely
        # share this loop; zip stops at the shorter list, so both lists must
        # have the same length for every column to be filled
        for gamma_value, poly_degree in zip(gamma_values, poly_degrees):
            print('gamma =', gamma_value, 'degree =', poly_degree)
            # polynomial kernel: gamma is left at the SVC default for every
            # scaling method, so the three variants differ only in scaling
            # and the result column is fully described by the degree
            _evaluate_kernel("poly", poly_degree, c_value,
                             x_train, x_test, y_train, y_test, degree=poly_degree)
            # rbf kernel
            _evaluate_kernel("rbf", gamma_value, c_value,
                             x_train, x_test, y_train, y_test, gamma=gamma_value)

    # save and print results; the report is named after the two files of the
    # pair (extension dropped), e.g. results_EarthSpace_MedSci.html
    stems = [name.rsplit(".", 1)[0] for name in data_file]
    report_name = "results_" + "_".join(stems) + ".html"
    df.to_html(report_name)
    print(df)