In [None]:
import pandas as pd
import numpy as np
import jupyter_black

jupyter_black.load()
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [151]:
# Load dataframe with breast cancer data

columns = list(np.arange(0, 32, 1))
df = pd.read_csv("wdbc.data.csv", names=columns)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [152]:
# Encode class column as -1 / 1 to fit SVM algorithm
# -1 is malignant and 1 is benign

df[1] = np.where(df[1] == "M", -1, 1)

In [171]:
# Shuffle data in the data frame before train / test split

df_shuffled = df.sample(frac=1)

In [176]:
# Create 569 x 1 class label array
# Split into train and test sets

Y = df_shuffled[1].to_numpy()
Y_train = Y[:500]
Y_test = Y[500:]

In [185]:
# Create 569 x 30 covariate matrix
# Each column represents one observation with 30 covariates

X = df_shuffled.drop(columns=[0, 1]).to_numpy()

# Perform normalization to speed up computations
X = Normalizer().fit_transform(X)

# Split into train and test sets
X_train = X[:500].T
X_test = X[500:].T

In [201]:
# Create 30 x 1 weight vector with initial weights
# Also create 569 x 1 intercept / bias vector with initial value

W = np.random.normal(loc=0, scale=0.1, size=X_train.shape[0])
b_train = np.ones(X_train.shape[1])
b_test = np.ones(X_test.shape[1])

In [193]:
# Transform class label vector into triangular matrix with labels on diagonal

Y_triang = np.zeros((X_train.shape[1], X_train.shape[1]))
np.fill_diagonal(Y_triang, Y_train)

In [None]:
# Initialize 569 x 1 error / residual vector, and penalty term C

# epsilon_vec = np.zeros(X.shape[1])
C = 1000

In [202]:
# Check that we have right shapes

Y_train.shape, Y_test.shape, Y_triang.shape, X_train.shape, X_test.shape, W.shape, b_train.shape, b_test.shape

((500,), (69,), (500, 500), (30, 500), (30, 69), (30,), (500,), (69,))

In [195]:
def calculate_value_per_obs(W):

    # Predicted value for each x_i with current weights, times actual class label
    pred_actual_val = (W.T @ X_train + b_train) @ Y_triang

    return pred_actual_val

In [196]:
def calculate_epsilon(W):

    # Predicted value for each x_i with current weights, times actual class label
    pred_actual_val = calculate_value_per_obs(W)

    # Epsilon value for each x_i on the wrong side of support vector
    epsilon_vec = np.where(pred_actual_val < 1, 1 - pred_actual_val, 0)

    return epsilon_vec

In [197]:
def objective_function(W):  # , C, epsilon_vec):

    # Call function to calculate value per observation
    pred_actual_val = calculate_value_per_obs(W)

    # Call function to calculate current epsilon vector
    epsilon_vec = calculate_epsilon(W)

    # Calculate value of function for current weights and deviations
    function_val = 0.5 * (W.T @ W) + C * np.sum(epsilon_vec)

    return function_val

In [198]:
def constraint_function(W):  # , X, b, Y_triang):

    # Call function to calculate value per observation
    pred_actual_val = calculate_value_per_obs(W)

    # Call function to calculate current epsilon vector
    epsilon_vec = calculate_epsilon(W)

    # Difference between these
    constraint = pred_actual_val - 1 + epsilon_vec

    return constraint

In [203]:
# Set up and run the optimization

epsilon_cons = {"type": "ineq", "fun": constraint_function}

optim = optimize.minimize(
    fun=objective_function,
    x0=W,
    method="SLSQP",
    constraints=epsilon_cons,
    options={"maxiter": 1000},
)

# Store final 
W_final = optim.x
optim

     fun: 90201.9043717062
     jac: array([-1.32832031e+01, -1.95585938e+01, -8.72695312e+01, -9.15634766e+01,
       -2.03125000e-01, -2.35351562e-01, -2.22656250e-01, -1.18164062e-01,
       -2.50000000e-01, -9.37500000e-02, -4.17968750e-01, -1.70605469e+00,
       -2.27343750e+00, -3.44638672e+01, -1.26953125e-02, -3.32031250e-02,
       -2.92968750e-02, -9.76562500e-03, -7.81250000e-03, -2.92968750e-03,
       -1.59199219e+01, -2.62861328e+01, -1.02220703e+02, -2.07615234e+02,
       -2.51953125e-01, -3.77929688e-01, -5.51757812e-01, -1.30859375e-01,
       -2.94921875e-01, -1.51367188e-01])
 message: 'Optimization terminated successfully'
    nfev: 2132
     nit: 67
    njev: 67
  status: 0
 success: True
       x: array([ 2.75394944e+01,  1.11984569e+01,  8.00488061e+01,  5.34444277e+00,
       -5.88177543e-01, -2.73334216e+00, -3.92686340e+00, -1.65935811e+00,
       -7.40623711e-01, -1.41446564e-01,  5.95010437e-01, -8.61409557e-01,
       -8.24309243e+00, -1.36459299e+01, -6.

In [205]:
# Make predictions on test and train set

pred_test = np.where(W_final @ X_test >= 0, 1, -1)
pred_train = np.where(W_final @ X_train >= 0, 1, -1)

In [206]:
# Evaluate accuracy of predictions

test_accuracy = np.mean(pred_test == Y_test)
train_accuracy = np.mean(pred_train == Y_train)
test_accuracy, train_accuracy

(0.855072463768116, 0.888)

In [208]:
# Compare to sklearn SVC estimator

target = df[1].to_numpy()
features = df.drop(columns=[0, 1]).to_numpy()
features = Normalizer().fit_transform(features)

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=69)

svc = SVC()
svc.fit(x_train, y_train)
pred_test_skl = svc.predict(x_test)
pred_train_skl = svc.predict(x_train)

test_accuracy_skl = np.mean(pred_test_skl == y_test)
train_accuracy_skl = np.mean(pred_train_skl == y_train)
test_accuracy_skl, train_accuracy_skl

(0.927536231884058, 0.888)