# Data understanding and preprocessing

### Data load

In [8]:
import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import make_scorer, zero_one_loss
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [9]:
y_train = np.loadtxt('data/svm_binary/y_train_binary.csv', delimiter=',')
y_test = np.loadtxt('data/svm_binary/y_test_binary.csv', delimiter=',')
X_train = np.loadtxt('data/svm_binary/X_train_binary.csv', delimiter=',')
X_test = np.loadtxt('data/svm_binary/X_test_binary.csv', delimiter=',')

In [10]:
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print(X_train[0])

X_train shape: (150, 61)
X_test shape: (164, 61)
[ 2.3350e-01  0.0000e+00  7.5680e-02  1.6718e-01  3.3433e-01  5.6597e-01
  8.3368e-01 -2.6516e+00  2.8234e+00  1.2917e-01  7.7993e-02  4.4053e-02
  1.9141e-02  1.3193e+00 -1.5318e+00  2.9943e+00  1.5345e+00  6.8333e-03
  2.4296e-03  3.1153e-04  1.3678e-04  5.8885e+00 -3.2956e-01 -1.6802e-01
 -2.1903e+00  5.7364e-03  6.6731e-04  1.7311e-04  1.1276e-05  4.7518e+00
 -2.5333e+00  1.6317e+00 -9.6839e-01  5.2901e-02  4.4408e-02  4.4636e+00
  3.6019e+00  1.1983e+01  3.0226e-01  2.9189e-01  1.3080e-06  2.5472e-03
 -7.2104e-06  2.6000e+01  2.8000e-02  6.5919e-01  9.7047e-01  3.0958e-02
  2.9107e+00  6.7857e-01  2.3018e+00  3.1008e-01  2.9314e-01  2.8908e+00
  8.0693e-02  4.0705e-01  1.4590e+00  1.6782e+00  1.1629e-01 -2.5700e-01
  8.6146e-01]


### Data preprocessing

Get the frequencies of each class in the training set

In [11]:
class_counts = np.unique(y_train, return_counts=True)
total_data_points = len(y_train)

class_frequencies = class_counts[1] / total_data_points

for class_label, frequency in zip(class_counts[0], class_frequencies):
    print(f"Class {class_label}: {frequency:.2%}")


Class -1.0: 45.33%
Class 1.0: 54.67%


Transform data to zero mean and unit variance

In [12]:
print("Mean:", np.mean(X_train, axis=0))
print("Variance:", np.var(X_train, axis=0))


Mean: [ 3.12016667e-01  1.19002133e-03  8.88565533e-02  1.69619973e-01
  2.90525800e-01  4.93438333e-01  7.56135600e-01 -2.14976509e+00
  2.09830876e+00  1.11999747e-01  1.07688547e-01  5.00619593e-02
  4.75127227e-02  6.85888707e-01 -1.59077333e+00  1.90434667e-01
  1.52237987e+00  1.55371807e-02  2.57776455e-03  3.00196533e-03
  4.61159247e-04  1.98633014e+00 -2.47071107e-01  5.66055780e-02
 -4.41235733e-02  1.10457260e-02  2.56409825e-03  8.67068651e-04
  2.19285465e-04  2.85879909e+00  3.16466867e-02 -4.16886333e-02
 -2.71012920e-01  1.51912793e-01  1.13373287e-01  4.52113072e+00
  6.95909174e+00  1.49569067e+01  4.53269133e-01  3.68771400e-01
  3.97319200e-06  3.89819683e-03  3.48878000e-07  1.21960000e+02
  4.08200000e-02  6.38288667e-01  1.27133513e+00  6.08435527e-02
  1.58987833e+00  6.07201667e-01  1.76779860e+00  3.70571867e-01
  3.52433333e-01  3.00013533e+00  3.23182422e-01  4.18279340e-01
  1.69910800e+00  3.15320498e+00  1.39558713e-01 -1.46031785e-01
  8.09072933e-01]
V

In [13]:
def fnorm(X):
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    X_normalized = (X - mean) / std
    return X_normalized

X_train_normalized = fnorm(X_train)

# Compute the mean
mean = np.mean(X_train_normalized, axis=0)

# Compute the variance
variance = np.var(X_train_normalized, axis=0)

print("Mean:", mean)
print("Variance:", variance)

Mean: [ 1.77635684e-17  7.10542736e-17 -7.32747196e-17  9.66634180e-16
  9.09642732e-16  9.32587341e-17 -1.01548399e-15  1.62832710e-16
  5.76575824e-16 -2.44249065e-16 -9.39988828e-16 -1.74675089e-16
 -3.25665421e-17 -5.19954450e-17 -2.42028619e-15  1.48029737e-18
  2.82292708e-15  1.19163938e-16  5.55111512e-17  1.30636243e-16
 -1.73934941e-17  7.69754630e-17 -2.81256500e-17 -4.32986980e-17
 -2.66453526e-17 -4.46309656e-16  5.62512999e-17  2.43508917e-16
  4.10782519e-17  1.65793305e-16  2.96059473e-17  1.77635684e-17
 -7.40148683e-18 -2.10757338e-16  6.49480469e-16 -2.07241631e-17
  2.63122857e-16 -5.18104078e-17  1.12391578e-15 -1.03102712e-15
 -1.83556873e-16 -8.28966525e-17  2.96059473e-17  7.03141249e-17
 -3.61192557e-16  1.73194792e-15 -1.55581566e-15 -2.54056035e-16
 -7.53101285e-16  4.76655752e-16  1.15537209e-15 -2.11682523e-15
  1.06581410e-16  1.58243788e-15  7.28306304e-16 -2.87917838e-16
  3.30106313e-16  5.75835675e-16 -1.98359847e-15 -2.69414121e-16
  4.20404452e-15]
V

# Model selection using grid-search

In [14]:
# Define the parameter grid
param_grid = {'C': [0.1, 1, 10, 100], 
              'gamma': [0.01, 0.1, 1, 10, 100]}
svm = SVC()

# greater_is_better = False, because we want to minimize the 0-1 loss
custom_scorer = make_scorer(zero_one_loss, greater_is_better=False)
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring=custom_scorer)

# Fit the data to perform grid search
grid_search.fit(X_train_normalized, y_train)

# Report results
print("Best Parameters:", grid_search.best_params_)
# I negated the score when setting up the greater_is_better=False, so I need to un-negate it. Hence the minus sign.
print("Best Score (0-1 loss):", -grid_search.best_score_) 


Best Parameters: {'C': 1, 'gamma': 0.01}
Best Score (0-1 loss): 0.14666666666666664


In [15]:
from sklearn.metrics import zero_one_loss
## Evaluation
test_score = grid_search.best_estimator_.score(fnorm(X_test), y_test)
print("Test Score:", test_score)
# Calculate zero-one loss (Could also have subtracted the test_score from 1)
y_pred = grid_search.best_estimator_.predict(fnorm(X_test))
zero_one_loss_value = zero_one_loss(y_test, y_pred)
print("Zero-One Loss:", zero_one_loss_value)



Test Score: 0.7987804878048781
Zero-One Loss: 0.2012195121951219


# Support vectors

In [16]:
C_values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
gamma_value = 0.1  # Set your desired value for gamma
for elm in C_values:
    svm = SVC(C=elm, gamma=gamma_value)
    svm.fit(X_train_normalized, y_train)
    
    n_support_vectors = svm.n_support_ # Count the number of support vectors per class
    support_vector_indices = svm.support_ # Get indices of support vectors

    # Count the number of free and bounded support vectors
    n_free = len(support_vector_indices) - sum(n_support_vectors)
    print(support_vector_indices, n_support_vectors)
    n_bounded_support_vectors = sum(n_support_vectors)

    print("C:", elm)
    print("Number of Free Support Vectors:", n_free)
    print("Number of Bounded Support Vectors:", n_bounded_support_vectors)
    print(" ")

[  1   2   4   6   8   9  11  12  13  19  24  25  30  33  35  39  41  43
  44  45  47  50  51  52  53  55  56  57  59  60  63  65  67  70  73  77
  79  87  90  91  92  94  95  98 100 101 105 109 110 111 118 120 123 126
 127 128 130 131 134 135 136 137 139 141 143 145 147 148   0   3   5   7
  10  14  15  16  17  18  21  22  23  26  27  28  29  31  32  34  36  37
  38  40  42  46  48  49  58  61  64  66  68  71  72  74  75  76  80  81
  84  85  86  88  89  93  96  99 102 103 104 106 107 108 114 115 116 119
 121 124 125 129 132 138 140 142 146 149] [68 68]
C: 0.0001
Number of Free Support Vectors: 0
Number of Bounded Support Vectors: 136
 
[  1   2   4   6   8   9  11  12  13  19  24  25  30  33  35  39  41  43
  44  45  47  50  51  52  53  55  56  57  59  60  63  65  67  70  73  77
  79  87  90  91  92  94  95  98 100 101 105 109 110 111 118 120 123 126
 127 128 130 131 134 135 136 137 139 141 143 145 147 148   0   3   5   7
  10  14  15  16  17  18  20  21  22  23  26  27  28  29  31  