# Data understanding and preprocessing

### Data load

In [69]:
import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import make_scorer, zero_one_loss
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [70]:
# Read the four comma-delimited CSV files into NumPy arrays:
# targets first (train, test), then the feature matrices (train, test).
y_train, y_test = (
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in ('y_train_binary.csv', 'y_test_binary.csv')
)
X_train, X_test = (
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in ('X_train_binary.csv', 'X_test_binary.csv')
)

In [71]:
# Quick sanity check of the training features: array dimensions, then the first row.
first_training_row = X_train[0]
print('X_train shape:', X_train.shape)
print(first_training_row)

X_train shape: (150, 61)
[ 2.3350e-01  0.0000e+00  7.5680e-02  1.6718e-01  3.3433e-01  5.6597e-01
  8.3368e-01 -2.6516e+00  2.8234e+00  1.2917e-01  7.7993e-02  4.4053e-02
  1.9141e-02  1.3193e+00 -1.5318e+00  2.9943e+00  1.5345e+00  6.8333e-03
  2.4296e-03  3.1153e-04  1.3678e-04  5.8885e+00 -3.2956e-01 -1.6802e-01
 -2.1903e+00  5.7364e-03  6.6731e-04  1.7311e-04  1.1276e-05  4.7518e+00
 -2.5333e+00  1.6317e+00 -9.6839e-01  5.2901e-02  4.4408e-02  4.4636e+00
  3.6019e+00  1.1983e+01  3.0226e-01  2.9189e-01  1.3080e-06  2.5472e-03
 -7.2104e-06  2.6000e+01  2.8000e-02  6.5919e-01  9.7047e-01  3.0958e-02
  2.9107e+00  6.7857e-01  2.3018e+00  3.1008e-01  2.9314e-01  2.8908e+00
  8.0693e-02  4.0705e-01  1.4590e+00  1.6782e+00  1.1629e-01 -2.5700e-01
  8.6146e-01]


### Data preprocessing

Compute the relative frequency of each class in the training set to check how balanced the two classes are.

In [72]:
# np.unique with return_counts=True yields (distinct labels, occurrences of each label).
class_counts = np.unique(y_train, return_counts=True)
unique_labels, occurrences = class_counts
total_data_points = len(y_train)

# Share of the training targets that belongs to each label.
class_frequencies = occurrences / total_data_points

# Report one line per label, formatted as a percentage with two decimals.
for class_label, frequency in zip(unique_labels, class_frequencies):
    print(f"Class {class_label}: {frequency:.2%}")


Class -1.0: 45.33%
Class 1.0: 54.67%


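Transform the data so that every feature has zero mean and unit variance. First, inspect the per-feature mean and variance of the raw training data: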

In [73]:
# Column-wise (axis=0) statistics of the raw training features, before any scaling.
raw_feature_mean = X_train.mean(axis=0)
print("Mean:", raw_feature_mean)
raw_feature_variance = X_train.var(axis=0)
print("Variance:", raw_feature_variance)


Mean: [ 3.12016667e-01  1.19002133e-03  8.88565533e-02  1.69619973e-01
  2.90525800e-01  4.93438333e-01  7.56135600e-01 -2.14976509e+00
  2.09830876e+00  1.11999747e-01  1.07688547e-01  5.00619593e-02
  4.75127227e-02  6.85888707e-01 -1.59077333e+00  1.90434667e-01
  1.52237987e+00  1.55371807e-02  2.57776455e-03  3.00196533e-03
  4.61159247e-04  1.98633014e+00 -2.47071107e-01  5.66055780e-02
 -4.41235733e-02  1.10457260e-02  2.56409825e-03  8.67068651e-04
  2.19285465e-04  2.85879909e+00  3.16466867e-02 -4.16886333e-02
 -2.71012920e-01  1.51912793e-01  1.13373287e-01  4.52113072e+00
  6.95909174e+00  1.49569067e+01  4.53269133e-01  3.68771400e-01
  3.97319200e-06  3.89819683e-03  3.48878000e-07  1.21960000e+02
  4.08200000e-02  6.38288667e-01  1.27133513e+00  6.08435527e-02
  1.58987833e+00  6.07201667e-01  1.76779860e+00  3.70571867e-01
  3.52433333e-01  3.00013533e+00  3.23182422e-01  4.18279340e-01
  1.69910800e+00  3.15320498e+00  1.39558713e-01 -1.46031785e-01
  8.09072933e-01]
V

In [74]:
def fnorm(X, mean=None, std=None):
    """Standardize the columns of ``X`` to zero mean and unit variance.

    Parameters
    ----------
    X : array-like
        Data to normalize; statistics are taken along axis 0 (per column).
    mean : array-like, optional
        Per-column mean to subtract. When omitted it is computed from ``X``.
        Pass the training-set mean to scale held-out data consistently.
    std : array-like, optional
        Per-column standard deviation to divide by. When omitted it is
        computed from ``X`` (population standard deviation, ``np.std``).

    Returns
    -------
    numpy.ndarray
        ``(X - mean) / std``. Columns whose standard deviation is zero are
        only centered (divided by 1), so they do not turn into NaN/inf.
    """
    X = np.asarray(X)
    if mean is None:
        mean = np.mean(X, axis=0)
    if std is None:
        std = np.std(X, axis=0)

    # A constant column has std == 0; dividing by it would yield NaN (0/0)
    # or inf. Substitute 1 so such columns are centered but left unscaled.
    std = np.asarray(std)
    safe_std = np.where(std == 0, 1.0, std)

    X_normalized = (X - mean) / safe_std
    return X_normalized

# Standardize the training features with fnorm.
X_train_normalized = fnorm(X_train)

# Verify the result: per-column mean and variance of the normalized data.
mean = X_train_normalized.mean(axis=0)
variance = X_train_normalized.var(axis=0)

print("Mean:", mean)
print("Variance:", variance)

Mean: 8.035732067613918e-17
Variance: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


# Model selection using grid search

In [75]:
# Hyperparameter candidates: every (C, gamma) combination is evaluated.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[0.1, 1, 10, 100],
)

# Estimator to tune: an SVC with default settings (C and gamma are set by the search).
svm = SVC()

# Score by 0-1 loss. greater_is_better=False tells make_scorer that a lower
# value is better, so the search works with the negated loss.
custom_scorer = make_scorer(zero_one_loss, greater_is_better=False)

# 5-fold cross-validated grid search driven by the 0-1 loss scorer.
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring=custom_scorer,
    cv=5,
)

# Run the search on the normalized training data.
grid_search.fit(X_train_normalized, y_train)

# best_score_ holds the negated loss (see the scorer above); flip the sign
# back so the reported number is the actual 0-1 loss.
best_zero_one_loss = -grid_search.best_score_
print("Best Parameters:", grid_search.best_params_)
print("Best Score (0-1 loss):", best_zero_one_loss)


Best Parameters: {'C': 10, 'gamma': 0.1}
Best Score (0-1 loss): 0.20666666666666664
