Load data

In [121]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def load_spambase():
    spambase = pd.read_csv('spambase.data')
    spambase_y = spambase.pop('is_spam')
    spambase = StandardScaler().fit(spambase).transform(spambase)
    X_train, X_test, y_train, y_test = train_test_split(spambase, spambase_y, test_size=0.8, random_state=3000)
    return X_train, y_train, X_test, y_test

def load_perceptron_data():
    perceptron_dataset = pd.read_csv('perceptronData.txt', delim_whitespace=True, header=None)
    perceptron_labels = perceptron_dataset.pop(4)
    X_train, X_test, y_train, y_test = train_test_split(perceptron_dataset, perceptron_labels, test_size=0.2, random_state=3000)
    return X_train, y_train, X_test, y_test

def load_two_spirals():
    perceptron_dataset = pd.read_csv('twoSpirals.txt', delim_whitespace=True, header=None)
    perceptron_labels = perceptron_dataset.pop(2)
    X_train, X_test, y_train, y_test = train_test_split(perceptron_dataset, perceptron_labels, test_size=0.2, random_state=3000)
    return X_train, y_train, X_test, y_test

In [122]:
sp_X_tr, sp_y_tr, sp_X_te, sp_y_te = load_spambase()
perc_X_train, perc_y_train, perc_X_test, perc_y_test = load_perceptron_data()
spir_X_train, spir_y_train, spir_X_test, spir_y_test = load_two_spirals()

In [114]:
sp_y_tr

3762    0
2866    0
265     1
3671    0
2880    0
       ..
1876    0
949     1
1966    0
3841    0
1016    1
Name: is_spam, Length: 920, dtype: int64

Kernels

In [64]:
def linear_kernel(x1, x2):
    return np.dot(x1, x2)

def gaussian_kernel(x1, x2, gamma=1):
    distance = np.linalg.norm(x1 - x2) ** 2
    return np.exp(-gamma * distance)

Helper Functions

In [127]:
def compare_results(pred_y, y, neg_label = -1):
    df = pd.DataFrame({'preds': pred_y, 'labels': y})
    tp = ((df['labels'] == 1) & (df['preds'] == 1)).sum()
    tn = ((df['labels'] == neg_label) & (df['preds'] == neg_label)).sum()
    fp = ((df['labels'] == neg_label) & (df['preds'] == 1)).sum()
    fn = ((df['labels'] == 1) & (df['preds'] == neg_label)).sum()
    print(f'Accuracy: {(tp + tn) / len(y)}, Error: {(fp + fn) / len(y)}')
    print(f'TPR: {tp / (tp + fn)}, FPR: {fp / (fp + tn)}')
    return

### Problem 1

In [115]:
class KernelDensity:
    def __init__(self, kernel):
        self.kernel = kernel
        return
    
    def train(self, X, y):
        pos_ind = np.where(y == 1)
        neg_ind = np.where(y == 0)
        self.pos_X = X[pos_ind]
        self.neg_X = X[neg_ind]
        return
    
    def _kernel_matrix(self, X, Xi):
        sim = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            sim[i] = self.kernel(X[i], Xi)
        return sim
    
    def predict(self, X):
        preds = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            pos = np.sum(self._kernel_matrix(self.pos_X, X[i])) / self.pos_X.shape[0]
            neg = np.sum(self._kernel_matrix(self.neg_X, X[i])) / self.neg_X.shape[0]
            preds[i] = 1 if pos > neg else 0
        return preds
        
        

In [116]:
kd = KernelDensity(linear_kernel)
kd_data = np.array([[3,3],[2,3],[2,2], [2,4],[-3,-3],[-2,-3],[-2,-2], [-2,-4]])
kd_label = np.array([1,1,1,1,0,0,0,0])
kd.train(kd_data, kd_label)
preds = kd.predict(np.array([[2,2], [5, 5], [-3, -3], [-3, 3]]))
preds

array([1., 1., 0., 1.])

In [128]:
kd_sp = KernelDensity(gaussian_kernel)
kd_sp.train(sp_X_tr, sp_y_tr)
kd_sp_preds = kd_sp.predict(sp_X_te)
compare_results(kd_sp_preds, sp_y_te, 0)

Accuracy: 0.8785656071719642, Error: 0.12143439282803586
TPR: 0.8390177353342428, FPR: 0.09525959367945824


In [118]:
kd_sp_preds

array([0., 0., 1., ..., 1., 0., 0.])

### Problem 2

In [97]:
class KernelPerceptron:
    def __init__(self, kernel, max_iter=200):
        self.kernel = kernel
        self.max_iter = max_iter
        return
    
    def train(self, X, y):
        self.X = X
        self.y = y
        self.alphas = np.zeros(X.shape[0])
        for iter_i in range(self.max_iter):
            round_mistakes = 0
            for i in range(X.shape[0]):
                pred = np.sign(np.sum(self.alphas * y * self._kernel_matrix(X[i])))
                if pred != y[i]:
                    round_mistakes += 1
                    self.alphas[i] += 1
            print(f'Iteration: {iter_i}, Mistakes: {round_mistakes}')
            if round_mistakes == 0:
                return
        return
    
    def _kernel_matrix(self, Xi):
        sim = np.zeros(self.X.shape[0])
        for i in range(self.X.shape[0]):
            sim[i] = self.kernel(self.X[i], Xi)
        return sim
                    
    def predict(self, X):
        preds = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            preds[i] = np.sign(np.sum(self.alphas * self.y * self._kernel_matrix(X[i])))
        return preds

In [98]:
test = KernelPerceptron(gaussian_kernel)
test.train(perc_X_train.to_numpy(), perc_y_train.to_numpy())
preds = test.predict(perc_X_test.to_numpy())
compare_results(preds, perc_y_test.to_numpy())

Iteration: 0, Mistakes: 75
Iteration: 1, Mistakes: 29
Iteration: 2, Mistakes: 9
Iteration: 3, Mistakes: 12
Iteration: 4, Mistakes: 15
Iteration: 5, Mistakes: 4
Iteration: 6, Mistakes: 0
Accuracy: 0.995, Error: 0.005
TPR: 1.0, FPR: 0.008928571428571428


In [91]:
test2 = KernelPerceptron(gaussian_kernel)
test2.train(spir_X_train.to_numpy(), spir_y_train.to_numpy())
preds2 = test2.predict(spir_X_test.to_numpy())
compare_results(preds2, spir_y_test.to_numpy())

Iteration: 0, Mistakes: 37
Iteration: 1, Mistakes: 0
Accuracy: 0.99, Error: 0.01
TPR: 1.0, FPR: 0.020202020202020204
