# Линейный SVM "своими руками"

## Генерируем обучающую и тестовую выборку для экспериментов

In [1]:
from sklearn.model_selection import train_test_split
from sklearn import datasets
X, y = datasets.make_classification(n_samples = 10000, n_features=20, n_classes=2, n_informative=20,
                                    n_redundant=0, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



print(len(X), len(y))
print(len(X_train))

(10000, 10000)
8000


## Пишем свой класс для SVM

In [8]:
import numpy as np
from random import randint
import random


np.random.seed(42)
random.seed(42)


class MySVM(object):
    def __init__(self, C=10000):
        self.C = C # regularization constant

    # f(x) = <w,x> + w_0
    def f(self, x):
        return np.dot(self.w, x) + self.w0

    # a(x) = [f(x) > 0]
    def a(self, x):
        return 1 if self.f(x) > 0 else 0
    
    # predicting answers for X_test
    def predict(self, X_test):
        return np.array([self.a(x) for x in X_test])

    # l2-regularizator
    def reg(self):
        return 1.0 * sum(self.w ** 2) / (2.0 * self.C)

    # l2-regularizator derivative
    def der_reg(self):
        return sum(self.w)/ self.C

    # hinge loss
    def loss(self, x, answer):
        return max([0, 1 - answer * self.f(x)])

    # hinge loss derivative
    def der_loss(self, x, answer):
        return -1.0 if 1.0 -answer *self.f(x) < 0 else 0.0

    # fitting w and w_0 with SGD
    def fit(self, X_train, y_train):
        dim = len(X_train[0])
        self.w = np.random.rand(dim) # initial value for w
        self.w0 = np.random.randn() # initial value for w_0
        
        # 10000 steps is OK for this example
        # another variant is to continue iterations while error is still decreasing
        for k in range(10000):  
            
            # random example choise
            rand_index = randint(0, len(X_train) - 1) # generating random index
            x = X_train[rand_index]
            y = y_train[rand_index]

            # simple heuristic for step size
            step = 0.5 * 0.9 ** k

            # w update
            self.w -= x * y * step * self.der_loss(x,y)
            
            # w_0 update
            self.w0 -= y * step * self.der_loss(x,y)

## Пробуем обучить наш классификатор и посмотреть на качество на тесте

In [9]:
model = MySVM()
model.fit(X_train, y_train)
print(model.w, model.w0)

(array([ 0.67614757,  2.96613763,  3.73202392,  3.92121607,  3.04131463,
       -1.69695767, -0.86832637,  2.91794091, -1.44086413, -2.64641404,
        1.37673745,  2.35012568,  2.01489154,  0.21162843, -0.55428878,
       -1.07392431,  2.62561796,  2.58878457,  1.19517222, -0.02454274]), 0.29034424637438372)


In [10]:
predictions = model.predict(X_test)

In [11]:
print(predictions)

[0 0 1 ..., 0 1 1]


In [12]:
print(y_test, len(y_test), sum(y_test))

(array([1, 0, 1, ..., 1, 0, 1]), 2000, 991)


In [13]:
print(len(predictions), sum(predictions))

(2000, 1257)


In [14]:
print(sum(predictions == y_test) / float(len(y_test)))

0.577


## Задания:

### - Допишите недостающие функции в MySVM (производные и обновление весов)

### - Сравните качество с sklearn LinearSVC

In [20]:
from sklearn.svm import SVC

In [21]:
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
pred = svc.predict(X_test)
print len(pred), sum(pred)

2000 1022


In [29]:
from sklearn.metrics import accuracy_score 

In [31]:
score = accuracy_score(predictions, y_test)

In [32]:
score

0.57699999999999996