# Линейный SVM "своими руками"

## Генерируем обучающую и тестовую выборку для экспериментов

In [19]:
from sklearn.model_selection import train_test_split
from sklearn import datasets

X, y = datasets.make_classification(
    n_samples=10000, n_features=20, 
    n_classes=2, n_informative=20, 
    n_redundant=0,
    random_state=42
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,
    random_state=42
)

print(len(X), len(y))
print(len(X_train))

(10000, 10000)
8000


## Пишем свой класс для SVM

In [20]:
import numpy as np
from random import randint
import random


np.random.seed(42)
random.seed(42)


class MySVM(object):
    def __init__(self, C=10000):
        self.C = C # regularization constant

    # f(x) = <w,x> + w_0
    def f(self, x):
        return np.dot(self.w, x) + self.w0

    # a(x) = [f(x) > 0]
    def a(self, x):
        return 1 if self.f(x) > 0 else 0
    
    # predicting answers for X_test
    def predict(self, X_test):
        return np.array([self.a(x) for x in X_test])

    # l2-regularizator
    def reg(self):
        return 1.0 * sum(self.w ** 2) / (2.0 * self.C)

    # l2-regularizator derivative
    def der_reg(self):
        return sum(self.w) / self.C

    # hinge loss
    def loss(self, x, answer):
        return max([0, 1 - answer * self.f(x)])

    # hinge loss derivative
    def der_loss(self, x, answer):
        if answer * self.f(x) > 1:
            der_w0 = -answer 
            der_w = -answer * x 
            return der_w, der_w0
        else:
            return np.zeros(len(x)), 0

    # fitting w and w_0 with SGD
    def fit(self, X_train, y_train):
        dim = len(X_train[0])
        self.w = np.random.rand(dim) # initial value for w
        self.w0 = np.random.randn() # initial value for w_0
        
        # 10000 steps is OK for this example
        # another variant is to continue iterations while error is still decreasing
        for k in range(10000):  
            
            # random example choise
            rand_index = randint(0, len(X_train) - 1) # generating random index
            x = X_train[rand_index]
            y = y_train[rand_index]

            # simple heuristic for step size
            step = 0.5 * 0.9 ** k

            der_w, der_w0 = self.der_loss(x, y)
            # w update
            self.w -= step * (der_w + self.der_reg())
            
            # w_0 update
            self.w0 -= step * der_w0

## Пробуем обучить наш классификатор и посмотреть на качество на тесте

In [21]:
model = MySVM()
model.fit(X_train, y_train)
print(model.w, model.w0)

(array([ 0.66966006,  2.95965012,  3.72553641,  3.91472856,  3.03482712,
       -1.70344518, -0.87481388,  2.9114534 , -1.44735164, -2.65290156,
        1.37024994,  2.34363817,  2.00840403,  0.20514092, -0.56077629,
       -1.08041182,  2.61913045,  2.58229706,  1.18868471, -0.03103025]), 0.2903442463743873)


In [22]:
predictions = model.predict(X_train)

In [23]:
print(classification_report(y_train, predictions))

             precision    recall  f1-score   support

          0       0.60      0.47      0.53      3997
          1       0.57      0.69      0.62      4003

avg / total       0.59      0.58      0.58      8000



In [24]:
predictions = model.predict(X_test)
print(sum(predictions == y_test) / float(len(y_test)))

0.5765


## Задания:

### - Допишите недостающие функции в MySVM (производные и обновление весов)

### - Сравните качество с sklearn LinearSVC

In [25]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
svc = LinearSVC(C=1)
svc.fit(X_train, y_train);
print(svc.coef_, svc.intercept_)

print(classification_report(y_train, svc.predict(X_train)))

(array([[-0.02969484, -0.05816991,  0.00217611,  0.05200691,  0.01940899,
         0.11378511, -0.00354493,  0.04061167, -0.06001035, -0.05742928,
         0.0613028 ,  0.06506545,  0.10262898,  0.04573795, -0.10370597,
         0.01575548,  0.06734485,  0.07133172, -0.0013408 ,  0.09318223]]), array([-0.01872142]))
             precision    recall  f1-score   support

          0       0.80      0.80      0.80      3997
          1       0.80      0.80      0.80      4003

avg / total       0.80      0.80      0.80      8000



In [None]:
def score(y, y_pred, **kargs):
    recall = recall_score(y, y_pred)
    precision = precision_score(y, y_pred)
    if precision < 1.5 * recall and recall > 0.5:
        return  precision
    return 0