In [1]:
from zlib import crc32
import numpy as np

# Task families a student can be assigned to.
types = ['regression', 'classification']

# Candidate datasets per task family; every entry has a display name and a reference URL.
datasets = {
    'regression': [
        dict(name='Servo Data Set',
             url='https://archive.ics.uci.edu/ml/datasets/Servo'),
        dict(name='Forest Fires Data Set',
             url='https://archive.ics.uci.edu/ml/datasets/Forest+Fires'),
        dict(name='Boston Housing Data Set',
             url='https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html#sklearn.datasets.load_boston'),
        dict(name='Diabetes Data Set',
             url='https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes'),
    ],
    'classification': [
        dict(name='Spambase Data Set',
             url='https://archive.ics.uci.edu/ml/datasets/Spambase'),
        dict(name='Wine Data Set',
             url='https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html#sklearn.datasets.load_wine'),
        dict(name='Breast Cancer Data Set',
             url='https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer'),
        dict(name='MNIST',
             url='http://yann.lecun.com/exdb/mnist/'),
    ],
}

# Candidate methods per task family (three of them are drawn for each student).
methods = {
    'regression': [
        'Линейная регрессия',
        'Перцептрон',
        'Надарая-Ватсона',
        'SVR',
    ],
    'classification': [
        'Логистическая регрессия',
        'Перцептрон',
        'k-ближайших соседей',
        'Метод потенциальных функций',
        'Метод Парзеновского окна',
        'SVM',
    ],
}
# Build the assignment deterministically from the student's e-mail address.
task = dict()
# The prompt is passed positionally: the builtin ``input`` accepts no keyword
# arguments, so ``input(prompt=...)`` only works where a kernel wraps it.
task['mail'] = input('Enter your mail: ')
# The CRC32 of the address serves both as the task id and as the NumPy seed.
task['id'] = crc32(task['mail'].encode('utf-8'))
np.random.seed(task['id'])
# The three draws below consume the seeded random stream in this exact order.
task['type'] = np.random.choice(types)
task['dataset'] = np.random.choice(datasets[task['type']])
task['method'] = np.random.choice(
    methods[task['type']], size=3, replace=False).tolist()


task

Enter your mail: dudenko.ei@phystech.edu


{'mail': 'dudenko.ei@phystech.edu',
 'id': 3711759380,
 'type': 'classification',
 'dataset': {'name': 'Breast Cancer Data Set',
  'url': 'https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer'},
 'method': ['SVM', 'Метод потенциальных функций', 'Перцептрон']}

In [3]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any warnings the estimators fitted below may
# emit — confirm that a blanket filter is intended.
warnings.filterwarnings('ignore')

In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from matplotlib import gridspec
from mlxtend.plotting import plot_decision_regions
from scipy.spatial.distance import cdist
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import Perceptron

In [5]:
# Load the breast-cancer data and put the features and the target side by side
# in one frame; the last column is named 'target'.
cancer = load_breast_cancer()
df = pd.DataFrame(
    np.column_stack((cancer['data'], cancer['target'])),
    columns=np.append(cancer['feature_names'], ['target']),
)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [6]:
# Split the frame into integer class labels and the feature matrix.
Y = df['target'].astype(np.int_)
X = df.drop(columns=['target'])

# Scale every feature by its column maximum in a single vectorised step
# (replaces the per-column Python loop; the resulting values are the same).
# NOTE(review): the maxima are taken over the full dataset, i.e. before the
# train/test split done later in the notebook, so test rows affect the scaling.
X = X / X.max()

X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,0.639986,0.264257,0.651459,0.40024,0.724602,0.803706,0.70314,0.731113,0.795724,0.807779,...,0.704218,0.349818,0.734873,0.474612,0.728661,0.629112,0.56861,0.912027,0.69313,0.573012
1,0.731768,0.452393,0.70504,0.530188,0.518605,0.227678,0.203608,0.348757,0.596053,0.581589,...,0.693396,0.472547,0.632166,0.459803,0.556155,0.176371,0.192971,0.639175,0.414281,0.429012
2,0.700462,0.540988,0.689655,0.481008,0.670747,0.462942,0.462512,0.635686,0.680592,0.615661,...,0.653996,0.515341,0.607086,0.40174,0.648697,0.401229,0.359744,0.835052,0.54429,0.422072
3,0.406261,0.518839,0.411565,0.154378,0.872093,0.821946,0.565604,0.522863,0.854276,1.0,...,0.413707,0.534921,0.393591,0.133451,0.942498,0.818809,0.548642,0.88488,1.0,0.833735
4,0.721807,0.365071,0.716711,0.518593,0.613831,0.384482,0.463918,0.51839,0.595066,0.603756,...,0.625416,0.336496,0.605892,0.37024,0.617251,0.193762,0.319489,0.558419,0.356131,0.370024


In [7]:
# Report how many objects the full dataset contains.
message = 'Размер выборки составляет l={} объектов.'
print(message.format(len(df)))


Размер выборки составляет l=569 объектов.


In [8]:
# Keep 70% of the objects for training; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    random_state=5,
)

## Метод SVM 

In [9]:
# Coarse search over the kernel family and its main hyper-parameters.
param_grid1 = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
    # `degree` is the order of the polynomial kernel and must be a non-negative
    # integer, so the candidates are integer orders rather than fractions.
    {'C': [1, 10, 100, 1000], 'degree': [1, 2, 3, 4, 5], 'kernel': ['poly']},
    {'C': [1, 10, 100, 1000], 'kernel': ['sigmoid']},
]

svc = svm.SVC()
grid = GridSearchCV(svc, param_grid1)
grid.fit(X_train, Y_train)
print(grid.best_params_)

# Evaluate the selected configuration on the hold-out split.
grid_predictions = grid.predict(X_test)
print(classification_report(Y_test, grid_predictions))

{'C': 10, 'gamma': 1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        61
           1       0.98      1.00      0.99       110

    accuracy                           0.99       171
   macro avg       0.99      0.98      0.99       171
weighted avg       0.99      0.99      0.99       171



В SVC по умолчанию используется l2-регуляризация. Тип регуляризации (l1 или l2) можно подобрать для линейного ядра с помощью LinearSVC.
Лучшим оказалось ядро rbf (C=10, gamma=1), вторым — linear (C=10).

### rbf kernel

In [10]:
# Finer search around the best RBF configuration found by the coarse grid.
param_grid2 = [
    {
        'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15],
        # 0.7 was previously written as `0,7`, which put the two separate
        # candidates 0 and 7 into the list instead of 0.7.
        'gamma': [0.2, 0.4, 0.5, 0.6, 0.7, 0.9, 1, 1.1, 1.2],
        'kernel': ['rbf'],
    }
]

svc = svm.SVC()
grid = GridSearchCV(svc, param_grid2)
grid.fit(X_train, Y_train)
print(grid.best_params_)

{'C': 4, 'gamma': 0.9, 'kernel': 'rbf'}


In [11]:
# Refit the RBF model with the C and gamma chosen by the preceding grid search
# rather than hand-copied numbers, so the two cells cannot drift apart.
best_rbf = grid.best_params_
model = svm.SVC(kernel='rbf', C=best_rbf['C'], gamma=best_rbf['gamma'])
_ = model.fit(X_train, Y_train)
Y_test_pred = model.predict(X_test)

print("Accuracy: ", model.score(X_test, Y_test))
# The MSE is computed on the predicted class labels, not on continuous scores.
print("Mse:", mean_squared_error(Y_test, Y_test_pred))

Accuracy:  0.9883040935672515
Mse: 0.011695906432748537


### linear kernel

In [12]:
# Candidate regularisation strengths for the l2-penalised linear SVM.
param_grid3 = [dict(C=[1, 4, 5, 10, 15, 20, 25, 30, 35, 40, 50])]

svc = svm.LinearSVC(penalty='l2')
grid = GridSearchCV(estimator=svc, param_grid=param_grid3, scoring='accuracy')
grid.fit(X_train, Y_train)
print(grid.best_params_)

{'C': 35}


In [13]:
# Compare the cross-validated C with C=1 for the l2 penalty on the hold-out split.
C = grid.best_params_['C']

# --- l2 penalty, cross-validated C ---
model = svm.LinearSVC(C=C, penalty='l2')
_ = model.fit(X_train, Y_train)
Y_test_pred = model.predict(X_test)
tuned_mse = mean_squared_error(Y_test, Y_test_pred)
tuned_accuracy = model.score(X_test, Y_test)
print("Mse l2, C=", C, ":", tuned_mse)
print("Accuracy l2, C=", C, ":", tuned_accuracy, "\n")

# --- l2 penalty, C=1 ---
model = svm.LinearSVC(C=1.0, penalty='l2')
_ = model.fit(X_train, Y_train)
Y_test_pred = model.predict(X_test)
baseline_mse = mean_squared_error(Y_test, Y_test_pred)
baseline_accuracy = model.score(X_test, Y_test)
print("Mse l2, C=1:", baseline_mse)
print("Accuracy l2, C=1: ", baseline_accuracy)

Mse l2, C= 35 : 0.017543859649122806
Accuracy l2, C= 35 : 0.9824561403508771 

Mse l2, C=1: 0.011695906432748537
Accuracy l2, C=1:  0.9883040935672515


In [14]:
# Candidate regularisation strengths for the l1-penalised linear SVM.
param_grid4 = [dict(C=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20])]

# The l1 penalty is fitted in the primal formulation (dual=False).
svc = svm.LinearSVC(penalty='l1', dual=False)
grid = GridSearchCV(estimator=svc, param_grid=param_grid4, scoring='accuracy')
grid.fit(X_train, Y_train)
print(grid.best_params_)

{'C': 5}


In [15]:
# Compare C=1 with the cross-validated C for the l1 penalty on the hold-out split.
C = grid.best_params_['C']

# --- l1 penalty, C=1 ---
model = svm.LinearSVC(C=1, penalty='l1', dual=False)
_ = model.fit(X_train, Y_train)
Y_test_pred = model.predict(X_test)
baseline_mse = mean_squared_error(Y_test, Y_test_pred)
baseline_accuracy = model.score(X_test, Y_test)
print("Mse l1, C=1:", baseline_mse)
print("Accuracy l1, C=1: ", baseline_accuracy, "\n")

# --- l1 penalty, cross-validated C ---
model = svm.LinearSVC(C=C, penalty='l1', dual=False)
_ = model.fit(X_train, Y_train)
Y_test_pred = model.predict(X_test)
tuned_mse = mean_squared_error(Y_test, Y_test_pred)
tuned_accuracy = model.score(X_test, Y_test)
print("Mse l1, C=", C, ":", tuned_mse)
print("Accuracy l1, C=", C, ":", tuned_accuracy)

Mse l1, C=1: 0.011695906432748537
Accuracy l1, C=1:  0.9883040935672515 

Mse l1, C= 5 : 0.023391812865497075
Accuracy l1, C= 5 : 0.9766081871345029


### Итог
Наилучший результат, который здесь получился: accuracy = 0.9883040935672515, mse = 0.011695906432748537.

## Метод потенциальных функций

In [16]:
class PFM(object):
    """Two-class potential-function classifier.

    Every training object carries a weight (``gams``). A query object is scored
    for class 0 and class 1 by summing the weighted kernel values of its
    distances to the training objects of that class; the larger sum wins.
    """

    def __init__(self, kernel=None):
        """Store the distance kernel; without one, every distance maps to 1."""
        self.X = None
        self.Y = None
        self.gams = None
        if kernel is None:
            self.kernel = lambda x: np.ones_like(x)
        else:
            self.kernel = kernel

    def scores(self, X):
        """Return one row per object in ``X`` with the summed potentials of class 0 and class 1."""
        potentials = self.gams * self.kernel(cdist(X, self.X))
        class_totals = []
        for label in (0, 1):
            members = np.where(self.Y == label)[0]
            class_totals.append(np.sum(potentials.T[members].T, axis=-1))
        return np.vstack(class_totals).T

    def predict(self, X):
        """Return, for each object in ``X``, the index of the class with the highest score."""
        class_scores = self.scores(X)
        return np.argmax(class_scores, axis=-1)

    def fit(self, X, Y, epoch=10):
        """Learn the weights.

        For ``epoch`` passes over the training set, the weight of every object
        that the current model misclassifies is increased by one.
        """
        self.X = np.array(X)
        self.Y = np.array(Y)
        self.gams = np.zeros_like(Y)
        for _ in range(epoch):
            for idx, (point, label) in enumerate(zip(self.X, self.Y)):
                predicted = self.predict(np.array([point]))[0]
                if predicted != label:
                    self.gams[idx] += 1

def K(distance, h=0.05):
    """Kernel ``1 - (d/h)**2`` for ``|d/h| <= 1`` and zero outside that window.

    ``distance`` may be a scalar or array-like; ``h`` is the window width.
    """
    scaled = np.array(distance) / h
    inside_window = np.abs(scaled) <= 1
    return (1 - scaled**2) * inside_window

In [17]:
# Sweep the kernel window width h and report the hold-out error for each value.
# NOTE(review): the candidates are compared on the test split itself — confirm
# that this is intended rather than a separate validation split.
for h in [0.05, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 8.5, 9, 9.5, 10, 15, 20]:
    # Bind the current h as a default argument so the model keeps its own
    # width even if it is used after the name `h` has been reassigned.
    model = PFM(kernel=lambda x, h=h: K(x, h=h))
    model.fit(X_train, Y_train)
    Y_test_pred = model.predict(X_test)

    print("Mse h=", h, ":", mean_squared_error(Y_test, Y_test_pred))

Mse h= 0.05 : 0.6432748538011696
Mse h= 0.5 : 0.10526315789473684
Mse h= 1 : 0.08187134502923976
Mse h= 2 : 0.1111111111111111
Mse h= 3 : 0.11695906432748537
Mse h= 4 : 0.1286549707602339
Mse h= 5 : 0.26900584795321636
Mse h= 6 : 0.4093567251461988
Mse h= 7 : 0.49707602339181284
Mse h= 8 : 0.017543859649122806
Mse h= 8.5 : 0.017543859649122806
Mse h= 9 : 0.017543859649122806
Mse h= 9.5 : 0.017543859649122806
Mse h= 10 : 0.029239766081871343
Mse h= 15 : 0.029239766081871343
Mse h= 20 : 0.029239766081871343


### Итог
Наилучший результат, который получился для данного метода: mse = 0.017543859649122806 при параметре h от 8 до 9.5.
Среднеквадратичная ошибка получилась больше, чем у SVM и перцептрона.

## Метод перцептрон

In [19]:
# Coarse search over the perceptron learning rate and iteration budget.
param_grid5 = [
    {
        'eta0': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
        # Each budget appears once; a repeated value would only make the
        # search fit the same configuration twice.
        'max_iter': [10, 100, 1000, 10000],
    }
]

model = Perceptron()
grid = GridSearchCV(model, param_grid5, scoring='accuracy')
grid.fit(X_train, Y_train)
print(grid.best_params_)

{'eta0': 0.1, 'max_iter': 100}


In [27]:
# Finer search around the best configuration of the coarse perceptron grid.
param_grid6 = [
    {
        # Ascending learning rates around 0.1; the fourth value is 0.09 (it was
        # written as 0.9, out of order between 0.07 and 0.1).
        'eta0': [0.01, 0.05, 0.07, 0.09, 0.1, 0.11, 0.12],
        'max_iter': [10, 15, 20, 50, 100, 150, 200, 500, 700, 900],
    }
]

model = Perceptron()
grid = GridSearchCV(model, param_grid6, scoring='accuracy')
grid.fit(X_train, Y_train)
print(grid.best_params_)

{'eta0': 0.1, 'max_iter': 15}


In [31]:
# Reuse the learning rate and iteration budget chosen by the grid search and
# compare the three penalty types on the hold-out split.
eta0 = grid.best_params_['eta0']
max_iter = grid.best_params_['max_iter']

for penalty in ("l1", "l2", "elasticnet"):
    model = Perceptron(eta0=eta0, max_iter=max_iter, penalty=penalty)
    _ = model.fit(X_train, Y_train)
    Y_test_pred = model.predict(X_test)

    print("Mse {}:".format(penalty), mean_squared_error(Y_test, Y_test_pred))
    print("Accuracy {}: ".format(penalty), model.score(X_test, Y_test), "\n")

Mse l1: 0.011695906432748537
Accuracy l1:  0.9883040935672515 

Mse l2: 0.04678362573099415
Accuracy l2:  0.9532163742690059 

Mse elasticnet: 0.04678362573099415
Accuracy elasticnet:  0.9532163742690059 



### Итог
Результат при eta0=0.1, max_iter=15, l1: accuracy = 0.9883040935672515, mse = 0.011695906432748537. Итог такой же, какой получился методом SVM.