# Wykład 11

## Machine Learning cd.

### Czytanie i Pisanie

### Plików Excela

In [None]:
import os
# Show the current working directory; the relative Excel file names used in
# the following cells are resolved against it.
os.getcwd()

In [None]:
import pandas as pd

# Small single-column frame used as sample data for the Excel export.
df = pd.DataFrame({'Data': [10, 20, 30, 20, 15, 30, 45]})

# Use the writer as a context manager: the workbook is saved and the file
# handle is closed on exit, even if to_excel() raises. The explicit
# ExcelWriter.save() call was deprecated in pandas 1.5 and removed in 2.0,
# so it no longer works on current pandas.
with pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1')

In [None]:
import pandas as pd

# Workbook to read, given relative to the current working directory.
path = 'Book1.xlsx'

# Open the workbook once so that its sheets can be listed and parsed
# individually in the following cells.
xl = pd.ExcelFile(path)

In [None]:
# Names of the worksheets contained in the opened workbook.
xl.sheet_names

In [None]:
# Read the worksheet named 'Sheet1' into a DataFrame.
df1 = xl.parse('Sheet1')

In [None]:
# Display the parsed worksheet.
df1

# Support Vector Machines

## Usages

- classification
- regression
- outlier detection

## SVC Flavors in Scikit

- SVC - many kernels to choose from. L2 error metric for regularization (strong penalization of big factors, sensitive to outliers)
- NuSVC - slightly different implementation - worth checking against SVC
- LinearSVC - only linear kernel, more settings; supports both L1 and L2 regularization penalties (L2 by default)
- OneClassSVM - good for detecting outliers

## What are they good for

- Effective in high dimensional spaces.
- Still effective in cases where number of dimensions is greater than the number of samples.
- Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
- Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.

## Complexity

Between
\begin{equation}
O(n_{features} \times n_{samples}^2)
\end{equation}
and
\begin{equation}
O(n_{features} \times n_{samples}^3)
\end{equation}

In [None]:
from sklearn import datasets

# Load scikit-learn's bundled digits dataset and show the shape of its
# feature matrix.
digits = datasets.load_digits()
digits.data.shape

In [None]:
from sklearn import svm

# Train on every sample except the last one.
X, y = digits.data[:-1], digits.target[:-1]

# Support vector classifier with a linear kernel and penalty parameter C=100.
clf = svm.SVC(kernel="linear", C=100.0)
clf.fit(X, y)

In [None]:
# Show the hyper-parameters the classifier is configured with.
clf.get_params()

In [None]:
# Weights assigned to the features; scikit-learn exposes coef_ only when the
# kernel is linear, as it is for this classifier.
clf.coef_

Shape of `coef_`: `[n_class * (n_class - 1) / 2, n_features]` (one row per pair of classes, because SVC trains one-vs-one classifiers).

In [None]:
# Check the actual shape of the fitted coefficient matrix.
clf.coef_.shape

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

# Load scikit-learn's bundled iris dataset.
iris = datasets.load_iris()

# Shapes of the feature matrix and of the label vector.
iris.data.shape, iris.target.shape

In [None]:
%matplotlib notebook

import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features. We could
                      # avoid this ugly slicing by using a two-dim dataset
y = iris.target

h = .02  # step size in the mesh

# we create an instance of SVM and fit out data. We do not scale our
# data since we want to plot the support vectors
C = 1.0  # SVM regularization parameter
svc = svm.SVC(kernel='linear', C=C).fit(X, y)
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, y)
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, y)
lin_svc = svm.LinearSVC(C=C).fit(X, y)

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# title for the plots
titles = ['SVC with linear kernel',
          'LinearSVC (linear kernel)',
          'SVC with RBF kernel',
          'SVC with polynomial (degree 3) kernel']


for i, clf in enumerate((svc, lin_svc, rbf_svc, poly_svc)):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, m_max]x[y_min, y_max].
    plt.subplot(2, 2, i + 1)
    plt.subplots_adjust(wspace=0.8, hspace=0.8)

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.6)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.title(titles[i])

plt.show()

## Kernels

- linear (use linear SVC if not grid-searching kernel)
- RBF - default - Radial-basis function kernel (aka squared-exponential kernel).
$$ k(x_i, x_j) = \exp\left(-\frac{1}{2} \, d\left(\frac{x_i}{lengthscale}, \frac{x_j}{lengthscale}\right)^2\right) $$
- poly
- sigmoid
$$ K(X, Y) = \tanh(\gamma \, \langle X, Y \rangle + coef_0) $$
- precomputed, callable

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

x = np.linspace(-20, 20, 100)
plt.title('Logistic function')
plt.plot(x, np.tanh(x))
print()

## Parameters

- C - Penalty parameter for error
- kernel
- gamma - kernel coefficient for rbf, poly, sigmoid  
  - if gamma='scale' is passed then it uses 1 / (n_features * X.var())
- class_weight : {dict, ‘balanced’}

## C - Penalty & Regularization

![title](img/over.png)

## Kernel

![title](img/kernel.png)

## Weighting classes

![title](img/unbalanced.png)

In [None]:
from __future__ import print_function

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

digits = datasets.load_digits()

# Flatten every image into a single feature vector per sample.
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Hold out half of the data; the grid search below is fitted on the
# training half only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Hyper-parameter grids to explore: one for the RBF kernel (gamma and C)
# and one for the linear kernel (C only).
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# Metrics to optimize; each one is used in its macro-averaged form.
scores = ['precision', 'recall', 'f1']

for score in scores:
    print()
    print("# Tuning hyper-parameters for %s" % score)

    # 5-fold cross-validated search over both grids for the current metric.
    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")

    # One line per parameter combination: mean score and twice its
    # standard deviation across the folds.
    cv_results = clf.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    for mean, std, params in zip(means, stds, cv_results['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
  


In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score

# sklearn.metrics.SCORERS was deprecated in scikit-learn 1.1 and removed in
# 1.3. On newer versions, rebuild a dict keyed by the scorer names so that
# SCORERS.keys() still lists every available scoring string.
try:
    from sklearn.metrics import SCORERS
except ImportError:
    from sklearn.metrics import get_scorer_names
    SCORERS = dict.fromkeys(get_scorer_names())

$$F_1 = \frac{2 \times precision \times recall}{precision + recall} $$

In [None]:
# Toy binary labels: comparing them gives 2 true positives, 1 false positive
# and 2 false negatives for the positive class (1).
y_true = [0, 1, 0, 1, 0, 1, 0, 1]
y_pred = [0, 1, 0, 0, 0, 0, 1, 1]

# precision = TP / (TP + FP) = 2 / 3
precision_score(y_true, y_pred)

In [None]:
# recall = TP / (TP + FN) = 2 / 4 = 0.5
recall_score(y_true, y_pred)


In [None]:
# F1 = 2 * precision * recall / (precision + recall) = 4 / 7 ≈ 0.571
f1_score(y_true, y_pred)

In [None]:
# List the names of the predefined scorers, i.e. the strings accepted by the
# `scoring=` argument used in the grid search.
SCORERS.keys()