Loïc Herbelot

# TP 2 : SVM

## Question 1) Classification linéaire des données `iris`

In [11]:
# -*- coding: utf-8 -*-
"""
@author: Loïc Herbelot
"""

from sklearn import datasets
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()

# The iris data set holds petal and sepal measurements for 3 different types
# of irises (Setosa, Versicolour and Virginica), stored in a 150x4
# numpy.ndarray. The rows are the samples and the columns are:
# Sepal Length, Sepal Width, Petal Length and Petal Width.

# Fixed seed so that the train/test split -- and therefore the scores printed
# by the following cells -- are identical on every "Restart & Run All".
RANDOM_STATE = 42

X = iris.data
y = iris.target

# We only want classes 1 & 2, and consider only the first 2 features.
X = X[y != 0, :2]
y = y[y != 0]

# No manual shuffling is needed here: train_test_split shuffles the samples
# by default before splitting them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=RANDOM_STATE)

In [12]:
# Train a support vector classifier with a linear kernel.
# fit() returns the estimator itself, so construction and training can be chained.
clf_lin = SVC(kernel='linear').fit(X_train, y_train)

# Labels predicted for the held-out half of the data
y_pred = clf_lin.predict(X_test)

# Score of the classifier on the test set
score = clf_lin.score(X_test, y_test)
print('Score with linear kernel: %s' % (score,))

Score with linear kernel: 0.72


## Question 2) Classification polynomiale 

In [13]:
# Train a support vector classifier with a polynomial kernel
# (same train/test split as for the linear kernel).
# fit() returns the estimator itself, so construction and training can be chained.
clf_poly = SVC(kernel='poly').fit(X_train, y_train)

# Labels predicted for the held-out half of the data
y_pred = clf_poly.predict(X_test)

# Score of the classifier on the test set
score = clf_poly.score(X_test, y_test)
print('Score with polynomial kernel: %s' % (score,))

Score with polynomial kernel: 0.7


## Question 3) Réécriture du problème primal :

Dans le problème primal, on a les contraintes :

$\xi_i \ge 0$ et $\xi_i \ge 1 - y_i(w \cdot \Phi(x_i) + w_0)$

Ainsi $\xi_i \ge \max(0, 1 - y_i(w \cdot \Phi(x_i) + w_0))$

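Comme l'objectif du problème primal est croissant en chaque $\xi_i$ (terme $C \sum_i \xi_i$ avec $C > 0$), chaque $\xi_i$ prend à l'optimum la plus petite valeur admissible, donc $\xi_i = [1 - y_i(w \cdot \Phi(x_i) + w_0)]_+$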

D'où la réécriture du problème primal.

## Question 4) Explication du SVM

Si après avoir trouvé le vecteur $w$, on a une erreur de prédiction sur le point $x_i$, alors la marge qui vaut $marge_i = y_i(w \cdot \Phi(x_i) + w_0)$ est négative, ainsi $\xi_i = 1 - marge_i \ge 1$.

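Sinon, si la prédiction est correcte, la marge est positive : on a $0 < \xi_i < 1$ tant que $0 < marge_i < 1$ (point bien classé mais situé à l'intérieur de la marge), et $\xi_i = 0$ dès que $marge_i \ge 1$.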

Ainsi il nous faudrait une fonction qui représente l'erreur de classification, qui vaut au moins 1 quand la marge est négative (cas d'erreur), et 0 quand la marge est suffisamment grande (prédiction correcte avec $marge_i \ge 1$).

Pour que les calculs soient plus pratiques, cette fonction est convexe.

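La fonction charnière (*hinge*) $z \mapsto [1 - z]_+$ correspond aux caractéristiques voulues, et le SVM cherche à minimiser la somme de ses valeurs sur les points d'apprentissage (à laquelle s'ajoute le terme de régularisation).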

## SVM GUI avec des classes déséquilibrées :

### Avec $C=1$
![title](svm_gui1.png)

### Avec $C=0.001$
![title](svm_gui001.png)
### Avec $C=0.00001$
![title](svm_gui00001.png)

On voit que lorsque le paramètre de régularisation $C$ devient très faible, la classe la plus représentée "écrase" l'autre classe.

## Question 5) Classification de visages, influence du paramètre de régularisation

In [58]:
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from os import listdir
import cv2

# Root folder of the face images; adjust it if the data set lives elsewhere.
faces_path = "lfw/"

# Entries found directly under the image folder.
# NOTE: listdir() returns them in arbitrary order, so positional indexing
# into this list is not guaranteed to be stable from one machine to another.
available_names = listdir(faces_path)
print("Available names: %s" % (available_names,))

Available names: ['George W Bush', 'Hugo Chavez', 'Donald Rumsfeld', 'Gerhard Schroeder', 'Ariel Sharon', 'Colin Powell', 'Tony Blair']


In [78]:
# Fixed seed so that the train/test split is identical on every run.
RANDOM_STATE = 42

# The two people to tell apart
names = [available_names[0], available_names[2]]

# Selecting all the images we can.
# listdir() returns entries in arbitrary order; sorting makes the subset kept
# below (the first `min_files` images of each person) deterministic.
images0 = [faces_path + names[0] + "/" + f
           for f in sorted(listdir(faces_path + names[0]))]
images1 = [faces_path + names[1] + "/" + f
           for f in sorted(listdir(faces_path + names[1]))]

# Selecting the same number of images for each person:
min_files = min(len(images0), len(images1))
images0 = images0[:min_files]
images1 = images1[:min_files]

filenames = np.array(images0 + images1)
# Labels: -1 for the first person, +1 for the second one
y = np.array(min_files * [-1] + min_files * [+1])

X = []
for f in filenames:
    img = cv2.imread(f, cv2.IMREAD_GRAYSCALE)  # matrix of grey values
    if img is None:
        # cv2.imread does not raise on an unreadable file, it returns None;
        # fail here with the offending path instead of an opaque TypeError.
        raise IOError("Could not read image file: %s" % f)
    # One flat feature vector (all the pixels) per image
    X.append(img.ravel())
X = np.array(X)

# No manual shuffling is needed: train_test_split shuffles by default and
# keeps X, y and filenames aligned.
X_train, X_test, y_train, y_test, filenames_train, filenames_test = train_test_split(
    X, y, filenames, test_size=0.25, random_state=RANDOM_STATE)

# Grid search over the regularisation parameter C, linear kernel only
clf2 = GridSearchCV(SVC(C=1), {'C': np.logspace(-5, 5, num=10), 'kernel': ['linear']})
clf2.fit(X_train, y_train)



GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kernel': ['linear'], 'C': array([  1.00000e-05,   1.29155e-04,   1.66810e-03,   2.15443e-02,
         2.78256e-01,   3.59381e+00,   4.64159e+01,   5.99484e+02,
         7.74264e+03,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [80]:
# Inspect the grid search fitted in the previous cell, which is stored in
# `clf2` (no variable named `clf` is defined in this notebook).
# .items() and print(...) with a single argument behave the same on
# Python 2 and Python 3.
for k, v in clf2.cv_results_.items():
    print("%s: %s\n" % (k, v))
print(clf2.cv_results_['mean_test_score'])
print(clf2.best_params_)

std_train_score: [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]

rank_test_score: [1 1 1 1 1 1 1 1 1 1]

mean_score_time: [ 0.07373754  0.07249475  0.07292231  0.07512967  0.07411075  0.0729243
  0.07220523  0.07329901  0.07298795  0.07207545]

std_test_score: [ 0.01900312  0.01900312  0.01900312  0.01900312  0.01900312  0.01900312
  0.01900312  0.01900312  0.01900312  0.01900312]

split1_train_score: [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]

split0_test_score: [ 0.68852459  0.68852459  0.68852459  0.68852459  0.68852459  0.68852459
  0.68852459  0.68852459  0.68852459  0.68852459]

mean_test_score: [ 0.70718232  0.70718232  0.70718232  0.70718232  0.70718232  0.70718232
  0.70718232  0.70718232  0.70718232  0.70718232]

param_C: [1.0000000000000001e-05 0.00012915496650148841 0.0016681005372000592
 0.021544346900318846 0.27825594022071259 3.5938136638046259
 46.415888336127821 599.48425031894214 7742.6368268112774 100000.0]

split2_train_score: [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]

sp