# Scikit Learn - SVM

- - -

In [None]:
import pandas as pd
import warnings
# Suppress every warning for the rest of the session so cell output stays clean.
warnings.filterwarnings(action="ignore")

- - -

In [None]:
# First attempt: read the CSV with pandas' default (comma) separator and preview it.
# NOTE(review): the following cell reloads the same URL with sep=";", so this
# call presumably collapses each row into a single column - confirm in the output.
wine = pd.read_csv("https://raw.githubusercontent.com/4data-lab/datasets/master/redwine.csv")
wine.head()

In [None]:
# Reload the dataset, this time declaring ';' as the field separator,
# and show the first rows.
WINE_URL = "https://raw.githubusercontent.com/4data-lab/datasets/master/redwine.csv"
wine = pd.read_csv(WINE_URL, sep=";")
wine.head()

In [None]:
# (rows, columns) of the loaded DataFrame.
wine.shape

- - -
*Contenido del dataset*:

0. fixed acidity
1. volatile acidity
2. citric acid
3. residual sugar
4. chlorides
5. free sulfur dioxide
6. total sulfur dioxide
7. density
8. pH
9. sulphates
10. alcohol


11. quality (valoración entre 0 y 10)
- - -

In [None]:
# Are there duplicated rows in the dataset?
n_duplicates = wine.duplicated().sum()
print(f"Duplicados: {n_duplicates}")
# NOTE(review): the original author attributes the duplicates to different wine
# tasters giving the same score to similar wines - not verifiable from this file.

In [None]:
# Feature matrix: every column except the target "quality".
X = wine.drop(columns=["quality"])

In [None]:
# Target vector: the "quality" column.
y = wine["quality"]

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Number of rows per distinct quality value (class distribution of the target).
y.value_counts()

- - -

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions of the target in both partitions; the seed is fixed at 7.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=7,
)

- - -

In [None]:
from sklearn.svm import SVC

In [None]:
# One-vs-one classifier: a support vector classifier with an RBF kernel.
# NOTE(review): per the scikit-learn docs SVC handles multiclass targets with a
# one-vs-one scheme internally - confirm against the installed version.
SVM = SVC(kernel = 'rbf')

In [None]:
# Train on the current (unscaled) training partition, then predict both
# partitions. fit() returns the estimator itself, so the calls can be chained.
y_train_pred = SVM.fit(X_train, y_train).predict(X_train)
y_test_pred = SVM.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Report accuracy of the stored test predictions against the true test labels.
print("Accuracy - Datos de test")
accuracy_score(y_test, y_test_pred)

In [None]:
# What if we scale the data?
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
sc=RobustScaler()
# Fits the scaler on the full feature matrix X and stores the scaled array in `data`.
data=sc.fit_transform(X)

In [None]:
# By default SVC classifies in one-vs-one mode.
# Split the raw features first and fit the scaler on the training rows only:
# fitting it on the whole dataset before splitting lets statistics of the test
# rows leak into the preprocessing and makes the test score optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.30, random_state=8)
sc = RobustScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)  # reuse the training statistics; never refit on test data
SVM.fit(X_train, y_train)
y_train_pred = SVM.predict(X_train)
y_test_pred = SVM.predict(X_test)
print("Accuracy - Datos de test")
accuracy_score(y_test, y_test_pred)

In [None]:
# One-vs-all (one-vs-rest) classifier: wraps a default SVC in OneVsRestClassifier.
from sklearn.multiclass import OneVsRestClassifier

clf = OneVsRestClassifier(SVC())
clf.fit(X_train, y_train)

# Predict both partitions and report accuracy on the test one.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)
print("Accuracy - Datos de test")
accuracy_score(y_test, y_test_pred)

- - -

In [None]:
#Probaremos con diferentes random state... (1, 2, 3, 4, 5)

- - -

### Cross Validation

RECORDAD: en función de cómo se ha realizado la separación de datos de test y train, el resultado de nuestra métrica puede variar ya que el contenido de train y test varía.

Para hacer más consistentes nuestros resultados, DEBEMOS realizar distintas separaciones de los datos y evaluarlas.



<img src=https://i.ibb.co/5rvpntt/06.png width="700">

- - -

<img src=https://i.ibb.co/J3scnkf/07.jpg width="700">

- - -

Ventajas:
* Nos permite tener una métrica de estimación del error mucho más estable.  
* Hace un uso más eficiente de los datos, ya que se usan todas las observaciones tanto en train como en test.

Desventajas:
* Es un concepto más complejo que la separación simple de train_test_split.
* Los cálculos tardarán K veces más tiempo.
- - -

### Validación

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Accuracy of SVM on each of 5 cross-validation folds of the training partition.
cross_val_score(SVM, X_train, y_train, cv=5, scoring="accuracy")

In [None]:
# Mean accuracy across the 5 cross-validation folds.
fold_scores = cross_val_score(SVM, X_train, y_train, cv=5, scoring="accuracy")
fold_scores.mean()

In [None]:
# Report the mean 5-fold cross-validation accuracy as the validation score.
print("Accuracy - Datos de validation")
validation_scores = cross_val_score(SVM, X_train, y_train, cv=5, scoring="accuracy")
validation_scores.mean()

In [None]:
print("Accuracy - Datos de test")
# Score the same estimator that was cross-validated above. y_test_pred was last
# overwritten by the OneVsRestClassifier cell, so reusing it here would report
# the test accuracy of a different model than the one being validated.
accuracy_score(y_test, SVM.predict(X_test))

### Pipeline

In [None]:
#Todo lo que hemos hecho, lo podríamos haber realizado con un pipeline.
from sklearn.pipeline import Pipeline

In [None]:
# Two-step pipeline: standardize the features, then classify with a default SVC.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC()),
    ]
)

In [None]:
# Fit scaler + classifier in one call, then report accuracy on the test partition.
# The bare `Pipeline(steps=...)` line that used to sit between these two
# statements was the pasted repr of fit()'s return value; it only built and
# discarded an unused object, so it has been removed. Assigning the return
# value to `_` keeps the cell from echoing anything but the score.
_ = pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

In [None]:
# We can even build one pipeline per classifier and compare them.
from sklearn.neighbors import KNeighborsClassifier

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf"),
    SVC(kernel="linear"),
    SVC(kernel="poly"),
]

for classifier in classifiers:
    # Same preprocessing for every candidate: standardize, then classify.
    pipeline = Pipeline([('scaler', StandardScaler()), ('clf', classifier)])
    pipeline.fit(X_train, y_train)
    test_accuracy = pipeline.score(X_test, y_test)
    print(classifier)
    print(f"model score: {test_accuracy:.3f}")

In [None]:
# Or several classifiers combined with several scalers.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf"),
    SVC(kernel="linear"),
    SVC(kernel="poly"),
]
scalers = [
    StandardScaler(),
    MinMaxScaler(),
    RobustScaler(),
]

for classifier in classifiers:
    for scaler in scalers:
        pipeline = Pipeline([('scaler', scaler), ('clf', classifier)])
        pipeline.fit(X_train, y_train)
        # Print the scaler next to the classifier: each classifier is evaluated
        # three times, and without it the results cannot be told apart.
        print(classifier)
        print(scaler)
        print("model score: %.3f" % pipeline.score(X_test, y_test))
        # cross_val_score refits a clone of the whole pipeline on every fold,
        # so the scaler is fitted only on that fold's training rows.
        scores = cross_val_score(pipeline, X_train, y_train, cv=5)
        print("cv score:" + str(scores))
        print('---')

- - -