# Algoritmo support vector machine

## Imports

In [1]:
# importamos las librerias y bibliotecas que utilizaremos para el algoritmo de support vector machine
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

## Funciones auxiliares

In [2]:
# Construcción de una función que realice el particionado completo
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    """Split `df` into train (60%), validation (20%) and test (20%) subsets.

    The data is first split 60/40, then the 40% hold-out is split in half.

    Args:
        df: Data to split; indexed with `df[stratify]` when `stratify` is set.
        rstate: Value forwarded as `random_state` to both splits.
        shuffle: Value forwarded as `shuffle` to both splits.
        stratify: Name of the column to stratify on. When falsy, no
            stratification is applied.

    Returns:
        Tuple `(train_set, val_set, test_set)`.
    """
    shared = {"random_state": rstate, "shuffle": shuffle}

    def labels_of(frame):
        # Stratification labels are re-read from whichever frame is being split.
        return frame[stratify] if stratify else None

    train_set, holdout = train_test_split(
        df, test_size=0.4, stratify=labels_of(df), **shared)
    val_set, test_set = train_test_split(
        holdout, test_size=0.5, stratify=labels_of(holdout), **shared)
    return train_set, val_set, test_set

In [3]:
# Representación gráfica del límite de decisión
def plot_svc_decision_boundary(svm_clf, xmin, xmax):
    """Draw a linear classifier's decision boundary and margins with pyplot.

    Reads `coef_[0]`, `intercept_[0]` and `support_vectors_` from `svm_clf`
    and uses only the first two coefficients, so the classifier is expected
    to have been fitted on two features.

    Args:
        svm_clf: Fitted classifier exposing `coef_`, `intercept_` and
            `support_vectors_`.
        xmin: Left end of the horizontal range to draw over.
        xmax: Right end of the horizontal range to draw over.
    """
    weights, bias = svm_clf.coef_[0], svm_clf.intercept_[0]

    # On the boundary w0*x0 + w1*x1 + b = 0, hence x1 = -w0/w1 * x0 - b/w1.
    xs = np.linspace(xmin, xmax, 200)
    boundary = -weights[0] / weights[1] * xs - bias / weights[1]

    # Vertical offset between the boundary and each margin line.
    offset = 1 / weights[1]

    # Highlight the support vectors, then draw the boundary (solid) and the
    # two margin lines (dashed).
    support = svm_clf.support_vectors_
    plt.scatter(support[:, 0], support[:, 1], s=180, facecolors='#FFAAAA')
    for ys, fmt in ((boundary, "k-"),
                    (boundary + offset, "k--"),
                    (boundary - offset, "k--")):
        plt.plot(xs, ys, fmt, linewidth=2)

## Lectura del conjunto de datos

In [7]:
# Load the dataset from a CSV file (the path is relative to the working directory)
df = pd.read_csv('dataset-equilibrado-1.csv')

## Visualización preliminar de la información

In [8]:
# Preview the first five rows of the loaded data
df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [9]:
# Count how many examples the dataset has for each value of `stroke`
df["stroke"].value_counts()

0    4861
1     783
Name: stroke, dtype: int64

## Primera preparación del conjunto de datos

In [None]:
# One-hot encode the categorical columns
df_new = pd.get_dummies(
    data=df,
    columns=[
        'gender',
        'ever_married',
        'work_type',
        'Residence_type',
        'smoking_status',
    ],
)

In [None]:
# Display the encoded frame to check that the changes were applied
df_new

In [None]:
# Drop columns that are irrelevant and could distort the model.
# - `id`: presumably a record identifier.
# - `Unnamed: 0`: appears in the preview of the loaded frame and looks like a
#   row index that was saved into the CSV — NOTE(review): confirm. If it is a
#   real column it would otherwise be used as a feature.
# errors="ignore" makes the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
df_new = df_new.drop(columns=["id", "Unnamed: 0"], errors="ignore")

In [None]:
# Scatter plot of two features: rows with stroke == 1 as red dots,
# rows with stroke == 0 as yellow crosses
plt.figure(figsize=(12, 6))
plt.scatter(
    df_new.loc[df_new['stroke'] == 1, "age"],
    df_new.loc[df_new['stroke'] == 1, "avg_glucose_level"],
    c="r",
    marker=".",
)
plt.scatter(
    df_new.loc[df_new['stroke'] == 0, "age"],
    df_new.loc[df_new['stroke'] == 0, "avg_glucose_level"],
    c="y",
    marker="x",
)
plt.xlabel("age", fontsize=13)
plt.ylabel("avg_glucose_level", fontsize=13)
plt.show()

## División del conjunto de datos

In [None]:
# Split the data into train / validation / test subsets, stratified on `stroke`
train_set, val_set, test_set = train_val_test_split(df_new, stratify='stroke')

In [None]:
# Report the number of rows in each partition
print("Longitud del Training Set:", train_set.shape[0])
print("Longitud del Validation Set:", val_set.shape[0])
print("Longitud del Test Set:", test_set.shape[0])

In [None]:
# Training set: features (every column except `stroke`) and target (`stroke`)
X_train = train_set.drop(columns="stroke")
y_train = train_set.loc[:, "stroke"].copy()

In [None]:
# Validation set: features (every column except `stroke`) and target (`stroke`)
X_val = val_set.drop(columns="stroke")
y_val = val_set.loc[:, "stroke"].copy()

In [None]:
# Test set: features (every column except `stroke`) and target (`stroke`)
X_test = test_set.drop(columns="stroke")
y_test = test_set.loc[:, "stroke"].copy()

## Segunda preparación del conjunto de datos

In [None]:
# Preprocessing pipeline: fill missing values with the median, then scale the
# data with RobustScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('rbst_scaler', RobustScaler()),
])

In [None]:
# Impute missing values with the median and scale the data.
# The pipeline is fitted on the training set only; validation and test are
# transformed with those already-fitted statistics. Calling fit_transform on
# each split would refit the imputer and scaler per split, so the model would
# be evaluated on features scaled differently from the ones it was trained on.
X_train_prep = num_pipeline.fit_transform(X_train)
X_val_prep = num_pipeline.transform(X_val)
X_test_prep = num_pipeline.transform(X_test)

In [None]:
# Wrap the transformed arrays back into pandas DataFrames, restoring the
# original column names and row index of each split
X_train_prep = pd.DataFrame(
    data=X_train_prep, index=y_train.index, columns=X_train.columns)
X_val_prep = pd.DataFrame(
    data=X_val_prep, index=y_val.index, columns=X_val.columns)
X_test_prep = pd.DataFrame(
    data=X_test_prep, index=y_test.index, columns=X_test.columns)

In [None]:
# Display the preprocessed training features to check that the changes were applied
X_train_prep

## SVM: Kernel lineal

### Conjunto de datos reducido

**Entrenamiento del algoritmo con un conjunto de datos reducido**

In [None]:
# Keep only two features so the data can be drawn in 2-D
X_train_reduced = X_train_prep.loc[:, ["age", "avg_glucose_level"]].copy()
X_val_reduced = X_val_prep.loc[:, ["age", "avg_glucose_level"]].copy()

In [None]:
# Display the two-feature training set
X_train_reduced

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVC with C=50, trained on the two reduced features
svm_clf = SVC(kernel="linear", C=50)
svm_clf.fit(X_train_reduced, y_train)

**Representación del límite de decisión**

In [None]:
def plot_svc_decision_boundary(svm_clf, xmin, xmax):
    """Draw a linear classifier's decision boundary and margins with pyplot.

    Reads `coef_[0]`, `intercept_[0]` and `support_vectors_` from `svm_clf`
    and uses only the first two coefficients.

    NOTE(review): this cell redefines `plot_svc_decision_boundary` with the
    same body as the definition in the helper-functions section earlier in
    the notebook; this later definition is the one in effect for the calls
    below. Consider keeping a single definition.

    Args:
        svm_clf: Fitted classifier exposing `coef_`, `intercept_` and
            `support_vectors_`.
        xmin: Left end of the horizontal range to draw over.
        xmax: Right end of the horizontal range to draw over.
    """
    w = svm_clf.coef_[0]
    b = svm_clf.intercept_[0]

    # On the boundary w0*x0 + w1*x1 + b = 0, hence x1 = -w0/w1 * x0 - b/w1
    x0 = np.linspace(xmin, xmax, 200)
    decision_boundary = -w[0]/w[1] * x0 - b/w[1]

    # Vertical offset between the boundary and each margin line
    margin = 1/w[1]
    gutter_up = decision_boundary + margin
    gutter_down = decision_boundary - margin

    # Highlight the support vectors, then draw the boundary (solid) and margins (dashed)
    svs = svm_clf.support_vectors_
    plt.scatter(svs[:, 0], svs[:, 1], s=180, facecolors='#FFAAAA')
    plt.plot(x0, decision_boundary, "k-", linewidth=2)
    plt.plot(x0, gutter_up, "k--", linewidth=2)
    plt.plot(x0, gutter_down, "k--", linewidth=2)

In [None]:
# Training points (green triangles: stroke == 1, blue squares: stroke == 0)
# with the linear SVC boundary and margins drawn on top.
# NOTE(review): X_train_reduced holds features produced by num_pipeline
# (RobustScaler), while the axis limits below look like raw-unit ranges —
# confirm that the visible window matches the scaled data.
plt.figure(figsize=(12, 6))
plt.plot(
    X_train_reduced["age"].values[y_train == 1],
    X_train_reduced["avg_glucose_level"].values[y_train == 1],
    "g^",
)
plt.plot(
    X_train_reduced["age"].values[y_train == 0],
    X_train_reduced["avg_glucose_level"].values[y_train == 0],
    "bs",
)
plot_svc_decision_boundary(svm_clf, 0, 1)
plt.title("$C = {}$".format(svm_clf.C), fontsize=16)
plt.axis([0, 1, -100, 250])
plt.xlabel("age", fontsize=13)
plt.ylabel("avg_glucose_level", fontsize=13)
plt.show()

**Predicción con un conjunto de datos reducido**

In [None]:
# Predict on the two-feature validation set with the linear SVC
y_pred = svm_clf.predict(X_val_reduced)

In [None]:
# f1_score's signature is (y_true, y_pred): pass the validation labels first.
# pos_label=0 reports F1 for the class encoded as 0 — NOTE(review): confirm
# that this, rather than stroke == 1, is the class of interest.
print("F1 Score:", f1_score(y_val, y_pred, pos_label=0))

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the current predictions on the validation set, to three decimals
print('Accuracy: {:.3f}'.format(accuracy_score(y_val,y_pred)))

### Conjunto de datos completo

In [None]:
# Train a linear-kernel SVC (C=1) on the full preprocessed feature set
from sklearn.svm import SVC

svm_clf = SVC(kernel="linear", C=1)
svm_clf.fit(X_train_prep, y_train)

In [None]:
# Predict on the full-feature validation set with the linear SVC
y_pred = svm_clf.predict(X_val_prep)

In [None]:
# f1_score's signature is (y_true, y_pred): pass the validation labels first.
# pos_label=0 reports F1 for the class encoded as 0 — NOTE(review): confirm
# that this, rather than stroke == 1, is the class of interest.
print("F1 Score:", f1_score(y_val, y_pred, pos_label=0))

In [None]:
# Accuracy of the current predictions on the validation set, to three decimals
print('Accuracy: {:.3f}'.format(accuracy_score(y_val,y_pred)))

## SVM: Kernel no lineal

### Polynomial Kernel (I)

**Entrenamiento del algoritmo con un conjunto de datos reducido**

In [None]:
# NOTE(review): make_moons is imported but not used in the visible cells.
from sklearn.datasets import make_moons
from sklearn.svm import LinearSVC
from sklearn.preprocessing import PolynomialFeatures

# Degree-3 polynomial features -> standard scaling -> linear SVC with hinge
# loss, trained on the two reduced features
polynomial_svm_clf = Pipeline(steps=[
    ("poly_features", PolynomialFeatures(degree=3)),
    ("scaler", StandardScaler()),
    ("svm_clf", LinearSVC(C=20, loss="hinge", random_state=42, max_iter=100000)),
])

polynomial_svm_clf.fit(X_train_reduced, y_train)

In [None]:
def plot_dataset(X, y):
    """Plot the first two columns of `X`, one marker format per class.

    Rows where `y == 1` are drawn with the "b." format and rows where
    `y == 0` with the "g." format, in that order.

    Args:
        X: 2-D array; column 0 goes on the horizontal axis, column 1 on the
            vertical axis.
        y: Labels aligned with the rows of `X`.
    """
    for label, fmt in ((1, "b."), (0, "g.")):
        selected = y == label
        plt.plot(X[:, 0][selected], X[:, 1][selected], fmt)

In [None]:
def plot_predictions(clf, axes):
    """Shade a classifier's predictions and decision scores over a 2-D grid.

    Builds a 100 x 100 grid over the given ranges, evaluates `clf.predict`
    and `clf.decision_function` on every grid point and draws both results
    as filled contours with pyplot.

    Args:
        clf: Object exposing `predict` and `decision_function`, each taking
            an (n_points, 2) array.
        axes: Sequence `[x_min, x_max, y_min, y_max]` giving the grid limits.
    """
    grid_x, grid_y = np.meshgrid(
        np.linspace(axes[0], axes[1], 100),
        np.linspace(axes[2], axes[3], 100),
    )
    # One row per grid point: (horizontal coordinate, vertical coordinate).
    points = np.c_[grid_x.ravel(), grid_y.ravel()]
    labels = clf.predict(points).reshape(grid_x.shape)
    scores = clf.decision_function(points).reshape(grid_x.shape)
    # Predicted classes first, then the decision scores as a fainter overlay.
    for surface, opacity in ((labels, 0.2), (scores, 0.1)):
        plt.contourf(grid_x, grid_y, surface, cmap=plt.cm.brg, alpha=opacity)

# Left panel: training points over the predicted regions; right panel: the
# predicted regions only.
# X_train_reduced has columns ["age", "avg_glucose_level"], so column 0 (age)
# is on the horizontal axis and column 1 (avg_glucose_level) on the vertical
# one; the axis labels follow that order.
fig, axes = plt.subplots(ncols=2, figsize=(15,5), sharey=True)
plt.sca(axes[0])
plot_dataset(X_train_reduced.values, y_train)
plot_predictions(polynomial_svm_clf, [0, 1, -100, 250])
plt.xlabel("age", fontsize=11)
plt.ylabel("avg_glucose_level", fontsize=11)
plt.sca(axes[1])
plot_predictions(polynomial_svm_clf, [0, 1, -100, 250])
plt.xlabel("age", fontsize=11)
plt.ylabel("avg_glucose_level", fontsize=11)
plt.show()

**Predicción con el conjunto de datos reducido**

In [None]:
# Predict on the two-feature validation set with the polynomial-features pipeline
y_pred = polynomial_svm_clf.predict(X_val_reduced)

In [None]:
# f1_score's signature is (y_true, y_pred): pass the validation labels first.
# pos_label=0 reports F1 for the class encoded as 0 — NOTE(review): confirm
# that this, rather than stroke == 1, is the class of interest.
print("F1 Score:", f1_score(y_val, y_pred, pos_label=0))

In [None]:
# Accuracy of the current predictions on the validation set, to three decimals
print('Accuracy: {:.3f}'.format(accuracy_score(y_val,y_pred)))

### Polynomial Kernel (II)

In [None]:
# Polynomial-kernel SVC (degree 3, coef0=10, C=20) trained on the two reduced features
svm_clf = SVC(kernel="poly", degree=3, coef0=10, C=20)
svm_clf.fit(X_train_reduced, y_train)

**Representación del límite de decisión**

In [None]:
# Left panel: training points over the predicted regions; right panel: the
# predicted regions only.
# X_train_reduced has columns ["age", "avg_glucose_level"], so column 0 (age)
# is on the horizontal axis and column 1 (avg_glucose_level) on the vertical
# one; the axis labels follow that order.
fig, axes = plt.subplots(ncols=2, figsize=(15,5), sharey=True)
plt.sca(axes[0])
plot_dataset(X_train_reduced.values, y_train)
plot_predictions(svm_clf, [0, 1, -100, 250])
plt.xlabel("age", fontsize=11)
plt.ylabel("avg_glucose_level", fontsize=11)
plt.sca(axes[1])
plot_predictions(svm_clf, [0, 1, -100, 250])
plt.xlabel("age", fontsize=11)
plt.ylabel("avg_glucose_level", fontsize=11)
plt.show()

**Predicción con un conjunto de datos reducido**

In [None]:
# Predict on the two-feature validation set with the polynomial-kernel SVC
y_pred = svm_clf.predict(X_val_reduced)

In [None]:
# f1_score's signature is (y_true, y_pred): pass the validation labels first.
# pos_label=0 reports F1 for the class encoded as 0 — NOTE(review): confirm
# that this, rather than stroke == 1, is the class of interest.
print("F1 Score:", f1_score(y_val, y_pred, pos_label=0))

In [None]:
# Accuracy of the current predictions on the validation set, to three decimals
print('Accuracy: {:.3f}'.format(accuracy_score(y_val,y_pred)))

**Predicción con el conjunto de datos completo**

In [None]:
# Polynomial-kernel SVC (degree 3, coef0=10, C=40) trained on the full preprocessed feature set
svm_clf = SVC(kernel="poly", degree=3, coef0=10, C=40)
svm_clf.fit(X_train_prep, y_train)

In [None]:
# Predict on the full-feature validation set with the polynomial-kernel SVC
y_pred = svm_clf.predict(X_val_prep)

In [None]:
# f1_score's signature is (y_true, y_pred): pass the validation labels first.
# pos_label=0 reports F1 for the class encoded as 0 — NOTE(review): confirm
# that this, rather than stroke == 1, is the class of interest.
print("F1 Score:", f1_score(y_val, y_pred, pos_label=0))

In [None]:
# Accuracy of the current predictions on the validation set, to three decimals
print('Accuracy: {:.3f}'.format(accuracy_score(y_val,y_pred)))

### Gaussian Kernel

**Entrenamiento del algoritmo con un conjunto de datos reducido**

In [None]:
# RBF-kernel SVC (gamma=0.5, C=1000) behind a RobustScaler, trained on the two
# reduced features.
# NOTE(review): X_train_reduced is taken from X_train_prep, which has already
# been through num_pipeline's RobustScaler, so the data is scaled twice here —
# confirm this is intended.
rbf_kernel_svm_clf = Pipeline(steps=[
    ("scaler", RobustScaler()),
    ("svm_clf", SVC(kernel="rbf", gamma=0.5, C=1000)),
])

rbf_kernel_svm_clf.fit(X_train_reduced, y_train)

**Representación del límite de decisión**

In [None]:
# Left panel: training points over the predicted regions; right panel: the
# predicted regions only.
# X_train_reduced has columns ["age", "avg_glucose_level"], so column 0 (age)
# is on the horizontal axis and column 1 (avg_glucose_level) on the vertical
# one; the axis labels follow that order.
fig, axes = plt.subplots(ncols=2, figsize=(15,5), sharey=True)
plt.sca(axes[0])
plot_dataset(X_train_reduced.values, y_train)
plot_predictions(rbf_kernel_svm_clf, [0, 1, -100, 250])
plt.xlabel("age", fontsize=11)
plt.ylabel("avg_glucose_level", fontsize=11)
plt.sca(axes[1])
plot_predictions(rbf_kernel_svm_clf, [0, 1, -100, 250])
plt.xlabel("age", fontsize=11)
plt.ylabel("avg_glucose_level", fontsize=11)
plt.show()

**Predicción con un conjunto de datos reducido**

In [None]:
# Predict on the two-feature validation set with the RBF-kernel pipeline
y_pred = rbf_kernel_svm_clf.predict(X_val_reduced)

In [None]:
# f1_score's signature is (y_true, y_pred): pass the validation labels first.
# pos_label=0 reports F1 for the class encoded as 0 — NOTE(review): confirm
# that this, rather than stroke == 1, is the class of interest.
print("F1 Score:", f1_score(y_val, y_pred, pos_label=0))

In [None]:
# Accuracy of the current predictions on the validation set, to three decimals
print('Accuracy: {:.3f}'.format(accuracy_score(y_val,y_pred)))

**Predicción con un conjunto de datos completo**

In [None]:
# RBF-kernel SVC (gamma=0.05, C=1000) behind a RobustScaler, trained on the
# full preprocessed feature set.
# NOTE(review): X_train_prep has already been through num_pipeline's
# RobustScaler, so the data is scaled twice here — confirm this is intended.
rbf_kernel_svm_clf = Pipeline(steps=[
    ("scaler", RobustScaler()),
    ("svm_clf", SVC(kernel="rbf", gamma=0.05, C=1000)),
])

rbf_kernel_svm_clf.fit(X_train_prep, y_train)

In [None]:
# Predict on the full-feature validation set with the RBF-kernel pipeline
y_pred = rbf_kernel_svm_clf.predict(X_val_prep)

In [None]:
# f1_score's signature is (y_true, y_pred): pass the validation labels first.
# pos_label=0 reports F1 for the class encoded as 0 — NOTE(review): confirm
# that this, rather than stroke == 1, is the class of interest.
print("F1 Score:", f1_score(y_val, y_pred, pos_label=0))

In [None]:
# Accuracy of the current predictions on the validation set, to three decimals
print('Accuracy: {:.3f}'.format(accuracy_score(y_val,y_pred)))