# Algoritmo Naive Bayes 

## Imports

In [2]:
# importamos las librerias y bibliotecas que utilizaremos para el algoritmo naive bayes
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.gridspec as gridspec
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from pandas import DataFrame
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import f1_score
import numpy as np

## Funciones auxiliares

In [3]:
# Construcción de una función que realice el particionado completo
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)

In [25]:
#Función para representar el límite de desición
from matplotlib import pyplot as plt
from matplotlib import colors
import matplotlib.gridspec as gridspec

def plot_decision_boundary(X, y, clf):
    mins = X.min(axis=0) - 0.5
    maxs = X.max(axis=0) + 0.5  

    xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], 71),
                     np.linspace(mins[1], maxs[1], 81))
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z = Z[:, 1].reshape(xx.shape)
    
    gs = gridspec.GridSpec(1, 2)
    gs.update(hspace=0.8)
    
    ax = plt.subplot(gs[0])
    ax.contourf(xx, yy, Z, cmap="RdBu", alpha=0.5)
    ax.plot(X[:, 0][y==1], X[:, 1][y==1], 'r.')
    ax.plot(X[:, 0][y==0], X[:, 1][y==0], 'b.')
    
    ax = plt.subplot(gs[1])
    ax.contour(xx, yy, Z, [0.5], colors='k')
    ax.plot(X[:, 0][y==1], X[:, 1][y==1], 'r.')
    ax.plot(X[:, 0][y==0], X[:, 1][y==0], 'b.')

## Lectura del conjunto de datos

In [5]:
df = pd.read_csv('dataset.csv')

##  Visualización preliminar de la información

In [6]:
# Vista de los datos cargados
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
43395,56196,Female,10.0,0,0,No,children,Urban,58.64,20.4,never smoked,0
43396,5450,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,28375,Female,82.0,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,27973,Male,40.0,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0


In [7]:
#Validamos la cantidad de ejemplos que posee el dataset
df["stroke"].value_counts()

0    42617
1      783
Name: stroke, dtype: int64

## Primera preparación del conjunto de datos

In [8]:
#Función para transformar las columnas categoricas a numericas
df_new = pd.get_dummies(df, columns=['gender','ever_married','work_type','Residence_type','smoking_status'])

In [9]:
#Validamos que los cambios se aplicarón correctamente
df_new

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,...,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,30669,3.0,0,0,95.12,18.0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,0
1,30468,58.0,1,0,87.96,39.2,0,0,1,0,...,0,0,1,0,0,0,1,0,1,0
2,16523,8.0,0,0,110.89,17.6,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
3,56543,70.0,0,0,69.04,35.9,0,1,0,0,...,0,0,1,0,0,1,0,1,0,0
4,46136,14.0,0,0,161.28,19.1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43395,56196,10.0,0,0,58.64,20.4,0,1,0,0,...,0,0,0,0,1,0,1,0,1,0
43396,5450,56.0,0,0,213.61,55.4,0,1,0,0,...,1,0,0,0,0,0,1,1,0,0
43397,28375,82.0,1,0,91.94,28.9,0,1,0,0,...,0,0,1,0,0,0,1,1,0,0
43398,27973,40.0,0,0,99.16,33.2,0,0,1,0,...,0,0,1,0,0,0,1,0,1,0


In [10]:
#Eliminamos datos que son irrelevantes y que podrian alterar el comportamiento del algoritmo
df_new = df_new.drop(["id"], axis=1)

## División del conjunto de datos

In [11]:
# Division del conjunto en los diferentes subconjuntos
train_set, val_set, test_set = train_val_test_split(df_new, stratify='stroke')

In [12]:
#Verificamos las particiones realizadas
print("Longitud del Training Set:", len(train_set))
print("Longitud del Validation Set:", len(val_set))
print("Longitud del Test Set:", len(test_set))

Longitud del Training Set: 26040
Longitud del Validation Set: 8680
Longitud del Test Set: 8680


In [13]:
# Conjunto de datos general
X_df = df_new.drop("stroke", axis=1)
y_df = df_new["stroke"].copy()

In [14]:
# Conjunto de datos de entrenamiento
X_train = train_set.drop("stroke", axis=1)
y_train = train_set["stroke"].copy()

In [15]:
# Conjunto de datos de validación
X_val = val_set.drop("stroke", axis=1)
y_val = val_set["stroke"].copy()

In [16]:
# Conjunto de datos de pruebas
X_test = test_set.drop("stroke", axis=1)
y_test = test_set["stroke"].copy()

## Segunda preparación del conjunto de datos

In [17]:
# Construcción de un pipeline para los atributos numéricos
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('rbst_scaler', RobustScaler()),
    ])

In [18]:
# Rellenamos los valores nulos con la mediana y scalamos los datos
df_prep = num_pipeline.fit_transform(X_df)
X_train_prep = num_pipeline.fit_transform(X_train)
X_val_prep = num_pipeline.fit_transform(X_val)
X_test_prep = num_pipeline.fit_transform(X_test)

In [19]:
# Transformamos el resultado a un DataFrame de Pandas
df_prep = pd.DataFrame(df_prep, columns=X_df.columns, index=y_df.index)
X_train_prep = pd.DataFrame(X_train_prep, columns=X_train.columns, index=y_train.index)
X_val_prep = pd.DataFrame(X_val_prep, columns=X_val.columns, index=y_val.index)
X_test_prep = pd.DataFrame(X_test_prep, columns=X_test.columns, index=y_test.index)

In [20]:
#Validamos que los cambios se aplicaron correctamente 
df_prep

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,-1.138889,0.0,0.0,0.102520,-1.054348,-1.0,1.0,0.0,1.0,-1.0,0.0,0.0,-1.0,0.0,1.0,1.0,-1.0,0.0,0.0,0.0
1,0.388889,1.0,0.0,-0.104836,1.250000,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-1.000000,0.0,0.0,0.559224,-1.097826,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.722222,0.0,0.0,-0.652766,0.891304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,1.0,0.0,0.0
4,-0.833333,0.0,0.0,2.018535,-0.934783,-1.0,1.0,0.0,1.0,-1.0,0.0,1.0,-1.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43395,-0.944444,0.0,0.0,-0.953953,-0.793478,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,-1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
43396,0.333333,0.0,0.0,3.534028,3.010870,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
43397,1.055556,1.0,0.0,0.010426,0.130435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
43398,-0.111111,0.0,0.0,0.219519,0.597826,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Naive Bayes con un conjunto de datos multidimensional

In [47]:
from sklearn.naive_bayes import BernoulliNB

nb_clf = BernoulliNB(alpha=1.0e-10)
nb_clf.fit(X_train_prep, y_train)

BernoulliNB(alpha=1e-10)

In [48]:
# Predecimos para el conjunto de validación 
y_pred = nb_clf.predict(X_val_prep)

In [49]:
print('F1 score: {:.3f}'.format(f1_score(y_val, y_pred, pos_label=0)))

F1 score: 0.879


In [51]:
from sklearn.metrics import accuracy_score
print('Accuracy: {:.3f}'.format(accuracy_score(y_val, y_pred)))

Accuracy: 0.787


In [62]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_val, y_pred)

array([[6753, 1770],
       [ 124,   33]], dtype=int64)

In [52]:
# Predecimos para el conjunto de pruebas
y_pred = nb_clf.predict(X_test_prep)

In [53]:
print('F1 score: {:.3f}'.format(f1_score(y_test, y_pred, pos_label=0)))

F1 score: 0.882


In [54]:
print('Accuracy: {:.3f}'.format(accuracy_score(y_test, y_pred)))

Accuracy: 0.791


In [63]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_val, y_pred)

array([[6753, 1770],
       [ 124,   33]], dtype=int64)

## Gaussian Naive Bayescon un conjunto de datos multidimensional

In [55]:
# Entrenamos un Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(X_train_prep, y_train)

GaussianNB()

In [56]:
# Predecimos para el conjunto de validación 
y_pred = clf.predict(X_val_prep)

In [57]:
print('F1 score: {:.3f}'.format(f1_score(y_val, y_pred, pos_label=0)))

F1 score: 0.439


In [58]:
print('Accuracy: {:.3f}'.format(accuracy_score(y_val, y_pred)))

Accuracy: 0.294


In [64]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_val, y_pred)

array([[6753, 1770],
       [ 124,   33]], dtype=int64)

In [59]:
# Predecimos para el conjunto de pruebas
y_pred = nb_clf.predict(X_test_prep)

In [60]:
print('F1 score: {:.3f}'.format(f1_score(y_test, y_pred, pos_label=0)))

F1 score: 0.882


In [61]:
print('Accuracy: {:.3f}'.format(accuracy_score(y_test, y_pred)))

Accuracy: 0.791


In [65]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_val, y_pred)

array([[6753, 1770],
       [ 124,   33]], dtype=int64)