# Lab 8 Clasificador Naive Bayes

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB

## Métodos de validación

### Holdout

In [2]:
def holdout_validation(X, y, test_size=0.3, random_state=42):
    """Split a dataset into stratified training and test partitions.

    Args:
        X: Feature data, forwarded unchanged to ``train_test_split``.
        y: Target labels. Also used as the stratification key, so both
            partitions are drawn following the class distribution of ``y``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
            Defaults to 0.3, the value that used to be hard-coded here.
        random_state: Seed forwarded to ``train_test_split`` so repeated
            calls produce the same split. Defaults to 42, the value that
            used to be hard-coded here.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

### K-Fold Cross-Validation

In [3]:
def k_fold_cross_validation(X, y, K=10):
    """Build the train/test partitions of a stratified K-fold split.

    Every column of ``X`` and the target ``y`` are converted with
    ``pd.to_numeric(..., errors='coerce')``, so values that cannot be parsed
    become NaN. Missing feature values are then replaced by the column mean,
    which is computed over the whole frame before any splitting takes place.

    Args:
        X: Feature DataFrame.
        y: Target Series (indexed positionally with ``.iloc``).
        K: Number of folds handed to ``StratifiedKFold``.

    Returns:
        A list with one ``(X_train, X_test, y_train, y_test)`` tuple of
        NumPy arrays per fold.
    """
    features = X.apply(pd.to_numeric, errors='coerce')
    features = features.fillna(features.mean())
    targets = pd.to_numeric(y, errors='coerce')

    # Shuffled with a fixed seed so the folds are the same on every call.
    splitter = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

    return [
        (
            features.iloc[train_idx].values,
            features.iloc[test_idx].values,
            targets.iloc[train_idx].values,
            targets.iloc[test_idx].values,
        )
        for train_idx, test_idx in splitter.split(features, targets)
    ]

### Leave-One-Out

In [4]:
def leave_one_out_validation(X, y):
    """Build the train/test partitions of a leave-one-out split.

    Every column of ``X`` is converted with
    ``pd.to_numeric(..., errors='coerce')`` and the resulting NaNs are
    replaced by the column mean, computed over the whole frame before any
    splitting takes place. ``y`` is used as given.

    Args:
        X: Feature DataFrame.
        y: Target Series (indexed positionally with ``.iloc``).

    Returns:
        A list with one ``(X_train, X_test, y_train, y_test)`` tuple of
        NumPy arrays per split produced by ``LeaveOneOut``. The whole list
        is built in memory before it is returned.
    """
    features = X.apply(pd.to_numeric, errors='coerce')
    features = features.fillna(features.mean())

    splitter = LeaveOneOut()

    return [
        (
            features.iloc[train_idx].values,
            features.iloc[test_idx].values,
            y.iloc[train_idx].values,
            y.iloc[test_idx].values,
        )
        for train_idx, test_idx in splitter.split(features)
    ]

## Clasificador Naive Bayes

In [5]:
# Load the three datasets used in the experiments below. Note that the iris
# file is read from the working directory while the other two are read from
# the 'dataset/' sub-folder.
iris = pd.read_csv('iris_clean.csv')
diabetes = pd.read_csv('dataset/diabetes.csv')
parkinsons = pd.read_csv('dataset/parkinsons_disease_data.csv')

# One Gaussian Naive Bayes estimator shared by every experiment; each
# experiment calls fit() on it again before predicting.
model = GaussianNB()

## Dataset iris

In [6]:
# Separate the predictors from the encoded class column used as target.
irisX = iris.drop(columns=['class', 'class_encoded'])
irisY = iris['class_encoded']

# Class labels taken from the full target column. Passing them explicitly to
# every confusion_matrix call keeps all matrices the same shape and in the
# same row/column order, even if a test split happens to miss a class, and
# does not assume the labels are exactly 0..n-1.
class_labels = np.unique(irisY)
num_clases = len(class_labels)

# Hold-out validation
X_train, X_test, y_train, y_test = holdout_validation(irisX, irisY)
model.fit(X_train, y_train)
yPredict = model.predict(X_test)

conf_matrix = confusion_matrix(y_test, yPredict, labels=class_labels)
accuracy = accuracy_score(y_test, yPredict)

print("    Hold-out")
print("Matriz de Confusión:")
print(conf_matrix)
print("\nPrecisión", accuracy)

# K-fold validation: confusion matrices are summed over the folds and the
# reported score is the mean of the per-fold accuracies.
folds = k_fold_cross_validation(irisX, irisY)
conf_matrix_kfold = np.zeros((num_clases, num_clases))
accuracy_kfold = []

for X_train, X_test, Y_train, Y_test in folds:
    model.fit(X_train, Y_train)
    y_pred_fold = model.predict(X_test)
    conf_matrix_kfold += confusion_matrix(Y_test, y_pred_fold, labels=class_labels)
    accuracy_kfold.append(accuracy_score(Y_test, y_pred_fold))

average_accuracy_kfold = np.mean(accuracy_kfold)

print("\n    K-Fold")
print("Matriz de Confusión:")
print(conf_matrix_kfold)
print("\nPrecisión:", average_accuracy_kfold)


# Leave-one-out validation: each test split holds a single sample, so the
# explicit labels are what give every per-split matrix its full shape.
conf_matrix_leave = np.zeros((num_clases, num_clases))

accuracy_leave = []
foldsOne = leave_one_out_validation(irisX, irisY)
for X_train, X_test, Y_train, Y_test in foldsOne:
    model.fit(X_train, Y_train)
    y_pred_fold = model.predict(X_test)
    conf_matrix_leave += confusion_matrix(Y_test, y_pred_fold, labels=class_labels)
    accuracy_leave.append(accuracy_score(Y_test, y_pred_fold))

# Mean of the per-split accuracies
average_accuracy_leave = np.mean(accuracy_leave)

# Report the results
print("\n    Leave-One-Out")
print("Matriz de Confusión acumulada:")
print(conf_matrix_leave)
print("\nPrecisión promedio:", average_accuracy_leave)

    Hold-out
Matriz de Confusión:
[[15  0  0]
 [ 0 14  1]
 [ 0  3 12]]

Precisión 0.9111111111111111

    K-Fold
Matriz de Confusión:
[[50.  0.  0.]
 [ 0. 47.  3.]
 [ 0.  4. 46.]]

Precisión: 0.9533333333333334

    Leave-One-Out
Matriz de Confusión acumulada:
[[50.  0.  0.]
 [ 0. 47.  3.]
 [ 0.  4. 46.]]

Precisión promedio: 0.9533333333333334


## Dataset diabetes

In [7]:
# Separate the predictors from the 'Outcome' target column.
diabetesX = diabetes.drop(columns=['Outcome'])
diabetesY = diabetes['Outcome']

# Class labels taken from the full target column. Passing them explicitly to
# every confusion_matrix call keeps all matrices the same shape and in the
# same row/column order, even if a test split happens to miss a class, and
# does not assume the labels are exactly 0..n-1.
class_labels = np.unique(diabetesY)
num_clases = len(class_labels)

# Hold-out validation
X_train, X_test, y_train, y_test = holdout_validation(diabetesX, diabetesY)
model.fit(X_train, y_train)
yPredict = model.predict(X_test)

conf_matrix = confusion_matrix(y_test, yPredict, labels=class_labels)
accuracy = accuracy_score(y_test, yPredict)

print("    Hold-out")
print("Matriz de Confusión:")
print(conf_matrix)
print("\nPrecisión", accuracy)

# K-fold validation: confusion matrices are summed over the folds and the
# reported score is the mean of the per-fold accuracies.
folds = k_fold_cross_validation(diabetesX, diabetesY)
conf_matrix_kfold = np.zeros((num_clases, num_clases))
accuracy_kfold = []

for X_train, X_test, Y_train, Y_test in folds:
    model.fit(X_train, Y_train)
    y_pred_fold = model.predict(X_test)
    conf_matrix_kfold += confusion_matrix(Y_test, y_pred_fold, labels=class_labels)
    accuracy_kfold.append(accuracy_score(Y_test, y_pred_fold))

average_accuracy_kfold = np.mean(accuracy_kfold)

print("\n    K-Fold")
print("Matriz de Confusión:")
print(conf_matrix_kfold)
print("\nPrecisión:", average_accuracy_kfold)

# Leave-one-out validation: each test split holds a single sample, so the
# explicit labels are what give every per-split matrix its full shape.
conf_matrix_leave = np.zeros((num_clases, num_clases))

accuracy_leave = []
foldsOne = leave_one_out_validation(diabetesX, diabetesY)
for X_train, X_test, Y_train, Y_test in foldsOne:
    model.fit(X_train, Y_train)
    y_pred_fold = model.predict(X_test)
    conf_matrix_leave += confusion_matrix(Y_test, y_pred_fold, labels=class_labels)
    accuracy_leave.append(accuracy_score(Y_test, y_pred_fold))

# Mean of the per-split accuracies
average_accuracy_leave = np.mean(accuracy_leave)

# Report the results
print("\n    Leave-One-Out")
print("Matriz de Confusión acumulada:")
print(conf_matrix_leave)
print("\nPrecisión promedio:", average_accuracy_leave)

    Hold-out
Matriz de Confusión:
[[122  28]
 [ 31  50]]

Precisión 0.7445887445887446

    K-Fold
Matriz de Confusión:
[[417.  83.]
 [105. 163.]]

Precisión: 0.7551606288448395

    Leave-One-Out
Matriz de Confusión acumulada:
[[418.  82.]
 [107. 161.]]

Precisión promedio: 0.75390625


## Dataset parkinsons

In [8]:
# Keep only the columns used in the Parkinson's experiment: the predictor
# columns plus 'Diagnosis', which the next cell splits off as the target.
featuresParkinsons = parkinsons.loc[:, [
    'UPDRS',
    'FunctionalAssessment',
    'MoCA',
    'Tremor',
    'Rigidity',
    'Bradykinesia',
    'Age',
    'DietQuality',
    'CholesterolTotal',
    'CholesterolLDL',
    'CholesterolTriglycerides',
    'SystolicBP',
    'CholesterolHDL',
    'AlcoholConsumption',
    'Diagnosis',
]]

In [9]:
# Separate the predictors from the 'Diagnosis' target column.
parkinsonsX = featuresParkinsons.drop(columns=['Diagnosis'])
parkinsonsY = featuresParkinsons['Diagnosis']

# Class labels taken from the full target column. Passing them explicitly to
# every confusion_matrix call keeps all matrices the same shape and in the
# same row/column order, even if a test split happens to miss a class, and
# does not assume the labels are exactly 0..n-1.
class_labels = np.unique(parkinsonsY)
num_clases = len(class_labels)

# Hold-out validation
X_train, X_test, y_train, y_test = holdout_validation(parkinsonsX, parkinsonsY)
model.fit(X_train, y_train)
yPredict = model.predict(X_test)

conf_matrix = confusion_matrix(y_test, yPredict, labels=class_labels)
accuracy = accuracy_score(y_test, yPredict)

print("    Hold-out")
print("Matriz de Confusión:")
print(conf_matrix)
print("\nPrecisión", accuracy)

# K-fold validation: confusion matrices are summed over the folds and the
# reported score is the mean of the per-fold accuracies.
folds = k_fold_cross_validation(parkinsonsX, parkinsonsY)
conf_matrix_kfold = np.zeros((num_clases, num_clases))
accuracy_kfold = []

for X_train, X_test, Y_train, Y_test in folds:
    model.fit(X_train, Y_train)
    y_pred_fold = model.predict(X_test)
    conf_matrix_kfold += confusion_matrix(Y_test, y_pred_fold, labels=class_labels)
    accuracy_kfold.append(accuracy_score(Y_test, y_pred_fold))

average_accuracy_kfold = np.mean(accuracy_kfold)

print("\n    K-Fold")
print("Matriz de Confusión:")
print(conf_matrix_kfold)
print("\nPrecisión:", average_accuracy_kfold)

# Leave-one-out validation: each test split holds a single sample, so the
# explicit labels are what give every per-split matrix its full shape.
conf_matrix_leave = np.zeros((num_clases, num_clases))

accuracy_leave = []
foldsOne = leave_one_out_validation(parkinsonsX, parkinsonsY)
for X_train, X_test, Y_train, Y_test in foldsOne:
    model.fit(X_train, Y_train)
    y_pred_fold = model.predict(X_test)
    conf_matrix_leave += confusion_matrix(Y_test, y_pred_fold, labels=class_labels)
    accuracy_leave.append(accuracy_score(Y_test, y_pred_fold))

# Mean of the per-split accuracies
average_accuracy_leave = np.mean(accuracy_leave)

# Report the results
print("\n    Leave-One-Out")
print("Matriz de Confusión acumulada:")
print(conf_matrix_leave)
print("\nPrecisión promedio:", average_accuracy_leave)

    Hold-out
Matriz de Confusión:
[[173  67]
 [ 60 332]]

Precisión 0.7990506329113924

    K-Fold
Matriz de Confusión:
[[ 563.  238.]
 [ 185. 1119.]]

Precisión: 0.7990273076055067

    Leave-One-Out
Matriz de Confusión acumulada:
[[ 554.  247.]
 [ 181. 1123.]]

Precisión promedio: 0.7966745843230404
