# Clasificadores

## Imports

In [130]:
# Librerías generales
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Datasets
from sklearn.datasets import load_iris, load_digits, load_breast_cancer, fetch_openml
import kagglehub
from kagglehub import KaggleDatasetAdapter
from tensorflow.keras.datasets import mnist

# Modelos
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

# Preprocesamiento
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Procesamiento para el Email Dataset
nltk.download('stopwords')
nltk.download('punkt')

# Métricas
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay

# Gráficas
from matplotlib.colors import ListedColormap

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lopez\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lopez\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Funciones auxiliares

### Iris Dataset

In [103]:
iris = load_iris()

# Guardamos en un Dataframe
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Separamos etiquetas
X, y = iris.data, iris.target

# Separamos en conjuntos de entrenamiento y pruebas
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Escalamos los datos
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### LinearSVC

In [104]:
# Crear el modelo
model = LinearSVC(random_state=42)

# Entrenar el modelo
model.fit(X_train, y_train)

# Predicciones con el conjunto de pruebas
y_pred = model.predict(X_test)

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Precision
print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")

# Recall
print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")

# F1-Score
print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted')}")

Accuracy: 0.9
Precision: 0.9023569023569025
Recall: 0.9
F1-Score: 0.8997493734335839


#### MLP

In [105]:
# Crear el modelo
model = MLPClassifier(hidden_layer_sizes=(50, 25), 
                          max_iter=500, activation='relu', random_state=42)

# Entrenar el modelo
model.fit(X_train, y_train)

# Predicciones con el conjunto de pruebas
y_pred = model.predict(X_test)

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Precision
print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")

# Recall
print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")

# F1-Score
print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted')}")

Accuracy: 0.9666666666666667
Precision: 0.9696969696969696
Recall: 0.9666666666666667
F1-Score: 0.9665831244778613


### Digits Dataset

In [110]:
digits = load_digits()

# Guardamos en un Dataframe
df = pd.DataFrame(data=digits.data, columns=digits.feature_names)

# Separamos etiquetasAC
X, y = digits.data, digits.target

# Separamos en conjuntos de entrenamiento y pruebas
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Escalamos los datos
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### AdaBoost

In [111]:
# Crear el modelo
model = AdaBoostClassifier(n_estimators=100, random_state=0)

# Entrenar el modelo
model.fit(X_train, y_train)

# Predicciones con el conjunto de pruebas
y_pred = model.predict(X_test)

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Precision
print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")

# Recall
print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")

# F1-Score
print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted')}")

Accuracy: 0.8305555555555556
Precision: 0.8366671488637865
Recall: 0.8305555555555556
F1-Score: 0.8290715678437479


### Breast Cancer Wisconsin Dataset

In [113]:
cancer = load_breast_cancer()

# Guardamos en un Dataframe
df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)

# Separamos etiquetas
X, y = cancer.data, cancer.target

# Separamos en conjuntos de entrenamiento y pruebas
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Escalamos los datos
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### NuSVC

In [114]:
# Crear el modelo
model = NuSVC(random_state=42)

# Entrenar el modelo
model.fit(X_train, y_train)

# Predicciones con el conjunto de pruebas
y_pred = model.predict(X_test)

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Precision
print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")

# Recall
print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")

# F1-Score
print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted')}")

Accuracy: 0.9385964912280702
Precision: 0.9390013495276655
Recall: 0.9385964912280702
F1-Score: 0.9380859556298152


### Titanic ML Dataset

In [106]:
# Guardamos en un Dataframe
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "yasserh/titanic-dataset",
  "Titanic-Dataset.csv",
)

# Visualización del Dataset
# print(df)

# Eliminamos ID y Ticket, no aportan nada en la predicción
df.drop(columns=["PassengerId", "Ticket"], inplace=True)

# Checamos si hay datos nulos
# df.isnull().sum()

# Llenamos Embarked con el valor más repetido (moda)
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

# Llenamos Age con la mediana de edades
df["Age"] = df["Age"].fillna(df["Age"].median())

# Nueva columna que reemplaza a Cabin con Has_Cabin
df["Has_Cabin"] = df["Cabin"].notnull().astype(int)
df.drop(columns=["Cabin"], inplace=True)

# Nueva columna que reemplaza a Name con Title
df["Title"] = df["Name"].apply(lambda x: x.split(", ")[1].split(".")[0].strip())
df.drop(columns=["Name"], inplace=True)
df["Title"] = df["Title"].replace(['Lady', 'the Countess','Capt','Col','Don','Dr',
                                   'Major','Rev','Sir','Jonkheer','Dona'], 'Rare')
df["Title"] = df["Title"].replace('Mlle', 'Miss')
df["Title"] = df["Title"].replace('Ms', 'Miss')
df["Title"] = df["Title"].replace('Mme', 'Mrs')

# Aplicamos Encode manual a la columna Sex
sex_map = {"male": 1, "female": 0}
df["Sex"] = df["Sex"].map(sex_map)

# Aplicamos One-Hot Encoding a las columnas Title y Embarked
df = pd.get_dummies(df, columns=['Title', 'Embarked'], drop_first=True)

# Dataset limpio
# print(df)

# Separamos etiquetas
X = df.drop('Survived', axis=1)
y = df['Survived']

# Separamos en conjuntos de entrenamiento y pruebas
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Escalamos los datos
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### KNN

In [107]:
# Crear el modelo
model = KNeighborsClassifier(n_neighbors=5)

# Entrenar el modelo
model.fit(X_train, y_train)

# Predicciones con el conjunto de pruebas
y_pred = model.predict(X_test)

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Precision
print(f"Precision: {precision_score(y_test, y_pred)}")

# Recall
print(f"Recall: {recall_score(y_test, y_pred)}")

# F1-Score
print(f"F1-Score: {f1_score(y_test, y_pred)}")

Accuracy: 0.8212290502793296
Precision: 0.7846153846153846
Recall: 0.7391304347826086
F1-Score: 0.7611940298507462


#### GradientBoost

In [108]:
# Crear el modelo
model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0)

# Entrenar el modelo
model.fit(X_train, y_train)

# Predicciones con el conjunto de pruebas
y_pred = model.predict(X_test)

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Precision
print(f"Precision: {precision_score(y_test, y_pred)}")

# Recall
print(f"Recall: {recall_score(y_test, y_pred)}")

# F1-Score
print(f"F1-Score: {f1_score(y_test, y_pred)}")

Accuracy: 0.7932960893854749
Precision: 0.7424242424242424
Recall: 0.7101449275362319
F1-Score: 0.725925925925926


### MNIST Handwritten Digits Dataset

In [125]:
# Importamos el Dataset
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Redimensionamos los datos para que DecisionTree los acepte
X_train = X_train.reshape(X_train.shape[0], -1) / 255.0
X_test = X_test.reshape(X_test.shape[0], -1) / 255.0

#### DecisionTree

In [128]:
# Crear el modelo
model = DecisionTreeClassifier(max_depth=15, min_samples_leaf=5, 
                               random_state=42)

# Entrenar el modelo
model.fit(X_train, y_train)

# Predicciones con el conjunto de pruebas
y_pred = model.predict(X_test)

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Precision
print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")

# Recall
print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")

# F1-Score
print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted')}")

Accuracy: 0.8813
Precision: 0.8810354798739223
Recall: 0.8813
F1-Score: 0.8810517484718479


### Spam Vs. Ham Email Dataset

In [146]:
# Importar Dataset
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "meruvulikith/190k-spam-ham-email-dataset-for-classification",
  "spam_Emails_data.csv",
  pandas_kwargs={
      'encoding': 'latin-1',
      'engine': 'python',
      'on_bad_lines': 'skip'
  }
)

# Imprimimos el Dataset
print(df)

       PK- fhnXdî<Ùÿÿÿÿÿÿÿÿ  spam_Emails_data.csv  ;ËQ    ðÈ¬    ì½ËìH¸¯ S}»E²ngVufuçffÉ=¿À »[8 ÃÅÃ=<³âpÈ¦pÑáÌó©üùÑsTÍ ¸{Ä{ãfKsX]áîÁ`¦o=ªÖ¸Ò7ßOþizø½k¿ÿîøn?8ù¿Ø5bb:ø"v~?ÜT]7S  \
0       ÞµÅÇ~uØ_ð;w¢â¹¨ý.t¾{µöÅØÇN.)]SFWÔam£...                                                                                                                                                               
1                ÁïÝä¹¾òEé.ø¢Oëö¾É¶wãNòmª¹ÁÅòïê¢                                                                                                                                                               
2       cåÆa×åQC]áù9tû×¸¡õ]p]QÍSãñZ~Ã[°|[...                                                                                                                                                               
3       UvèäoáWáG¹j.äsÜÉ5ho*¹-/XÍ¦¾èòu·ìêã0...                                                                                 

#### SVC

#### NaiveBayes