In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from math import sqrt, ceil

# Módulos de ML
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, ConfusionMatrixDisplay
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
# Regresión logística
from sklearn.linear_model import LogisticRegression
# Árboles de decisión
from sklearn.tree import DecisionTreeClassifier
# K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier
# Máquinas de vectores de soporte (SVM)
from sklearn.svm import SVC
# Random Forest
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
from ucimlrepo import fetch_ucirepo

import numpy as np

# Download dataset id 73 from the UCI ML Repository (requires network access)
# and pull out the feature and target tables.
data = fetch_ucirepo(id=73)
X = data.data.features
y = data.data.targets

# Start from a copy of the feature table so that adding the label column
# below does not modify the frame held inside `data`. The column labels are
# taken from X itself.
df = X.copy()
# NOTE(review): assumes `targets` is a DataFrame with a single label column
# -- confirm against the ucimlrepo documentation.
df['target'] = y.iloc[:, 0]

# A bare `df` expression is only rendered when it is the last statement of a
# notebook cell; print explicitly so the preview is always shown.
print(df)

print(df.describe())

# Readable names for the two class codes stored in the 'target' column.
label_map = {'p': 'venenoso', 'e': 'comestible'}
df_labels = df['target'].map(label_map)

# Build a copy of the data in which each feature cell is blanked out with
# probability 0.05. The last column is excluded from the masking.
# A seeded Generator is used so the same cells are removed on every run.
df_missing = df.copy()
rng = np.random.default_rng(42)
feature_block = df_missing.iloc[:, :-1]
mask = rng.random(feature_block.shape) < 0.05
df_missing.iloc[:, :-1] = feature_block.mask(mask)

# Print the summaries explicitly; as bare expressions in the middle of a
# script their results would be computed and then discarded.
print(df_missing.describe())
print(df_missing.isnull().sum())

# Keep everything except the last 2754 rows and renumber the index from 0.
# NOTE(review): the reason for dropping these rows is not visible here --
# confirm the subset is intentional and does not skew the class balance.
df2 = df.iloc[:-2754].reset_index(drop=True)
print(df2.describe())

# Replace every column (features and target) with integer category codes.
# NOTE(review): the codes impose an arbitrary order on nominal categories;
# one-hot encoding may suit the linear model better -- confirm.
from sklearn.preprocessing import LabelEncoder as Le
for col in df2.columns:
    encoder = Le()
    df2[col] = encoder.fit_transform(df2[col])

features = df2.iloc[:, :-1]  # every column except the trailing 'target'
y = df2['target']

# Split BEFORE scaling. The partition depends only on `y` and `random_state`,
# so the same rows land in train/test as when splitting pre-scaled data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, y, stratify=y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, so that test-set means and
# variances do not leak into the preprocessing, then apply it to both parts.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled view of the whole subset (using the train-fitted scaler), kept for
# inspection and for any later code that reads `X_standard` / `df_X`.
X_standard = scaler.transform(features)
df_X = pd.DataFrame(X_standard, columns=features.columns, index=df2.index)

print(df_X.describe())
print(df2.head())

# Hyperparameter search for the Random Forest model

from sklearn.model_selection import GridSearchCV

# Base estimator with a fixed random_state.
rf = RandomForestClassifier(random_state=42)

# Candidate values to try: 3 * 3 * 2 * 2 = 36 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 4, 8],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# 5-fold cross-validated grid search scored by accuracy.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

# Report the winning combination and its mean cross-validation score.
print("Mejores hiperparámetros:", grid.best_params_)
print("Mejor score de validación:", grid.best_score_)

# Best hyperparameters for Logistic Regression

# Only the regularisation strength C varies; penalty and solver each have a
# single candidate value.
param_grid = {'C': [0.01, 0.1, 1, 10], 'penalty': ['l2'], 'solver': ['lbfgs']}
log_reg = LogisticRegression(max_iter=1000)

# 5-fold cross-validated grid search scored by accuracy.
grid_search = GridSearchCV(
    log_reg,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Estimator from the search with the best-scoring parameter set.
best_log_reg = grid_search.best_estimator_

print("Mejores hiperparámetros:", grid_search.best_params_)
print("Mejor precisión en validación cruzada:", grid_search.best_score_)

       cap-shape cap-surface cap-color bruises  odor gill-attachment  \
count       8124        8124      8124    8124  8124            8124   
unique         6           4        10       2     9               2   
top            x           y         n       f     n               f   
freq        3656        3244      2284    4748  3528            7914   

       gill-spacing gill-size gill-color stalk-shape  ...  \
count          8124      8124       8124        8124  ...   
unique            2         2         12           2  ...   
top               c         b          b           t  ...   
freq           6812      5612       1728        4608  ...   

       stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
count                    8124                   8124      8124       8124   
unique                      9                      9         1          4   
top                         w                      w         p          w   
freq                     