# ***Imports***

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, top_k_accuracy_score, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# ***Limpieza de datos***

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Load the raw dataset. The CSV path is a template placeholder to be filled in.
df = pd.read_csv("./.csv")
# Optional Excel source. `x` is passed as the header row index (rows above it are
# skipped); it is not defined in the visible code, so it must be set beforehand.
dfTempHist = pd.read_excel('./.xls', header=x)

# Report the number of missing values per column ...
countNaN = df.isna().sum()
print(countNaN)

# ... drop every row that contains at least one NaN (dropna's default behaviour) ...
df = df.dropna()

# ... and report again to confirm that no missing values remain.
countNaN = df.isna().sum()
print(countNaN)

# Keep only the rows whose 'colum' value is at least x (x is a placeholder threshold).
df = df[df['colum'] >= x]

# Keep a handle on the filtered frame, with all its columns, for the histogram cell.
df_Hist = df

# Non-numeric columns (colours, shapes, types, ...) are pulled out as numpy arrays
# and removed from df so they can be encoded separately. Id columns are not part of this.
non_numeric_cols = ["cap-shape", "gill-attachment", "gill-color", "stem-color"]
cap, gillA, gillC, stem = (df[name].to_numpy() for name in non_numeric_cols)
df = df.drop(columns=non_numeric_cols)

# One-hot encode every non-numeric column. fit_transform expects a 2-D input,
# hence the reshape to a single column. The same encoder object is refitted for
# each column in turn, so afterwards it only holds the categories of `stem`.
encoder = OneHotEncoder(sparse_output=False)
cap, gillA, gillC, stem = (
    encoder.fit_transform(values.reshape(-1, 1))
    for values in (cap, gillA, gillC, stem)
)

# Y is the output variable that defines the problem, as a numpy array.
Y = df["class"].to_numpy()

# Every remaining column is a feature; X is the numpy matrix of those columns.
X = df.drop(columns="class").to_numpy()

print(X.shape)
print(Y.shape)

# Append the one-hot encoded columns to the right of the numeric features.
X = np.concatenate((X, cap, gillA, gillC, stem), axis=1)
print(X.shape)

# Train/test split: test_size is the test fraction required by the assignment
# (80/20 here). The split is stratified on Y so both parts keep the class balance.
# The split is done BEFORE scaling so that the scaler never sees the test rows;
# fitting it on the whole dataset would leak test-set statistics into training.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

# Scaling X:
#     MinMaxScaler: maps each feature to 0..1; for another range use e.g.
#                   MinMaxScaler(feature_range=(2, 3)) to get 2..3.
#     StandardScaler: alternative for features with a wide range of values.
# The scaler learns min/max from the training rows only and is then applied to
# both parts, so test values may fall slightly outside 0..1.
scaler = MinMaxScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Whole dataset under the same training-fitted scaling; kept so that any code
# still referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

print(x_train.shape)
print(x_test.shape)

# ***Histograma***

In [None]:
import matplotlib.pyplot as plt

# Draw one histogram per numeric column of the filtered dataset.
hist_axes = df_Hist.hist(bins=20, figsize=(12, 10))

# DataFrame.hist leaves its figure as the current one; title it and tidy the layout.
hist_fig = plt.gcf()
hist_fig.suptitle('Distribución de todas las variables numéricas', y=1.02)
hist_fig.tight_layout()
plt.show()

# ***Perceptron***

In [None]:
from sklearn.neural_network import MLPClassifier
import time

# Multi-layer perceptron with L2 penalty alpha=0.01 and a fixed seed for
# reproducibility; verbose=1 prints the training progress.
model_p = MLPClassifier(alpha=0.01, max_iter=3000, verbose=1, random_state=13)

# Time only the call to fit(), as the Random Forest cell does, using
# perf_counter: it is monotonic, so the measured duration cannot be distorted
# by system clock adjustments the way time.time() can.
t_start_p = time.perf_counter()
model_p.fit(x_train, y_train)
t_p = time.perf_counter() - t_start_p  # training time in seconds

In [None]:
# Predict the test set and compute the mean accuracy of the perceptron on it.
y_pred_p = model_p.predict(x_test)
acc_p = model_p.score(x_test, y_test)

# Show true labels, predicted labels and accuracy, in that order.
for shown_p in (y_test, y_pred_p, acc_p):
    print(shown_p)

# ***Segundo Algoritmo EJ: Random Forest***

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, each limited to depth 30, trained on all CPU cores
# (n_jobs=-1) with a fixed seed; verbose=1 prints progress while fitting.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=30,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
    verbose=1,
)

# Time the training with perf_counter: it is monotonic, so the measured
# duration cannot be distorted by system clock adjustments like time.time().
t_init = time.perf_counter()
rf_model.fit(x_train, y_train)
t_rf = time.perf_counter() - t_init  # training time in seconds


In [None]:
# Predict the test set and compute the mean accuracy of the random forest on it.
y_pred_rf = rf_model.predict(x_test)
acc_rf = rf_model.score(x_test, y_test)

# Show true labels, predicted labels and accuracy, in that order.
for shown_rf in (y_test, y_pred_rf, acc_rf):
    print(shown_rf)

# ***Ayuda a Explicación***

In [None]:
from sklearn.metrics import confusion_matrix
# One summary line per model: test accuracy (4 decimals) and the training time
# measured in the training cells (2 decimals).
print(f"Perceptron: {acc_p:.4f} Time: {t_p:.2f}")
print(f"Random Forest: {acc_rf:.4f} Time: {t_rf:.2f}")

def matrix_view(y_test, y_pred, class_names=("0 (comestible)", "1 (venenoso)")):
    """Print the confusion matrix of a classifier as a labelled table.

    Rows are the actual classes and columns the predicted ones.

    Args:
        y_test: True labels of the evaluated samples.
        y_pred: Labels predicted for the same samples.
        class_names: One display name per class, in the order in which
            confusion_matrix sorts the class values. The default matches the
            binary mushroom problem; pass other names for a different problem.

    Raises:
        ValueError: If the number of names differs from the number of classes
            present in the confusion matrix.
    """
    cm = confusion_matrix(y_test, y_pred)

    # Fail with a clear message instead of pandas' generic shape error when the
    # labels do not fit the problem (e.g. more than two classes).
    if len(class_names) != cm.shape[0]:
        raise ValueError(
            f"class_names has {len(class_names)} entries but the confusion "
            f"matrix has {cm.shape[0]} classes"
        )

    cm_df = pd.DataFrame(
        cm,
        index=[f"Actual {name}" for name in class_names],
        columns=[f"Pred {name}" for name in class_names],
    )
    print(cm_df)

# Confusion matrix of the perceptron, then of the random forest, on the test set.
matrix_view(y_test, y_pred_p)
matrix_view(y_test, y_pred_rf)

# ***Según ChatGPT***

In [None]:
# Segun CHAT GPT!!!!!!!!
import pandas as pd
from ace_tools import display_dataframe_to_user

# Load the original dataset.
df = pd.read_csv('/mnt/data/mushroom.csv')

# Rename the columns for consistency (optional).
column_titles = {
    'cap-diameter': 'Cap Diameter',
    'cap-shape': 'Cap Shape',
    'gill-attachment': 'Gill Attachment',
    'gill-color': 'Gill Color',
    'stem-height': 'Stem Height',
    'stem-width': 'Stem Width',
    'stem-color': 'Stem Color',
    'season': 'Season',
}
df = df.rename(columns=column_titles)

# Show the first rows for a first inspection.
display_dataframe_to_user("Dataset Original - Primeras Filas", df.head())

# Report missing values per column.
print("Valores nulos por columna:\n", df.isnull().sum())

# Error clean-up: turn the '?' markers into missing values, then drop every
# row that has a missing value after that substitution.
df = df.replace('?', pd.NA).dropna()

# One-hot encode the categorical variables; drop_first removes the first
# dummy column of each variable.
categorical_cols = ['Cap Shape', 'Gill Attachment', 'Gill Color', 'Stem Color']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Make sure 'Season' and the target class have the expected types.
df_encoded = df_encoded.astype({'Season': float, 'class': int})

# Show the first rows of the cleaned, formatted dataset.
display_dataframe_to_user("Dataset Limpio y Formateado - Primeras Filas", df_encoded.head())

# Save the cleaned dataset for later use.
df_encoded.to_csv('/mnt/data/mushroom_clean.csv', index=False)

df_encoded.shape


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from ace_tools import display_dataframe_to_user

from sklearn.preprocessing import StandardScaler

# Load the cleaned dataset.
df = pd.read_csv('/mnt/data/mushroom_clean.csv')

# Separate the predictor variables from the target.
X = df.drop('class', axis=1)
y = df['class']

# Split into training (80%) and test (20%); stratified on the target so both
# parts keep the class balance, like the split in the first pipeline.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The MLP is sensitive to feature scale, so its inputs are standardised.
# The scaler is fitted on the training rows only to avoid leaking test data.
mlp_scaler = StandardScaler().fit(X_train)
X_train_scaled = mlp_scaler.transform(X_train)
X_test_scaled = mlp_scaler.transform(X_test)

# Train the multilayer perceptron on the scaled features.
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
mlp.fit(X_train_scaled, y_train)
y_pred_mlp = mlp.predict(X_test_scaled)
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)

# Train the random forest; trees do not depend on feature scale, so it keeps
# using the unscaled features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Show the results.
results = pd.DataFrame({
    'Modelo': ['Multilayer Perceptron', 'Random Forest'],
    'Accuracy': [accuracy_mlp, accuracy_rf]
})
display_dataframe_to_user("Comparación de Accuracy", results)
