In [1]:
from sklearn.datasets import load_iris, fetch_openml
import numpy as np
from AMIA_part1 import QDA, LDA, TensorizedQDA
import pandas as pd
from sklearn.model_selection import train_test_split
import time
from AMIA_part1 import QDA, LDA, TensorizedQDA

In [2]:
# hiperparámetros
rng_seed = 8761

In [3]:
#obtenemos datos de IRIS:
def get_iris_dataset():
    data = load_iris()#CARGAMSO DATOS
    X_full = data.data
    y_full = np.array([data.target_names[y] for y in data.target.reshape(-1,1)])
    return X_full, y_full

In [4]:
#obtenemos datos de penguins:
def get_penguins():
    # get data
    #La función fetch_openml se utiliza para descargar conjuntos de datos de la biblioteca en línea OpenML, este es un repositorio de conjuntos de datos.
    df, tgt = fetch_openml(name="penguins", return_X_y=True, as_frame=True)

    # drop non-numeric columns
    df.drop(columns=["island","sex"], inplace=True)

    # drop rows with missing values
    #isna: devuelve una estructura de datos del mismo tamaño que la entrada, pero con valores booleanos que indican si cada elemento en la entrada es un valor faltante (True) o no (False).
    #con .sum(axis=1) hacemos la suma a lo largo de cada columna.
    ##==0, compara cada resultado de la suma con cero. Esto devuelve un DataFrame de booleanos (True o False) donde True indica que la suma de valores faltantes en la fila es igual a cero, es decir, la fila no tiene valores faltantes.
    mask = df.isna().sum(axis=1) == 0
    df = df[mask]
    tgt = tgt[mask]

    return df.values, tgt.to_numpy().reshape(-1,1)

In [5]:
#pedimos los datos de iris:
X_full, y_full = get_iris_dataset()

In [6]:
#dividimos los datos:
def split_transpose(X, y, test_sz, random_state):
    # split
    X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.3, random_state=rng_seed)

    # transpose so observations are column vectors
    return X_train.T, y_train.T, X_test.T, y_test.T

In [7]:
#función para calcular Accuracy:
def accuracy(y_true, y_pred):
  return (y_true == y_pred).mean()

In [8]:
#transponemos datos:
train_x, train_y, test_x, test_y = split_transpose(X_full, y_full, 0.3, rng_seed)

In [9]:
#CREAMOS DATA FRAME PARA ALMACENAR LOS DATOS:
df_results = pd.DataFrame(index=[], columns=["Modelo",	"Dataset",	"Seed",	"Error (train)",	"Error (test)"])

In [10]:
#distribuciones a priori uniforme:
priori_prob = np.array([1/3, 1/3, 1/3])

qda_uniform = QDA()

qda_uniform.fit(train_x, train_y, priori_prob)

train_acc = accuracy(train_y, qda_uniform.predict(train_x))
test_acc = accuracy(test_y, qda_uniform.predict(test_x))

In [11]:
#guardamos datos en el dataframe:
df_results.at[0, "Modelo"] = "QDAUniform"
df_results.at[0, "Dataset"] = "Iris"
df_results.at[0, "Seed"] = rng_seed
df_results.at[0, "Error (train)"] = 1-train_acc
df_results.at[0, "Error (test)"] = 1-test_acc


In [12]:
#Una clase con probabilidad 0.9, las demás 0.05 (probar las 3 combinaciones)
#primer combinacion:
priori_prob = np.array([0.9, 0.05, 0.05])

qda_differentprob = QDA()

qda_differentprob.fit(train_x, train_y, priori_prob)

train_acc = accuracy(train_y, qda_differentprob.predict(train_x))
test_acc = accuracy(test_y, qda_differentprob.predict(test_x))


df_results.at[1, "Modelo"] = "QDA with [0.9, 0.05, 0.05]"
df_results.at[1, "Dataset"] = "Iris"
df_results.at[1, "Seed"] = rng_seed
df_results.at[1, "Error (train)"] = 1-train_acc
df_results.at[1, "Error (test)"] = 1-test_acc

#segunda combinacion:
priori_prob = np.array([0.05, 0.9, 0.05])

qda_differentprob = QDA()

qda_differentprob.fit(train_x, train_y, priori_prob)

train_acc = accuracy(train_y, qda_differentprob.predict(train_x))
test_acc = accuracy(test_y, qda_differentprob.predict(test_x))


df_results.at[2, "Modelo"] = "QDA with [0.05, 0.9, 0.05]"
df_results.at[2, "Dataset"] = "Iris"
df_results.at[2, "Seed"] = rng_seed
df_results.at[2, "Error (train)"] = 1-train_acc
df_results.at[2, "Error (test)"] = 1-test_acc

#tercera combinacion:
priori_prob = np.array([0.05, 0.05, 0.9])

qda_differentprob = QDA()

qda_differentprob.fit(train_x, train_y, priori_prob)

train_acc = accuracy(train_y, qda_differentprob.predict(train_x))
test_acc = accuracy(test_y, qda_differentprob.predict(test_x))


df_results.at[3, "Modelo"] = "QDA with [0.05, 0.05, 0.9]"
df_results.at[3, "Dataset"] = "Iris"
df_results.at[3, "Seed"] = rng_seed
df_results.at[3, "Error (train)"] = 1-train_acc
df_results.at[3, "Error (test)"] = 1-test_acc




In [13]:
#LO MISMO PERO PARA PENGUINS:
#pedimos los datos a penguins:
X_penguins_full, y_penguins_full = get_penguins()

  warn(


In [14]:
#transponemos datos:
penguins_train_x, penguins_train_y, penguins_test_x, penguins_test_y = split_transpose(X_full, y_full, 0.4, rng_seed)

In [15]:
#caso uniforme:
priori_prob = np.array([1/3, 1/3, 1/3])

qda_uniform_penguins = QDA()

qda_uniform_penguins.fit(penguins_train_x, penguins_train_y, priori_prob)

penguins_train_acc = accuracy(penguins_train_y, qda_uniform.predict(penguins_train_x))
penguins_test_acc = accuracy(penguins_test_y, qda_uniform.predict(penguins_test_x))

df_results.at[4, "Modelo"] = "QDAUniform"
df_results.at[4, "Dataset"] = "Penguins"
df_results.at[4, "Seed"] = rng_seed
df_results.at[4, "Error (train)"] = 1-penguins_train_acc
df_results.at[4, "Error (test)"] = 1-penguins_test_acc



In [16]:
#Una clase con probabilidad 0.9, las demás 0.05 (probar las 3 combinaciones)
#primer combinacion:
priori_prob = np.array([0.9, 0.05, 0.05])

qda_penguins = QDA()

qda_penguins.fit(penguins_train_x, penguins_train_y, priori_prob)

penguins_train_acc = accuracy(penguins_train_y, qda_penguins.predict(penguins_train_x))
penguins_test_acc = accuracy(penguins_test_y, qda_penguins.predict(penguins_test_x))

df_results.at[5, "Modelo"] = "QDA with [0.9, 0.05, 0.05]"
df_results.at[5, "Dataset"] = "Penguins"
df_results.at[5, "Seed"] = rng_seed
df_results.at[5, "Error (train)"] = 1-penguins_train_acc
df_results.at[5, "Error (test)"] = 1-penguins_test_acc

#segunda combinacion:
priori_prob = np.array([0.05, 0.9, 0.05])

qda_penguins = QDA()

qda_penguins.fit(penguins_train_x, penguins_train_y, priori_prob)

penguins_train_acc = accuracy(penguins_train_y, qda_penguins.predict(penguins_train_x))
penguins_test_acc = accuracy(penguins_test_y, qda_penguins.predict(penguins_test_x))

df_results.at[6, "Modelo"] = "QDA with [0.05, 0.9, 0.05]"
df_results.at[6, "Dataset"] = "Penguins"
df_results.at[6, "Seed"] = rng_seed
df_results.at[6, "Error (train)"] = 1-penguins_train_acc
df_results.at[6, "Error (test)"] = 1-penguins_test_acc

#tercera combinacion:
priori_prob = np.array([0.05, 0.05, 0.9])

qda_penguins = QDA()

qda_penguins.fit(penguins_train_x, penguins_train_y, priori_prob)

penguins_train_acc = accuracy(penguins_train_y, qda_penguins.predict(penguins_train_x))
penguins_test_acc = accuracy(penguins_test_y, qda_penguins.predict(penguins_test_x))

df_results.at[7, "Modelo"] = "QDA with [0.05, 0.05, 0.9]"
df_results.at[7, "Dataset"] = "Penguins"
df_results.at[7, "Seed"] = rng_seed
df_results.at[7, "Error (train)"] = 1-penguins_train_acc
df_results.at[7, "Error (test)"] = 1-penguins_test_acc



In [17]:
#Todo lo anterior pero con LDA:

#caso uniforme
priori_prob = np.array([1/3, 1/3, 1/3])

lda_uniform = LDA()

lda_uniform.fit(train_x, train_y, priori_prob)

lda_train_acc = accuracy(train_y, lda_uniform.predict(train_x))
lda_test_acc = accuracy(test_y, lda_uniform.predict(test_x))

df_results.at[8, "Modelo"] = "LDAUniform"
df_results.at[8, "Dataset"] = "Iris"
df_results.at[8, "Seed"] = rng_seed
df_results.at[8, "Error (train)"] = 1-lda_train_acc
df_results.at[8, "Error (test)"] = 1-lda_test_acc




In [18]:
#Una clase con probabilidad 0.9, las demás 0.05 (probar las 3 combinaciones)
#primer combinacion:
priori_prob = np.array([0.9, 0.05, 0.05])

lda_differentprob = LDA()

lda_differentprob.fit(train_x, train_y, priori_prob)

lda_train_acc = accuracy(train_y, lda_differentprob.predict(train_x))
lda_test_acc = accuracy(test_y, lda_differentprob.predict(test_x))

df_results.at[9, "Modelo"] = "LDA with [0.9, 0.05, 0.05]"
df_results.at[9, "Dataset"] = "Iris"
df_results.at[9, "Seed"] = rng_seed
df_results.at[9, "Error (train)"] = 1-lda_train_acc
df_results.at[9, "Error (test)"] = 1-lda_test_acc

#segunda combinacion:
priori_prob = np.array([0.05, 0.9, 0.05])

lda_differentprob = LDA()

lda_differentprob.fit(train_x, train_y, priori_prob)

lda_train_acc = accuracy(train_y, lda_differentprob.predict(train_x))
lda_test_acc = accuracy(test_y, lda_differentprob.predict(test_x))

df_results.at[10, "Modelo"] = "LDA with [0.05, 0.9, 0.05]"
df_results.at[10, "Dataset"] = "Iris"
df_results.at[10, "Seed"] = rng_seed
df_results.at[10, "Error (train)"] = 1-lda_train_acc
df_results.at[10, "Error (test)"] = 1-lda_test_acc


#tercera combinacion:
priori_prob = np.array([0.05, 0.05, 0.9])

lda_differentprob = LDA()

lda_differentprob.fit(train_x, train_y, priori_prob)

lda_train_acc = accuracy(train_y, lda_differentprob.predict(train_x))
lda_test_acc = accuracy(test_y, lda_differentprob.predict(test_x))

df_results.at[11, "Modelo"] = "LDA with [0.05, 0.05, 0.9]"
df_results.at[11, "Dataset"] = "Iris"
df_results.at[11, "Seed"] = rng_seed
df_results.at[11, "Error (train)"] = 1-lda_train_acc
df_results.at[11, "Error (test)"] = 1-lda_test_acc




In [19]:
#LDA PARA PINGUINS:
lda_uniform_penguins = LDA()

lda_uniform_penguins.fit(penguins_train_x, penguins_train_y)

penguins_train_acc = accuracy(penguins_train_y, qda_uniform.predict(penguins_train_x))
penguins_test_acc = accuracy(penguins_test_y, qda_uniform.predict(penguins_test_x))
print(f"Train (apparent) error is {1-penguins_train_acc:.4f} while test error is {1-penguins_test_acc:.4f}")

df_results.at[12, "Modelo"] = "LDAUniform"
df_results.at[12, "Dataset"] = "Penguins"
df_results.at[12, "Seed"] = rng_seed
df_results.at[12, "Error (train)"] = 1-penguins_train_acc
df_results.at[12, "Error (test)"] = 1-penguins_test_acc



Train (apparent) error is 0.0286 while test error is 0.0000


In [20]:
#Una clase con probabilidad 0.9, las demás 0.05 (probar las 3 combinaciones)
#primer combinacion:
priori_prob = np.array([0.9, 0.05, 0.05])

lda_differentprob_penguins = LDA()

lda_differentprob_penguins.fit(penguins_train_x, penguins_train_y, priori_prob)

penguins_train_acc = accuracy(penguins_train_y, lda_differentprob_penguins.predict(penguins_train_x))
penguins_test_acc = accuracy(penguins_test_y, lda_differentprob_penguins.predict(penguins_test_x))

df_results.at[13, "Modelo"] = "LDA with [0.9, 0.05, 0.05]"
df_results.at[13, "Dataset"] = "Penguins"
df_results.at[13, "Seed"] = rng_seed
df_results.at[13, "Error (train)"] = 1-penguins_train_acc
df_results.at[13, "Error (test)"] = 1-penguins_test_acc

#segunda combinacion:
priori_prob = np.array([0.05, 0.9, 0.05])

lda_differentprob_penguins = LDA()

lda_differentprob_penguins.fit(penguins_train_x, penguins_train_y, priori_prob)

penguins_train_acc = accuracy(penguins_train_y, lda_differentprob_penguins.predict(penguins_train_x))
penguins_test_acc = accuracy(penguins_test_y, lda_differentprob_penguins.predict(penguins_test_x))

df_results.at[14, "Modelo"] = "LDA with [0.05, 0.9, 0.05]"
df_results.at[14, "Dataset"] = "Penguins"
df_results.at[14, "Seed"] = rng_seed
df_results.at[14, "Error (train)"] = 1-penguins_train_acc
df_results.at[14, "Error (test)"] = 1-penguins_test_acc

#tercera combinacion:
priori_prob = np.array([0.05, 0.05, 0.9])

lda_differentprob_penguins = LDA()

lda_differentprob_penguins.fit(penguins_train_x, penguins_train_y, priori_prob)

penguins_train_acc = accuracy(penguins_train_y, lda_differentprob_penguins.predict(penguins_train_x))
penguins_test_acc = accuracy(penguins_test_y, lda_differentprob_penguins.predict(penguins_test_x))

df_results.at[15, "Modelo"] = "LDA with [0.05, 0.05, 0.9]"
df_results.at[15, "Dataset"] = "Penguins"
df_results.at[15, "Seed"] = rng_seed
df_results.at[15, "Error (train)"] = 1-penguins_train_acc
df_results.at[15, "Error (test)"] = 1-penguins_test_acc

print(df_results)

                        Modelo   Dataset  Seed Error (train) Error (test)
0                   QDAUniform      Iris  8761      0.028571          0.0
1   QDA with [0.9, 0.05, 0.05]      Iris  8761      0.028571          0.0
2   QDA with [0.05, 0.9, 0.05]      Iris  8761       0.07619     0.022222
3   QDA with [0.05, 0.05, 0.9]      Iris  8761      0.057143          0.0
4                   QDAUniform  Penguins  8761      0.028571          0.0
5   QDA with [0.9, 0.05, 0.05]  Penguins  8761      0.028571          0.0
6   QDA with [0.05, 0.9, 0.05]  Penguins  8761       0.07619     0.022222
7   QDA with [0.05, 0.05, 0.9]  Penguins  8761      0.057143          0.0
8                   LDAUniform      Iris  8761      0.028571          0.0
9   LDA with [0.9, 0.05, 0.05]      Iris  8761      0.685714     0.622222
10  LDA with [0.05, 0.9, 0.05]      Iris  8761      0.685714     0.622222
11  LDA with [0.05, 0.05, 0.9]      Iris  8761      0.628571     0.755556
12                  LDAUniform  Pengui

Cuando se asigna una probabilidad alta (0.9) a una clase específica y probabilidades bajas (0.05) a las otras dos clases, el modelo parece aprender bien esa clase, ya que el error de test es bajo. La clase con probabilidad alta es más fácil de aprender.

Cuando la probabilidad alta se asigna a una clase diferente ([0.05, 0.05, 0.9]), el error de test aumenta, el modelo tiene más dificultades con esa clase.