# 1. Preparar los datos

In [57]:
import pandas as pd
import seaborn as sns

# Load the penguins dataset, discard rows containing missing values and
# replace the species names with integer class labels (the target).
df = (
    sns.load_dataset("penguins")
    .dropna()
    .assign(
        species=lambda frame: frame['species'].map(
            {'Chinstrap': 0, 'Adelie': 1, 'Gentoo': 2}
        )
    )
)

# Target vector and feature matrix (every column except the target).
y = df['species']
X = df.drop(columns='species')

# 2. Separar en conjuntos de entrenamiento (80%) y pruebas (20%)

In [58]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed random_state so that it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# 3. Codificación one-hot (variables categóricas) y escalado estándar (variables numéricas)

In [59]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Column groups: categorical columns are one-hot encoded, numerical columns
# are standardised.
numerical = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
categorical = ['island', 'sex']

# DictVectorizer works on a list of {column: value} records.
X_train_dict = X_train[categorical].to_dict(orient='records')
X_test_dict = X_test[categorical].to_dict(orient='records')

# Both transformers are fitted on the training rows only and then applied
# unchanged to the test rows.
dv = DictVectorizer(sparse=False)
X_train_dv = dv.fit_transform(X_train_dict)
X_test_dv = dv.transform(X_test_dict)

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train[numerical])
X_test_std = sc.transform(X_test[numerical])

# Final design matrices: one-hot columns first, scaled numerical columns after.
# NOTE: X_train / X_test are rebound from DataFrames to NumPy arrays here, so
# the train/test split cell must be re-run before executing this cell again.
X_train, X_test = (
    np.hstack([X_train_dv, X_train_std]),
    np.hstack([X_test_dv, X_test_std]),
)

# La tarea no pide evaluar el modelo con los datos de prueba una vez entrenado; por lo tanto, no usaremos X_test (ni y_test).

# 4. Entrenar los modelos

# 4.1. Regresión logística

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest logistic regression: one binary classifier is fitted per class.
# The `multi_class='ovr'` argument of LogisticRegression is deprecated since
# scikit-learn 1.5 and scheduled for removal; wrapping the estimator in
# OneVsRestClassifier is the replacement recommended by scikit-learn and also
# works on older releases.
# NOTE: the per-class fitted estimators are now reachable through
# `lr.estimators_`; `fit`, `predict` and `predict_proba` are called as before.
lr = OneVsRestClassifier(
    LogisticRegression(C=100.0, random_state=1, solver='lbfgs')
)
lr.fit(X_train, y_train)



# 4.2. Máquinas de soporte vectorial (SVM)

In [61]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel. probability=True is
# requested so that class-probability estimates are available after fitting.
svm = SVC(
    C=1.0,
    kernel='linear',
    probability=True,
    random_state=1,
)
svm.fit(X_train, y_train)

# 4.3. Árboles de decisión

In [62]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree using Gini impurity as split criterion, limited to depth 4;
# random_state is fixed for reproducibility.
dt = DecisionTreeClassifier(
    max_depth=4,
    criterion='gini',
    random_state=1,
)
dt.fit(X_train, y_train)

# 4.4. K vecinos más próximos (KNN)

In [63]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier. The Minkowski metric with p=2 is the
# Euclidean distance.
knn = KNeighborsClassifier(
    n_neighbors=3,
    metric='minkowski',
    p=2,
)
knn.fit(X_train, y_train)

# 5. Serializar los modelos

In [64]:
import pickle
from pathlib import Path

# Each artefact bundles the fitted DictVectorizer (dv) and StandardScaler (sc)
# together with one model, so the same preprocessing can be applied to new
# input data when the model is loaded.
models_dir = Path('../models')

# Create the output directory if it is missing; otherwise open() below would
# raise FileNotFoundError on a fresh checkout.
models_dir.mkdir(parents=True, exist_ok=True)

# File stem -> fitted estimator. The files are written as <stem>.pck.
fitted_models = {'lr': lr, 'svm': svm, 'dt': dt, 'knn': knn}

for model_name, model in fitted_models.items():
    with open(models_dir / f'{model_name}.pck', 'wb') as f:
        pickle.dump((dv, sc, model), f)

# SECURITY NOTE: pickle files can execute arbitrary code when loaded; only
# unpickle these artefacts from a trusted location.

# Para poder aplicar la misma transformación a los datos de entrada, debemos serializar también StandardScaler (sc) y DictVectorizer (dv) junto con los modelos.