In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler


# Load seaborn's "penguins" example dataset and keep only the rows that
# have no missing values in any column.
df = sns.load_dataset("penguins").dropna()

# Preview of the first rows (only rendered when this is the last
# expression of a notebook cell).
df.head()


# Encode the three species names as integer class labels (the target).
species_map = {"Adelie": 0, "Chinstrap": 1, "Gentoo": 2}
df["species"] = df["species"].map(species_map)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
y_train = df_train["species"].values
y_test = df_test["species"].values

# Remove the target from the feature frames. Rebind the names instead of
# using drop(..., inplace=True): the split frames are derived from `df`,
# and mutating them in place can raise pandas' SettingWithCopyWarning.
df_train = df_train.drop(columns="species")
df_test = df_test.drop(columns="species")


# Feature columns, grouped by how they are preprocessed.
cat_cols = ["island", "sex"]
num_cols = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]

# DictVectorizer works on one dict per row: string-valued entries become
# one-hot "<column>=<value>" features, numeric entries pass through under
# their original column name.
feature_cols = cat_cols + num_cols
train_dict = df_train[feature_cols].to_dict(orient='records')
test_dict = df_test[feature_cols].to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse it for test.
dv = DictVectorizer(sparse=False)
X_train_cat_num = dv.fit_transform(train_dict)
X_test_cat_num = dv.transform(test_dict)

feature_names = dv.get_feature_names_out()

# Column positions of the numeric features (output name equals the input
# column name) versus everything else (the one-hot columns).
num_indexes = [pos for pos, name in enumerate(feature_names) if name in num_cols]
cat_indexes = [pos for pos, name in enumerate(feature_names) if name not in num_cols]

# Standardise the numeric columns only; the scaler statistics come from the
# training split and are applied unchanged to the test split.
X_train_num = X_train_cat_num[:, num_indexes]
X_test_num = X_test_cat_num[:, num_indexes]

scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

# Final design matrices: same column layout as the vectorizer output, with
# the numeric columns replaced by their scaled values and the one-hot
# columns left as they are.
X_train = X_train_cat_num.copy()
X_train[:, num_indexes] = X_train_num_scaled

X_test = X_test_cat_num.copy()
X_test[:, num_indexes] = X_test_num_scaled


def _fit_and_score(model):
    """Fit `model` on the training split; return (test predictions, test accuracy)."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions, accuracy_score(y_test, predictions)


# Train four classifiers on the same features and report test accuracy.

lr = LogisticRegression(solver='liblinear')
y_pred_lr, acc_lr = _fit_and_score(lr)
print("Accuracy logistic:", acc_lr)

svm_model = SVC()
y_pred_svm, acc_svm = _fit_and_score(svm_model)
print("Accuracy SVM:", acc_svm)

# NOTE(review): no random_state is passed here, so the fitted tree (and its
# accuracy) may differ between runs.
dt_model = DecisionTreeClassifier()
y_pred_dt, acc_dt = _fit_and_score(dt_model)
print("Accuracy DT:", acc_dt)

knn_model = KNeighborsClassifier(n_neighbors=5)
y_pred_knn, acc_knn = _fit_and_score(knn_model)
print("Accuracy KNN:", acc_knn)

# Persist every fitted classifier together with the DictVectorizer and
# StandardScaler it was trained with, as a (dv, scaler, model) tuple, so the
# same preprocessing can be replayed at prediction time.
# NOTE: pickle files must only ever be loaded from a trusted source.
models_dir = Path("../models")
# Create the output directory if it is missing; otherwise open() fails with
# FileNotFoundError on a fresh checkout.
models_dir.mkdir(parents=True, exist_ok=True)

fitted_models = {
    "lr_model.pck": lr,
    "svm_model.pck": svm_model,
    "dt_model.pck": dt_model,
    "knn_model.pck": knn_model,
}

for file_name, model in fitted_models.items():
    with open(models_dir / file_name, "wb") as f:
        pickle.dump((dv, scaler, model), f)

Accuracy logistic: 1.0
Accuracy SVM: 1.0
Accuracy DT: 0.9850746268656716
Accuracy KNN: 0.9850746268656716
