In [23]:
import seaborn as sns
import pandas as pd
import numpy as np
# Load the "penguins" example dataset through seaborn's dataset loader.
df = sns.load_dataset("penguins")
# Preview the first five rows, transposed so every column reads as a row.
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
species,Adelie,Adelie,Adelie,Adelie,Adelie
island,Torgersen,Torgersen,Torgersen,Torgersen,Torgersen
bill_length_mm,39.1,39.5,40.3,,36.7
bill_depth_mm,18.7,17.4,18.0,,19.3
flipper_length_mm,181.0,186.0,195.0,,193.0
body_mass_g,3750.0,3800.0,3250.0,,3450.0
sex,Male,Female,Female,,Female


## Borramos las filas con algún valor NaN

In [24]:
# Keep only the rows that have no missing value in any column.
# dropna() states that intent directly, and the explicit .copy() makes
# `limpio` an independent frame, so writing columns into it afterwards does
# not raise SettingWithCopyWarning.
limpio = df.dropna().copy()
limpio

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


## Pasamos las especies a número para que sean más fáciles de procesar

In [25]:
from sklearn.preprocessing import LabelEncoder

# Encode the species labels as integer codes.
encoder = LabelEncoder()
# Build the encoded column with assign() rather than
# `limpio.loc[:, 'species'] = ...`: assign() returns a new frame, so it never
# writes into a filtered slice (no SettingWithCopyWarning), and the new column
# takes the integer dtype of the encoder output instead of possibly staying
# an object column that merely holds integers.
limpio = limpio.assign(species=encoder.fit_transform(limpio['species']))

limpio

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,Torgersen,39.1,18.7,181.0,3750.0,Male
1,0,Torgersen,39.5,17.4,186.0,3800.0,Female
2,0,Torgersen,40.3,18.0,195.0,3250.0,Female
4,0,Torgersen,36.7,19.3,193.0,3450.0,Female
5,0,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
338,2,Biscoe,47.2,13.7,214.0,4925.0,Female
340,2,Biscoe,46.8,14.3,215.0,4850.0,Female
341,2,Biscoe,50.4,15.7,222.0,5750.0,Male
342,2,Biscoe,45.2,14.8,212.0,5200.0,Female


## Separamos los datos: 80% como datos de entrenamiento y el 20% restante como datos de prueba

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned rows for testing; random_state pins the split.
df_train_full, df_test = train_test_split(
    limpio,
    test_size=0.2,
    random_state=1,
)
# Training target: the species column as an integer NumPy array.
y_train = df_train_full['species'].values.astype(int)

## Estandarización

In [27]:
from sklearn.preprocessing import StandardScaler
# Numeric measurement columns used as model inputs.
numerical = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
sc = StandardScaler()
# Fit the scaler on the training rows only, then standardize those same rows.
train_numeric = df_train_full[numerical]
X_train_std = sc.fit(train_numeric).transform(train_numeric)

## Categorizar variables: información mutua (categóricas) y correlación (numéricas) con la especie

In [28]:
from sklearn.metrics import mutual_info_score

categorical = ['island', 'sex']


def calculate_mi(col):
    """Return the mutual information score between `col` and the training species column."""
    return mutual_info_score(col, df_train_full['species'])


# Score each categorical column against the target and rank them, highest first.
df_mi = (
    df_train_full[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='MI')
)
print("Categorical ")
print(df_mi)
print("\n Numerical ")
# Correlation of every numeric column with the species column.
print(df_train_full[numerical].corrwith(df_train_full['species']))

Categorical 
              MI
island  0.498816
sex     0.002896

 Numerical 
bill_length_mm       0.741803
bill_depth_mm       -0.742992
flipper_length_mm    0.863216
body_mass_g          0.770591
dtype: float64


In [29]:
# One dict per training row, restricted to the categorical and numeric columns.
feature_columns = categorical + numerical
train_dict = df_train_full[feature_columns].to_dict(orient='records')
# Show the first record as a quick sanity check.
train_dict[0]

{'island': 'Biscoe',
 'sex': 'Female',
 'bill_length_mm': 43.4,
 'bill_depth_mm': 14.4,
 'flipper_length_mm': 218.0,
 'body_mass_g': 4600.0}

## DictVectorizer

In [30]:
from sklearn.feature_extraction import DictVectorizer

# Fit the vectorizer on the training records; sparse=False requests dense
# array output from transform().
dv = DictVectorizer(sparse=False).fit(train_dict)
# Display the fitted vectorizer (fit() returns the estimator itself).
dv


In [10]:
# Turn the training records into a feature matrix.
# NOTE(review): in the cells visible here the models are fitted on
# X_train_std, not on X_train -- confirm whether this matrix is still needed.
X_train = dv.transform(train_dict)
# Inspect the first encoded row.
X_train[0, :]

array([1.44e+01, 4.34e+01, 4.60e+03, 2.18e+02, 1.00e+00, 0.00e+00,
       0.00e+00, 1.00e+00, 0.00e+00])

In [11]:
# Names of the columns produced by the vectorizer, in output order.
feature_names = dv.get_feature_names_out()
feature_names

array(['bill_depth_mm', 'bill_length_mm', 'body_mass_g',
       'flipper_length_mm', 'island=Biscoe', 'island=Dream',
       'island=Torgersen', 'sex=Female', 'sex=Male'], dtype=object)

## Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

# Logistic regression trained on the standardized numeric features.
# NOTE(review): the target has three classes (codes 0-2 in the table above);
# confirm that the liblinear solver is the intended choice for a multiclass problem.
lr = LogisticRegression(solver='liblinear')
lr.fit(X=X_train_std, y=y_train)

## SVM

In [17]:
from sklearn.svm import SVC

# Support vector classifier trained on the standardized numeric features.
# probability=True enables predict_proba; the probability estimates rely on
# random shuffling of the data, so random_state is pinned (same seed as the
# train/test split and the decision tree) to keep the fitted model
# reproducible across runs.
svm = SVC(probability=True, random_state=1)
svm.fit(X_train_std, y_train)

## Decision Tree

In [21]:
from sklearn.tree import DecisionTreeClassifier

# Gini-based decision tree, limited to depth 5, with a fixed seed.
tree_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    random_state=1,
)
tree_model.fit(X=X_train_std, y=y_train)

## KNN

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# 10-nearest-neighbours classifier using the Minkowski metric with p=2.
knn = KNeighborsClassifier(
    n_neighbors=10,
    p=2,
    metric='minkowski',
)
knn.fit(X=X_train_std, y=y_train)

## Serialización

In [22]:
import pickle
from pathlib import Path

# Directory that receives the serialized models (relative to the notebook).
MODELS_DIR = Path('../modelos')
# Create it when missing so that opening the files for writing cannot fail
# with FileNotFoundError on a fresh checkout.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Output file name -> fitted classifier.
fitted_models = {
    'lr.pck': lr,
    'svm.pck': svm,
    'tree_model.pck': tree_model,
    'knn.pck': knn,
}

for filename, model in fitted_models.items():
    with open(MODELS_DIR / filename, 'wb') as f:
        # Each file stores a (scaler, model) tuple so the fitted scaler is
        # saved together with the classifier that was trained on its output.
        pickle.dump((sc, model), f)