In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, classification_report


In [2]:
# Load seaborn's 'penguins' example dataset and preview its first four rows.
df = sns.load_dataset('penguins')
df.head(4)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,


In [3]:
# Summarise the object-typed columns: count, number of unique values, mode and its frequency.
df.describe(include=['object'])

Unnamed: 0,species,island,sex
count,344,344,333
unique,3,3,2
top,Adelie,Biscoe,Male
freq,152,168,168


In [5]:
# Drop the Chinstrap rows and renumber the remaining rows from 0,
# discarding the old index instead of keeping it as a column.
df = df.query('species != "Chinstrap"').reset_index(drop=True)

In [7]:
# Show column dtypes and non-null counts to see which columns still contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276 entries, 0 to 275
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            276 non-null    object 
 1   island             276 non-null    object 
 2   bill_length_mm     274 non-null    float64
 3   bill_depth_mm      274 non-null    float64
 4   flipper_length_mm  274 non-null    float64
 5   body_mass_g        274 non-null    float64
 6   sex                265 non-null    object 
dtypes: float64(4), object(3)
memory usage: 15.2+ KB


In [8]:
# Discard every row that has at least one missing value, then show the remaining shape.
df = df.dropna()
df.shape

(265, 7)

In [9]:
# Replace the species names with integer class labels.
dict_class = {'Adelie': 0, 'Gentoo': 1}
y = df['species'].map(dict_class)

# Predictors: everything except the target, with the non-numeric columns one-hot encoded.
X = pd.get_dummies(df.drop(columns='species'))

# Stratified hold-out split (20% test) with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Rescale the features with min-max scaling; the scaler is fitted on the
# training split only and then applied to both splits.
mm = MinMaxScaler().fit(X_train)
X_train_std = mm.transform(X_train)
X_test_std = mm.transform(X_test)

In [10]:
# Fit a k-nearest-neighbours classifier with default hyperparameters on the
# scaled training data; `fit` returns the estimator itself, which is then displayed.
clf = KNeighborsClassifier().fit(X_train_std, y_train)
clf

KNeighborsClassifier()

In [11]:
# Evaluate the fitted classifier on both the training and the test split.
y_train_proba = clf.predict_proba(X_train_std)
y_proba = clf.predict_proba(X_test_std)

y_train_pred = clf.predict(X_train_std)
y_pred = clf.predict(X_test_std)

# Column 1 of predict_proba is used as the score for the class encoded as 1.
print(f'ROC-AUC train = {roc_auc_score(y_train, y_train_proba[:,1])}')
print(f'ROC-AUC test = {roc_auc_score(y_test, y_proba[:,1])}')

# Derive the display names ordered by their integer label and pass the labels
# explicitly, so each report row is tied to the right class instead of relying
# on the dict's key order happening to match the sorted label order.
class_names = sorted(dict_class, key=dict_class.get)
class_labels = [dict_class[name] for name in class_names]
print(classification_report(y_test,
                            y_pred,
                            labels=class_labels,
                            target_names=class_names))

ROC-AUC train = 1.0
ROC-AUC test = 1.0
              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        29
      Gentoo       1.00      1.00      1.00        24

    accuracy                           1.00        53
   macro avg       1.00      1.00      1.00        53
weighted avg       1.00      1.00      1.00        53

