In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# 1. Load the dataset
data = pd.read_csv('penguins.csv')

# 2. Report how many values are missing in each column
missing_per_column = data.isna().sum()
print(missing_per_column)

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64


In [None]:
# Drop rows that have missing values in 2 or more columns.
# `thresh` is the minimum number of non-NA values a row must contain to be
# kept, so allowing at most one gap means requiring len(columns) - 1 values.
# (With `- 2` a row missing `sex` AND one feature would survive and later
# reach the KNN model with a NaN feature.)
data = data.dropna(thresh=len(data.columns) - 1)

# Split the data into two parts: rows whose `sex` is unknown (to be predicted
# later) and rows whose `sex` is known (used for training and evaluation).
# `.copy()` gives `data_with_nan` its own data, so assigning the predicted
# `sex` column to it later does not trigger SettingWithCopyWarning.
sex_is_missing = data['sex'].isnull()
data_with_nan = data[sex_is_missing].copy()
data = data[~sex_is_missing]

In [None]:
# 3. Classification with KNN
# Separate the target column (`sex`) from the feature columns
y = data['sex']
X = data.drop(columns='sex')

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# One-hot encode the categorical (object-dtype) features.
# The encoder is fitted on the training part only and then reused for the test part.
encoder = OneHotEncoder(handle_unknown='ignore')

categorical_train = X_train.select_dtypes(include=['object'])
categorical_test = X_test.select_dtypes(include=['object'])

X_train_encoded = encoder.fit_transform(categorical_train).toarray()
X_test_encoded = encoder.transform(categorical_test).toarray()


In [None]:
# Standardize the non-categorical feature columns.
# The scaler is fitted on the training part only and then reused for the test part.
scaler = StandardScaler()

numeric_train = X_train.select_dtypes(exclude=['object'])
numeric_test = X_test.select_dtypes(exclude=['object'])

X_train_scaled = scaler.fit_transform(numeric_train)
X_test_scaled = scaler.transform(numeric_test)


In [None]:
# Combine the scaled and the one-hot encoded features into a single matrix
# (scaled columns first, encoded columns after them).
# numpy is imported as `np` in the import cell at the top of the notebook.
X_train_final = np.hstack((X_train_scaled, X_train_encoded))
X_test_final = np.hstack((X_test_scaled, X_test_encoded))


In [None]:
# Hyperparameter search: 5-fold cross-validated grid search over the number
# of neighbours and the distance metric
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    metric=['euclidean', 'manhattan'],
)
knn = KNeighborsClassifier()
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
grid_search.fit(X_train_final, y_train)

In [None]:
# Take the model with the best hyperparameters.
# GridSearchCV is constructed without a `refit` argument, i.e. with its
# default refit=True, so `best_estimator_` has already been refitted on the
# whole training set with the best parameters; fitting it again on the same
# data would be redundant work.
best_knn = grid_search.best_estimator_
best_knn


In [None]:
# Evaluate the tuned model on the held-out test set
y_pred = best_knn.predict(X_test_final)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9104477611940298


In [None]:
# 4. Fill in the missing values in data_with_nan
# Keep only the feature columns of the rows whose `sex` is unknown
data_with_nan_cleaned = data_with_nan.drop(columns='sex')

In [None]:
# Encode and scale the unlabeled rows with the already-fitted encoder and
# scaler (transform only, no refitting), then join the two parts in the same
# order as for the training matrix: scaled columns first, encoded after them.
categorical_nan = data_with_nan_cleaned.select_dtypes(include=['object'])
numeric_nan = data_with_nan_cleaned.select_dtypes(exclude=['object'])

X_nan_scaled = scaler.transform(numeric_nan)
X_nan_encoded = encoder.transform(categorical_nan).toarray()
X_nan_final = np.concatenate((X_nan_scaled, X_nan_encoded), axis=1)


In [None]:
# Predict the missing `sex` labels for the unlabeled rows with the tuned KNN model
predicted_sex = best_knn.predict(X_nan_final)

In [None]:
# 5. Attach the predicted values to the rows that had `sex` missing.
# `data_with_nan` comes from a boolean-mask selection on `data`, so a plain
# column assignment on it can raise SettingWithCopyWarning. `assign` instead
# returns a new DataFrame in which the existing `sex` column is replaced
# (column order is unchanged).
data_with_nan = data_with_nan.assign(sex=predicted_sex)
print(data_with_nan[['species', 'island', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex']])


    species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
8    Adelie  Torgersen            34.1           18.1              193.0   
9    Adelie  Torgersen            42.0           20.2              190.0   
10   Adelie  Torgersen            37.8           17.1              186.0   
11   Adelie  Torgersen            37.8           17.3              180.0   
47   Adelie      Dream            37.5           18.9              179.0   
178  Gentoo     Biscoe            44.5           14.3              216.0   
218  Gentoo     Biscoe            46.2           14.4              214.0   
256  Gentoo     Biscoe            47.3           13.8              216.0   
268  Gentoo     Biscoe            44.5           15.7              217.0   

     body_mass_g     sex  
8         3475.0  female  
9         4250.0    male  
10        3300.0  female  
11        3700.0  female  
47        2975.0  female  
178       4100.0  female  
218       4650.0  female  
256       4725.0  femal