Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
class KNN:
    """Brute-force k-nearest-neighbours classifier with majority voting.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.
    metric : str, default 'euclidean'
        Distance function: 'euclidean', 'manhattan' or 'chebyshev'.

    The configuration is validated in ``predict`` rather than in ``__init__``
    so that ``k`` / ``metric`` may still be reassigned on an existing instance.
    """

    # Each reducer collapses an (n_train, n_values) array of absolute
    # coordinate differences into one distance per training sample.
    _DISTANCES = {
        'euclidean': lambda diff: np.sqrt(np.sum(diff ** 2, axis=1)),
        'manhattan': lambda diff: np.sum(diff, axis=1),
        'chebyshev': lambda diff: np.max(diff, axis=1),
    }

    def __init__(self, k=3, metric='euclidean'):
        self.k = k
        self.metric = metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Memorise the training samples ``X`` and their labels ``y``."""
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        """Return the majority label among the k nearest training samples.

        Parameters
        ----------
        X : array-like
            Query samples, one per entry along the first axis.

        Returns
        -------
        numpy.ndarray
            One predicted label per query sample. When several labels share
            the top vote count, the one whose nearest representative comes
            first in distance order wins.

        Raises
        ------
        ValueError
            If ``metric`` is not supported, ``k`` is smaller than 1, or the
            model has not been fitted on at least one sample.
        """
        distance = self._DISTANCES.get(self.metric)
        if distance is None:
            # Previously an unknown metric left the distance list unassigned
            # and surfaced as an UnboundLocalError deep inside the loop.
            raise ValueError(
                f"Unknown metric {self.metric!r}; "
                f"expected one of {sorted(self._DISTANCES)}"
            )
        if self.k < 1:
            raise ValueError(f"k must be at least 1, got {self.k!r}")
        if self.X_train is None or len(self.X_train) == 0:
            raise ValueError("fit() must be called with a non-empty training set before predict()")

        X = np.array(X)
        n_train = len(self.X_train)
        preds = []
        for x in X:
            # Broadcast the query against every training sample at once; the
            # reshape keeps one row per training sample even for 1-D data.
            diff = np.abs(x - self.X_train).reshape(n_train, -1)
            k_idx = np.argsort(distance(diff))[:self.k]
            k_labels = self.y_train[k_idx]
            preds.append(Counter(k_labels).most_common(1)[0][0])
        return np.array(preds)

In [3]:
def evaluate(X, y, k_value, metric, test_size=0.3, random_state=42):
    """Train and test the KNN classifier on one stratified split of (X, y).

    Parameters
    ----------
    X, y : array-like
        Feature matrix and class labels.
    k_value : int
        Number of neighbours passed to ``KNN``.
    metric : str
        Distance metric name passed to ``KNN``.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``; a fixed value keeps the split
        identical across calls so different (k, metric) pairs are comparable.

    Returns
    -------
    tuple
        ``(y_test, y_pred)`` - true and predicted labels of the held-out part.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    # Fit the scaler on the training part only so that no statistics of the
    # test part leak into the model.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    knn = KNN(k=k_value, metric=metric)
    knn.fit(X_train_scaled, y_train)
    y_pred = knn.predict(X_test_scaled)
    return y_test, y_pred

In [4]:
def settings():
    """Run the KNN experiment grid and collect the results in a DataFrame.

    Every combination of dataset (Iris, Wine, Breast Cancer), distance metric
    (euclidean, manhattan, chebyshev) and k in 1..5 is evaluated with
    ``evaluate``.

    Returns
    -------
    pandas.DataFrame
        One row per combination, sorted by dataset, metric and k, holding the
        accuracy rounded to 4 decimals plus the split sizes, class count and
        feature count of the dataset.
    """
    iris_dt = load_iris()
    wine_dt = load_wine()
    cancer_dt = load_breast_cancer()
    result = {
        'iris': [iris_dt.data, iris_dt.target],
        'wine': [wine_dt.data, wine_dt.target],
        'cancer': [cancer_dt.data, cancer_dt.target]
    }
    metrics = ['euclidean', 'manhattan', 'chebyshev']
    all_results = []
    for dataset_name, (X, y) in result.items():
        # Dataset-level facts do not depend on metric or k.
        n_classes = len(np.unique(y))
        n_features = X.shape[1]
        for metric in metrics:
            for k_value in range(1, 6):
                y_test, y_pred = evaluate(X, y, k_value, metric)
                accuracy = accuracy_score(y_test, y_pred)
                # Take the sizes from the actual split: int(len(X) * 0.3)
                # under-counts by one whenever the share is not a whole number,
                # because train_test_split rounds the test part up
                # (e.g. 178 samples give 54 test rows, not 53).
                n_test = len(y_test)
                all_results.append({
                    'Датасет': dataset_name.capitalize(),
                    'Метрика': metric,
                    'k': k_value,
                    'Точность': round(accuracy, 4),
                    'Размер обучающей выборки': len(X) - n_test,
                    'Размер тестовой выборки': n_test,
                    'Количество классов': n_classes,
                    'Количество признаков': n_features
                })
    df_results = pd.DataFrame(all_results)
    df_results = df_results.sort_values(['Датасет', 'Метрика', 'k'])
    return df_results

In [6]:
def graph(df):
    """Plot accuracy against k, one subplot per dataset, one line per metric.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with the columns 'Датасет', 'Метрика', 'k' and
        'Точность'.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    datasets = df['Датасет'].unique()
    metrics = df['Метрика'].unique()
    # One column per dataset actually present, instead of a fixed three.
    n_cols = max(len(datasets), 1)
    fig = make_subplots(
        rows=1, cols=n_cols,
        subplot_titles=[f'Датасет: {dataset}' for dataset in datasets],
        # Keep the original 0.1 gap, shrinking it only when many columns
        # would make that gap too wide for the layout.
        horizontal_spacing=min(0.1, 1 / n_cols)
    )
    colors = {
        'euclidean': '#2E86AB',
        'manhattan': '#A23B72',
        'chebyshev': '#F18F01'
    }
    markers = {
        'euclidean': 'circle',
        'manhattan': 'square',
        'chebyshev': 'triangle-up'
    }
    for idx, dataset in enumerate(datasets, 1):
        dataset_data = df[df['Датасет'] == dataset]
        for metric in metrics:
            metric_data = dataset_data[dataset_data['Метрика'] == metric]
            # Metrics without a predefined style fall back to plotly defaults
            # instead of raising KeyError.
            color = colors.get(metric)
            fig.add_trace(
                go.Scatter(
                    x=metric_data['k'],
                    y=metric_data['Точность'],
                    mode='lines+markers+text',
                    name=metric,
                    legendgroup=metric,
                    line=dict(color=color, width=2.5),
                    marker=dict(
                        symbol=markers.get(metric),
                        size=10,
                        color=color,
                        line=dict(color='white', width=1)
                    ),
                    text=[f'{acc:.3f}' for acc in metric_data['Точность']],
                    textposition='top center',
                    textfont=dict(size=9, color=color),
                    # A single legend entry per metric: shown for the first
                    # subplot only, grouped with the others via legendgroup.
                    showlegend=(idx == 1)
                ),
                row=1, col=idx
            )
    # Label the axes so the figure can be read on its own.
    fig.update_xaxes(title_text='k')
    fig.update_yaxes(title_text='Точность', row=1, col=1)
    fig.show()


In [7]:
if __name__ == '__main__':
  # Run every dataset / metric / k combination and collect the results table.
  df = settings()
  # Draw accuracy against k for the collected results.
  graph(df)