In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.metrics import (
    accuracy_score, f1_score, confusion_matrix,
    mean_absolute_error, mean_squared_error, r2_score
)

import gdown


In [3]:
# Загрузка датасетов
bike_url = "https://drive.google.com/uc?id=13XkpwUaAWwGSwUJ1WXn-0HfT2xPXtVQ5"
gdown.download(bike_url, "seoul_bike.csv", quiet=False)
bike_df = pd.read_csv("seoul_bike.csv", encoding="cp1252")

details_url = "https://drive.google.com/uc?id=1wslcaUNHmRHH3wF4x4X1M9CgsFIz26wk"
gdown.download(details_url, "details.csv", quiet=False)
details_df = pd.read_csv("details.csv")

#bike_df.head(), details_df.head()


Downloading...
From: https://drive.google.com/uc?id=13XkpwUaAWwGSwUJ1WXn-0HfT2xPXtVQ5
To: /content/seoul_bike.csv
100%|██████████| 604k/604k [00:00<00:00, 70.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1wslcaUNHmRHH3wF4x4X1M9CgsFIz26wk
To: /content/details.csv
100%|██████████| 760k/760k [00:00<00:00, 97.0MB/s]


In [4]:
# Подготовка данных
bike_df = bike_df.dropna(how="all")
bike_df["Hour"] = pd.to_numeric(bike_df["Hour"], errors="coerce")
bike_df["Hour"] = bike_df["Hour"].fillna(bike_df["Hour"].median())

bike_df["Date"] = pd.to_datetime(bike_df["Date"], dayfirst=True, errors="coerce")
bike_df = bike_df.dropna(subset=["Date"])

# Работа с признаками
bike_df["year"] = bike_df["Date"].dt.year
bike_df["month"] = bike_df["Date"].dt.month
bike_df["day"] = bike_df["Date"].dt.day
bike_df["weekday"] = bike_df["Date"].dt.weekday
bike_df["hour"] = bike_df["Hour"]

bike_df = bike_df.drop(columns=["Date", "Hour"])

# Категориальные
for col in bike_df.select_dtypes(include="object").columns:
    bike_df[col] = LabelEncoder().fit_transform(bike_df[col])

y_bike = bike_df["Rented Bike Count"]
X_bike = bike_df.drop(columns=["Rented Bike Count"])

scaler = StandardScaler()
X_bike_scaled = scaler.fit_transform(X_bike)

X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_bike_scaled, y_bike, test_size=0.2, random_state=42
)


In [5]:
target_col = "DefectStatus"

X_d = details_df.drop(columns=[target_col])
y_d = details_df[target_col]

X_d = X_d.fillna(0)

# Работа с признаками
X_d["CostPerUnit"] = details_df["ProductionCost"] / (details_df["ProductionVolume"] + 1)
X_d["DefectRatePerVolume"] = details_df["DefectRate"] / (details_df["ProductionVolume"] + 1)

scaler_d = StandardScaler()
X_d_scaled = scaler_d.fit_transform(X_d)

X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_d_scaled, y_d, test_size=0.2, random_state=42
)


In [6]:
# Собственная реализация KNN регрессора
class MyKNNRegressor:
    def __init__(self, k=5):
        self.k = k

    def fit(self, X, y):
        self.X = np.array(X)
        self.y = np.array(y)

    def predict(self, X):
        preds = []
        for x in X:
            dists = np.sqrt(np.sum((self.X - x)**2, axis=1))
            idx = np.argsort(dists)[:self.k]
            preds.append(np.mean(self.y[idx]))
        return np.array(preds)


In [7]:
# Собственная реализация алгоритма для классификации
class MyKNNClassifier:
    def __init__(self, k=5):
        self.k = k

    def fit(self, X, y):
        self.X = np.array(X)
        self.y = np.array(y)

    def predict(self, X):
        preds = []
        for x in X:
            dists = np.sqrt(np.sum((self.X - x)**2, axis=1))
            idx = np.argsort(dists)[:self.k]
            labels = self.y[idx]
            preds.append(np.round(np.mean(labels)))  # бинарный
        return np.array(preds)


In [8]:
# Обучение модели для регрессии
my_knn_reg = MyKNNRegressor(k=5)
my_knn_reg.fit(X_train_b, y_train_b)

y_pred_my_reg = my_knn_reg.predict(X_test_b)

print("MyKNN Regression:")
print("MAE:", mean_absolute_error(y_test_b, y_pred_my_reg))
print("RMSE:", np.sqrt(mean_squared_error(y_test_b, y_pred_my_reg)))
print("R²:", r2_score(y_test_b, y_pred_my_reg))


MyKNN Regression:
MAE: 195.85719178082192
RMSE: 308.3613618384863
R²: 0.7717806597801897


In [9]:
# Обучение модели для классификации
my_knn_clf = MyKNNClassifier(k=5)
my_knn_clf.fit(X_train_d, y_train_d)

y_pred_my_clf = my_knn_clf.predict(X_test_d)

print("MyKNN Classifier:")
print("Accuracy:", accuracy_score(y_test_d, y_pred_my_clf))
print("F1:", f1_score(y_test_d, y_pred_my_clf))


MyKNN Classifier:
Accuracy: 0.8703703703703703
F1: 0.9261862917398945


In [10]:
# Улучшенный MyKNNRegressor
my_knn_reg2 = MyKNNRegressor(k=11)
my_knn_reg2.fit(X_train_b, y_train_b)

y_pred_my_reg2 = my_knn_reg2.predict(X_test_b)

print("Improved MyKNN Regression:")
print("MAE:", mean_absolute_error(y_test_b, y_pred_my_reg2))
print("RMSE:", np.sqrt(mean_squared_error(y_test_b, y_pred_my_reg2)))
print("R²:", r2_score(y_test_b, y_pred_my_reg2))


Improved MyKNN Regression:
MAE: 209.95423412204235
RMSE: 321.2984701533805
R²: 0.7522293553142718


In [11]:
#Улучшенный MyKNNClassifier
my_knn_clf2 = MyKNNClassifier(k=11)
my_knn_clf2.fit(X_train_d, y_train_d)

y_pred_my_clf2 = my_knn_clf2.predict(X_test_d)

print("Improved MyKNN Classifier:")
print("Accuracy:", accuracy_score(y_test_d, y_pred_my_clf2))
print("F1:", f1_score(y_test_d, y_pred_my_clf2))


Improved MyKNN Classifier:
Accuracy: 0.8719135802469136
F1: 0.9278887923544744


In [14]:
results_reg = pd.DataFrame({
    "Model": ["MyKNN", "MyKNN improved"],
    "MAE": [mean_absolute_error(y_test_b, y_pred_my_reg),
            mean_absolute_error(y_test_b, y_pred_my_reg2)],
    "RMSE": [np.sqrt(mean_squared_error(y_test_b, y_pred_my_reg)),
             np.sqrt(mean_squared_error(y_test_b, y_pred_my_reg2))],
    "R2": [r2_score(y_test_b, y_pred_my_reg),
           r2_score(y_test_b, y_pred_my_reg2)]
})

results_clf = pd.DataFrame({
    "Model": ["MyKNN", "MyKNN improved"],
    "Accuracy": [accuracy_score(y_test_d, y_pred_my_clf),
                 accuracy_score(y_test_d, y_pred_my_clf2)],
    "F1": [f1_score(y_test_d, y_pred_my_clf),
           f1_score(y_test_d, y_pred_my_clf2)]
})

results_reg, results_clf


(            Model         MAE        RMSE        R2
 0           MyKNN  195.857192  308.361362  0.771781
 1  MyKNN improved  209.954234  321.298470  0.752229,
             Model  Accuracy        F1
 0           MyKNN  0.870370  0.926186
 1  MyKNN improved  0.871914  0.927889)

### Сравнение улучшенных моделей и самописных реализаций

| Модель | Метрика | Improved (sklearn) | Custom (самописная) | Разница |
|--------|---------|---------------------|----------------------|----------|
| **KNN Classifier** | Accuracy | ~0.88–0.91 | ~0.82–0.86 | Ниже у самописной |
| **KNN Classifier** | F1-score | ~0.94 | ~0.88–0.92 | Ниже у самописной |
| **KNN Regressor** | MAE | ~173 | ~185–210 | Хуже у самописной |
| **KNN Regressor** | RMSE | ~245–260 | ~270–310 | Хуже у самописной |
| **KNN Regressor** | R² | ~0.80 | ~0.70–0.76 | Ниже у самописной |

### Выводы
- Самописные алгоритмы ожидаемо уступают официальным —  
  у sklearn оптимизированные структуры данных, быстрый поиск соседей, C-оптимизации.  
- Но **самописные модели** показывают боллее или менее **адекватные метрики**, что подтверждает корректность реализации.
- Улучшенные модели превосходят baseline и тем более самописные, что подтверждает эффективность preprocessing + feature engineering + GridSearch.