In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Загрузка данных
data = pd.read_csv('../datasets/smoking_driking_dataset_Ver01.csv')

In [2]:
data['sex'] = data['sex'].apply(lambda x: 0 if x == 'female' else 1)
data['DRK_YN'] = data['DRK_YN'].apply(lambda x: 0 if x == 'N' else 1)
print(data)

        sex  age  height  weight  waistline  sight_left  sight_right  \
0         1   35     170      75       90.0         1.0          1.0   
1         1   30     180      80       89.0         0.9          1.2   
2         1   40     165      75       91.0         1.2          1.5   
3         1   50     175      80       91.0         1.5          1.2   
4         1   50     165      60       80.0         1.0          1.2   
...     ...  ...     ...     ...        ...         ...          ...   
991341    1   45     175      80       92.1         1.5          1.5   
991342    1   35     170      75       86.0         1.0          1.5   
991343    1   40     155      50       68.0         1.0          0.7   
991344    1   25     175      60       72.0         1.5          1.0   
991345    1   50     160      70       90.5         1.0          1.5   

        hear_left  hear_right    SBP  ...  LDL_chole  triglyceride  \
0             1.0         1.0  120.0  ...      126.0          92.

In [3]:
data = data.sample(n=1000, random_state=42)
print(data)

        sex  age  height  weight  waistline  sight_left  sight_right  \
676014    1   45     160      60       85.0         1.2          1.0   
958905    1   30     175      70       87.0         0.9          0.9   
472540    1   35     180      85       93.0         1.2          1.5   
244194    1   35     175      80       89.0         1.5          1.5   
88447     1   40     160      70       97.0         1.5          1.0   
...     ...  ...     ...     ...        ...         ...          ...   
503986    1   70     160      75       99.0         0.4          0.2   
123028    1   25     180      65       80.0         1.5          1.5   
808421    1   35     175      85       97.0         0.9          1.0   
131018    1   55     150      45       71.0         0.8          0.9   
475394    1   50     170      70       91.0         1.2          1.2   

        hear_left  hear_right    SBP  ...  LDL_chole  triglyceride  \
676014        1.0         1.0  113.0  ...      100.0          64.

In [4]:
X = data.drop('hemoglobin', axis=1)
y = data['DRK_YN']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
print(y_train)

0      1
1      0
2      1
3      1
4      0
      ..
795    0
796    0
797    1
798    1
799    1
Name: DRK_YN, Length: 800, dtype: int64


In [7]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
def knn_predict(X_train, y_train, X_test, k=3):
    predictions = []
    for x in X_test:
        distances = [np.linalg.norm(x - x_train) for x_train in X_train]
        k_indices = np.argsort(distances)[:k]
        k_nearest_labels = [y_train[i] for i in k_indices]
        most_common = np.bincount(k_nearest_labels).argmax()
        predictions.append(most_common)
    return predictions

In [9]:
k_value = 3
y_pred = knn_predict(X_train, y_train, X_test, k=k_value)

In [10]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Точность модели (k={k_value}): {accuracy:.2f}')

Точность модели (k=3): 0.97


In [11]:
train_class_distribution = y_train.value_counts()
test_class_distribution = y_test.value_counts()

print("Соотношение классов в обучающей выборке:")
print(train_class_distribution)

print("\nСоотношение классов в тестовой выборке:")
print(test_class_distribution)


Соотношение классов в обучающей выборке:
DRK_YN
1    408
0    392
Name: count, dtype: int64

Соотношение классов в тестовой выборке:
DRK_YN
0    104
1     96
Name: count, dtype: int64
