In [40]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier



In [30]:
# Read the dataset into a DataFrame
data = pd.read_csv('diabetes.csv')

# Preview the first five rows to check the columns and typical values
preview = data.head(n=5)
print(preview)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [31]:
# Count how many rows belong to each value of the Outcome column
class_distribution = data.loc[:, 'Outcome'].value_counts()
print(class_distribution)

Outcome
0    500
1    268
Name: count, dtype: int64


In [32]:
# Separate the target column (y) from the feature columns (X)
y = data['Outcome']
X = data.drop(columns='Outcome')

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Fit a logistic regression on the unscaled features.
# max_iter=1000 raises the solver's iteration limit (presumably to avoid
# convergence warnings on unscaled data).
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out test set
y_pred = logreg.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.81      0.79      0.80        99
           1       0.64      0.67      0.65        55

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154



In [34]:
# Fit a support vector classifier with a linear kernel on the unscaled features
svm = SVC(kernel='linear').fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out test set
y_pred_svm = svm.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_svm))

              precision    recall  f1-score   support

           0       0.81      0.81      0.81        99
           1       0.65      0.65      0.65        55

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154



In [35]:
# Standardize the features: the scaler learns its statistics from the
# training split only and the same transformation is applied to the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Refit the existing logistic regression on the standardized features
# (this replaces the earlier fit held by `logreg`) and evaluate it on the
# standardized test set
logreg.fit(X_train_scaled, y_train)
y_pred_scaled = logreg.predict(X_test_scaled)
print(classification_report(y_true=y_test, y_pred=y_pred_scaled))

              precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154



In [36]:
# 5-fold cross-validation for logistic regression.
# The scaler sits inside a pipeline and the *unscaled* training data is passed
# in, so in every fold StandardScaler is fitted on that fold's training part
# only. Cross-validating directly on X_train_scaled would leak information:
# that array was standardized with statistics computed over the whole training
# set, including the rows later used as validation folds.
# cross_val_score works on clones of the estimator, so `logreg` itself is not
# refitted by this cell.
logreg_cv_pipeline = make_pipeline(StandardScaler(), logreg)
scores_logreg = cross_val_score(logreg_cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print("Average cross-validated accuracy (Logistic Regression):", scores_logreg.mean())

Average cross-validated accuracy (Logistic Regression): 0.7606290817006529


In [37]:
# Hyperparameter search for the SVM: kernel type and regularization strength C,
# scored by 5-fold cross-validated accuracy on the standardized training data.
# NOTE(review): X_train_scaled was standardized before the CV split, so the
# validation folds share scaling statistics with the training folds; wrapping
# the scaler and the model in a Pipeline would avoid this.
svm_param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10],
}
grid_search_svm = GridSearchCV(
    estimator=SVC(),
    param_grid=svm_param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search_svm.fit(X_train_scaled, y_train)

print("Best params for SVM:", grid_search_svm.best_params_)

# The same kind of search for logistic regression, over C only
logreg_param_grid = {'C': [0.1, 1, 10]}
grid_search_logreg = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=logreg_param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search_logreg.fit(X_train_scaled, y_train)

print("Best params for Logistic Regression:", grid_search_logreg.best_params_)

Best params for SVM: {'C': 1, 'kernel': 'rbf'}
Best params for Logistic Regression: {'C': 10}


In [38]:
# Search for the number of neighbours k (1 through 29) that gives the best
# 5-fold cross-validated accuracy on the standardized training data.
# NOTE(review): X_train_scaled was standardized before the CV split; a
# Pipeline with the scaler inside would keep each fold's scaling independent.
knn_param_grid = {'n_neighbors': range(1, 30)}
grid_search_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_param_grid,
    cv=5,
    scoring='accuracy',
).fit(X_train_scaled, y_train)

print("Best k for KNN:", grid_search_knn.best_params_)

Best k for KNN: {'n_neighbors': 11}


In [39]:
# Baseline decision tree with default settings (no max_depth limit),
# trained on the unscaled features
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

# Evaluate the baseline tree on the held-out test set
y_pred_tree = tree.predict(X_test)
print(classification_report(y_test, y_pred_tree))

# Tune max_depth over 1..19 using 5-fold cross-validated accuracy
tree_param_grid = {'max_depth': range(1, 20)}
grid_search_tree = GridSearchCV(DecisionTreeClassifier(random_state=42), tree_param_grid, cv=5, scoring='accuracy')
grid_search_tree.fit(X_train, y_train)

print("Best max_depth for Decision Tree:", grid_search_tree.best_params_)

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training set, so reuse that fitted estimator
# instead of building and fitting an identical tree a second time.
# With the fixed random_state the resulting model is the same.
optimal_tree = grid_search_tree.best_estimator_
y_pred_optimal_tree = optimal_tree.predict(X_test)
print(classification_report(y_test, y_pred_optimal_tree))

              precision    recall  f1-score   support

           0       0.83      0.76      0.79        99
           1       0.62      0.73      0.67        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154

Best max_depth for Decision Tree: {'max_depth': 3}
              precision    recall  f1-score   support

           0       0.80      0.84      0.82        99
           1       0.68      0.62      0.65        55

    accuracy                           0.76       154
   macro avg       0.74      0.73      0.73       154
weighted avg       0.76      0.76      0.76       154



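В среднем все модели, оценённые на тестовой выборке (логистическая регрессия, линейный SVM и дерево решений), справляются примерно одинаково: accuracy составляет 0.75–0.76.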