In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('training_data.csv', index_col=0) 
df

Unnamed: 0,ID,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,78830894-cdd8-43d1-9655-03db74141b7a,Female,80.0,0,1,never,25.19,6.6,140,0
1,32262c0e-903a-46b8-9061-b1a2f5a3e9b2,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,c4868b73-ca90-48ed-af14-3d1f78a5f030,Male,28.0,0,0,never,27.32,5.7,158,0
3,bec2ef91-5aff-48df-ac16-cb210b5f29fa,Female,36.0,0,0,current,23.45,5.0,155,0
4,1a6852a8-ee80-4d93-bea0-f0cdd941dc3d,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...,...
73713,5c890971-a6d7-44ce-a8a1-3b38cda7ef3b,Female,70.0,1,0,never,55.57,6.2,130,1
73714,13a8f177-1623-4acf-9db8-1e8ac8577c86,Male,60.0,0,0,No Info,27.32,6.6,160,0
73715,c9539d1b-075f-481f-8ae4-ab9bba75af5b,Male,65.0,0,0,former,32.07,5.0,160,0
73716,08eb5a8d-f577-440a-9408-c6d99ca17204,Male,64.0,0,0,current,30.23,6.2,158,0


# Описание задания

Вам необходимо построить модель-классификатор по определению диабета. У вас нет никаких ограничений по инструментам, новым полям и способу энкодинга данных.

# О данных

Датасет представляет собой набор медицинских и демографических данных пациентов, а также их диагноз по диабету (положительный или отрицательный). 

Данные включают такие характеристики, как возраст, пол, индекс массы тела (BMI), гипертония, болезни сердца, история курения, уровень HbA1c и уровень глюкозы в крови. Этот набор данных можно использовать для создания моделей машинного обучения для прогнозирования диабета у пациентов на основе их истории болезни и демографической информации.

In [3]:
df.isnull().count().any()


np.True_

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73718 entries, 0 to 73717
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   73718 non-null  object 
 1   gender               73718 non-null  object 
 2   age                  73718 non-null  float64
 3   hypertension         73718 non-null  int64  
 4   heart_disease        73718 non-null  int64  
 5   smoking_history      73718 non-null  object 
 6   bmi                  73718 non-null  float64
 7   HbA1c_level          73718 non-null  float64
 8   blood_glucose_level  73718 non-null  int64  
 9   diabetes             73718 non-null  int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 6.2+ MB


In [5]:
df_train = df.copy()
df_train = df_train.drop(columns='ID', axis = 1)
df_train = pd.get_dummies(df_train, columns=['gender', 'smoking_history'], drop_first=True, dtype=int)

df_train['diabetes'].value_counts(normalize=True) * 100


diabetes
0    91.114789
1     8.885211
Name: proportion, dtype: float64

In [6]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

X = df_train.drop('diabetes', axis=1)
y = df_train['diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

smote = SMOTE(random_state=42)
X_train_m, y_train_m = smote.fit_resample(X_train, y_train)

### Через Логистическую регрессию

In [7]:
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()
X_train_scalar = scalar.fit_transform(X_train_m)
X_test_scalar = scalar.transform(X_test)



In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

model = LogisticRegression(max_iter = 1000)
model.fit(X_train_scalar, y_train_m)

y_pred = model.predict(X_test_scalar)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred)) 

Accuracy: 0.8891074335322843
              precision    recall  f1-score   support

           0       0.98      0.90      0.94     13434
           1       0.43      0.80      0.56      1310

    accuracy                           0.89     14744
   macro avg       0.71      0.85      0.75     14744
weighted avg       0.93      0.89      0.90     14744

[[12057  1377]
 [  258  1052]]


### Random_forest


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, RocCurveDisplay
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1)
param_grid = {
    'n_estimators': [100, 200, 300],      
    'max_depth': [None, 10, 20, 30],      
    'min_samples_split': [2, 5, 10],     
    'min_samples_leaf': [1, 2, 4],        
    'max_features': ['sqrt', 'log2']     
}

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,                  
    scoring='f1',       
    n_jobs=-1,           
    verbose=2               
)

grid_search.fit(X_train_m, y_train_m)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [None, 10, ...], 'max_features': ['sqrt', 'log2'], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...], ...}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,30
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [10]:
rf_model = grid_search.best_estimator_

y_pred2 = rf_model.predict(X_test)
y_prob_best = rf_model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2)) 
print("\nROC-AUC Score:", roc_auc_score(y_test, y_prob_best))
print("Лучшие параметры:", grid_search.best_params_)

Accuracy: 0.9603228431904504
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     13434
           1       0.79      0.75      0.77      1310

    accuracy                           0.96     14744
   macro avg       0.88      0.87      0.87     14744
weighted avg       0.96      0.96      0.96     14744

[[13172   262]
 [  323   987]]

ROC-AUC Score: 0.9711307585742908
Лучшие параметры: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


Добавим бустинг

In [11]:
from xgboost import XGBClassifier
import numpy as np

xgb_base = XGBClassifier(
    scale_pos_weight=10, 
    eval_metric='logloss',
    random_state=42,
    learning_rate=0.05,
    use_label_encoder=False
)

param_grid1 = {
    'n_estimators': [200, 400, 600],
    'max_depth': [4, 6, 8],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

grid_search1 = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid1,
    scoring='roc_auc',
    cv=3,             
    verbose=2,
    n_jobs=-1
)

grid_search1.fit(X_train, y_train)

Fitting 3 folds for each of 162 candidates, totalling 486 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_grid,"{'colsample_bytree': [0.8, 1.0], 'max_depth': [4, 6, ...], 'min_child_weight': [1, 3, ...], 'n_estimators': [200, 400, ...], ...}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [12]:
best_xgb = grid_search.best_estimator_
print("\n Лучшие параметры:", grid_search.best_params_)

y_pred_best = best_xgb.predict(X_test)
y_prob_best = best_xgb.predict_proba(X_test)[:, 1]

print("\n Результаты лучшей модели XGBoost:")
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_prob_best))


 Лучшие параметры: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

 Результаты лучшей модели XGBoost:
Accuracy: 0.9603228431904504
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     13434
           1       0.79      0.75      0.77      1310

    accuracy                           0.96     14744
   macro avg       0.88      0.87      0.87     14744
weighted avg       0.96      0.96      0.96     14744

Confusion Matrix:
 [[13172   262]
 [  323   987]]

ROC-AUC Score: 0.9711307585742908


In [13]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

log_reg = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(
    n_estimators=300, max_depth=30, random_state=42,
    max_features='sqrt', min_samples_split=2, min_samples_leaf=1
)
xgb = XGBClassifier(
    n_estimators=400, learning_rate=0.05, max_depth=6, subsample=0.9,
    scale_pos_weight=10, eval_metric='logloss', random_state=42
)

voting_clf = VotingClassifier(
    estimators=[('lr', log_reg), ('rf', rf), ('xgb', xgb)],
    voting='soft'
)

voting_clf.fit(X_train, y_train)

y_pred_vote = voting_clf.predict(X_test)
y_prob_vote = voting_clf.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred_vote))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_vote))
print(classification_report(y_test, y_pred_vote))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_vote))


Accuracy: 0.9700895279435703
ROC-AUC: 0.9800180014933058
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     13434
           1       0.90      0.74      0.82      1310

    accuracy                           0.97     14744
   macro avg       0.94      0.87      0.90     14744
weighted avg       0.97      0.97      0.97     14744

Confusion matrix:
 [[13331   103]
 [  338   972]]


In [14]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, classification_report, confusion_matrix,
    roc_curve, precision_recall_curve, auc
)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

log_reg = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(
    n_estimators=300, max_depth=30, random_state=42,
    max_features='sqrt', min_samples_split=2, min_samples_leaf=1
)
xgb = XGBClassifier(
    n_estimators=400, learning_rate=0.05, max_depth=6,
    subsample=0.9, colsample_bytree=1.0, scale_pos_weight=10,
    eval_metric='logloss', random_state=42
)

meta_model = LogisticRegression(max_iter=1000, random_state=42)

stack_clf = StackingClassifier(
    estimators=[('lr', log_reg), ('rf', rf), ('xgb', xgb)],
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    passthrough=False
)

stack_clf.fit(X_train, y_train)

y_prob_stack = stack_clf.predict_proba(X_test)[:, 1]
y_pred_stack = stack_clf.predict(X_test)

print("\n Результаты STACKING:")
print("Accuracy:", accuracy_score(y_test, y_pred_stack))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_stack))
print(classification_report(y_test, y_pred_stack))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_stack))



 Результаты STACKING:
Accuracy: 0.9709712425393381
ROC-AUC: 0.9805070193322856
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     13434
           1       0.91      0.74      0.82      1310

    accuracy                           0.97     14744
   macro avg       0.94      0.87      0.90     14744
weighted avg       0.97      0.97      0.97     14744

Confusion matrix:
 [[13342    92]
 [  336   974]]


In [15]:
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.metrics import precision_recall_curve, f1_score
import numpy as np

class ThresholdOptimizer(BaseEstimator, ClassifierMixin):
    def __init__(self, base_model, metric='f1'):
        self.base_model = base_model
        self.metric = metric
        self.best_threshold_ = 0.5

    def fit(self, X, y):
        self.model_ = clone(self.base_model)
        self.model_.fit(X, y)
        
        y_prob = self.model_.predict_proba(X)[:, 1]
        prec, rec, thr = precision_recall_curve(y, y_prob)
        f1_scores = 2 * (prec * rec) / (prec + rec)
        best_idx = np.argmax(f1_scores)
        
        self.best_threshold_ = thr[best_idx]
        print(f"P-value: {self.best_threshold_:.2f}")
        print(f"Precision={prec[best_idx]:.3f}, Recall={rec[best_idx]:.3f}, F1={f1_scores[best_idx]:.3f}")
        return self

    def predict(self, X):
        y_prob = self.model_.predict_proba(X)[:, 1]
        return (y_prob >= self.best_threshold_).astype(int)

    def predict_proba(self, X):
        return self.model_.predict_proba(X)


In [16]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

log_reg = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=300, max_depth=30, random_state=42)
xgb = XGBClassifier(n_estimators=400, learning_rate=0.05, max_depth=6, subsample=0.9, scale_pos_weight=10, eval_metric='logloss', random_state=42)

stack_clf = StackingClassifier(
    estimators=[('lr', log_reg), ('rf', rf), ('xgb', xgb)],
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    cv=5,
    n_jobs=-1
)

smart_model = ThresholdOptimizer(stack_clf)
smart_model.fit(X_train, y_train)

y_pred = smart_model.predict(X_test)
y_prob = smart_model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print(classification_report(y_test, y_pred))


P-value: 0.37
Precision=0.974, Recall=0.969, F1=0.971
Accuracy: 0.9692078133478025
ROC-AUC: 0.9805070193322856
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     13434
           1       0.85      0.79      0.82      1310

    accuracy                           0.97     14744
   macro avg       0.92      0.89      0.90     14744
weighted avg       0.97      0.97      0.97     14744



Precision=0.974, Recall=0.969, F1=0.971
Accuracy: 0.9692078133478025
ROC-AUC: 0.9805070193322856
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     13434
           1       0.85      0.79      0.82      1310

    accuracy                           0.97     14744
   macro avg       0.92      0.89      0.90     14744
weighted avg       0.97      0.97      0.97     14744

# Submission задания

Вам будет предоставлен второй датасет, без целевой переменной (таргет - diabetes)
Этот датасет необходимо будет проскорить и сабмитнуть в Google Classroom в формате .csv, с 2 стобцами: `ID` и `prediction`

Поле `prediction` должно быть предсказанием **<span style = 'color : green'> класса </span>** (`predict`), т.е. 1 или 0 а **<span style = 'color : red'>не вероятности </span>** (`predict_proba`)

In [17]:
df_test = pd.read_csv('test_data.csv', index_col=0)

In [18]:
df_test.gender.unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [19]:
df_result = pd.DataFrame()
df_result['ID'] = df_test['ID']

df_result

Unnamed: 0,ID
0,ba0d6256-ca6e-46a7-a2fb-2518a8937fd7
1,7102b1ab-c7a4-4a43-b1fc-cb588d0553b8
2,51ba207e-4934-41b4-a31c-cd275c5ca555
3,7e17faaa-0402-4728-9719-83982eba3afd
4,08b4e515-286e-4393-9a55-138962f9546e
...,...
26141,1f8db6fe-03f8-48ca-9a42-9f12062b002d
26142,5bf3ca1a-ced5-474b-a354-cc3bec9f16d3
26143,930e6345-a920-47dd-abd0-6bc0be2ae56f
26144,035f3cc4-e8af-448a-8172-a1b027e06de1


In [20]:
df_test = df_test.drop(columns='ID', axis=1)
df_test = pd.get_dummies(df_test, columns=['gender', 'smoking_history'], drop_first=True, dtype=int)

df_test

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,60.0,1,0,23.77,4.8,130,0,0,0,0,1,0,0
1,2.0,0,0,15.19,5.0,159,1,0,0,0,0,0,0
2,74.0,0,0,39.59,6.5,160,0,0,0,0,0,0,1
3,51.0,0,0,26.67,6.6,80,0,0,0,0,0,1,0
4,35.0,0,0,26.09,3.5,159,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26141,31.0,0,0,27.32,6.1,140,0,0,0,0,0,1,0
26142,65.0,1,1,27.25,5.7,100,0,0,0,0,0,1,0
26143,30.0,1,0,57.17,5.8,300,1,0,0,0,1,0,0
26144,20.0,0,0,32.70,6.2,159,1,0,0,0,0,1,0


In [21]:
y_pred_final = smart_model.predict(df_test)
y_pred_final

array([0, 0, 1, ..., 1, 0, 0], shape=(26146,))

In [22]:
df_result['prediction'] = y_pred_final
df_result

Unnamed: 0,ID,prediction
0,ba0d6256-ca6e-46a7-a2fb-2518a8937fd7,0
1,7102b1ab-c7a4-4a43-b1fc-cb588d0553b8,0
2,51ba207e-4934-41b4-a31c-cd275c5ca555,1
3,7e17faaa-0402-4728-9719-83982eba3afd,0
4,08b4e515-286e-4393-9a55-138962f9546e,0
...,...,...
26141,1f8db6fe-03f8-48ca-9a42-9f12062b002d,0
26142,5bf3ca1a-ced5-474b-a354-cc3bec9f16d3,0
26143,930e6345-a920-47dd-abd0-6bc0be2ae56f,1
26144,035f3cc4-e8af-448a-8172-a1b027e06de1,0


In [23]:
df_result.to_csv('prediction.csv', index=False)