In [2]:
import pandas as pd

# Загружаем файл
file_path = 'glucose_dataset.csv'
data = pd.read_csv(file_path)

# Просмотрим первые несколько строк данных
data.head()

Unnamed: 0.1,Unnamed: 0,0:00,0:05,0:10,0:15,0:20,0:25,0:30,0:35,0:40,...,5:15,5:20,5:25,5:30,5:35,5:40,5:45,5:50,5:55,hypo
0,patient_0,9.66,9.44,9.33,9.21,9.21,9.1,9.21,9.21,9.1,...,8.44,8.44,8.55,8.55,8.55,8.55,8.55,8.55,8.55,0
1,patient_1,13.65,13.77,13.65,13.54,13.43,13.21,12.88,12.43,12.1,...,10.55,10.44,10.44,10.44,10.44,10.32,10.32,10.32,10.32,0
2,patient_2,9.77,9.99,9.99,9.88,9.66,9.55,9.44,9.33,9.21,...,6.77,6.88,6.88,6.99,6.99,6.77,6.66,6.44,6.33,0
3,patient_3,6.61,6.83,7.05,7.1,6.99,6.83,6.66,6.49,6.33,...,4.61,4.88,5.05,5.16,5.16,5.16,5.22,5.27,5.33,0
4,patient_4,5.11,4.94,4.83,4.88,5.0,5.05,5.05,5.0,5.05,...,8.27,8.33,8.27,8.16,8.05,7.94,7.83,7.72,7.55,0


In [4]:
# Исключим столбец с именами пациентов (Unnamed: 0) и столбец 'hypo'
numeric_data = data.drop(columns=['Unnamed: 0', 'hypo'])

# Заполним пропуски линейной интерполяцией только для числовых данных
numeric_data_interpolated = numeric_data.interpolate(method='linear', axis=1)

# Добавим обратно столбец 'hypo' для дальнейшей работы
data_interpolated = pd.concat([numeric_data_interpolated, data['hypo']], axis=1)

# Выделим пациентов без эпизодов гипогликемии (hypo=0)
patients_no_hypo = data_interpolated[data_interpolated['hypo'] == 0]

# Посмотрим на данные после обработки
patients_no_hypo.head()

Unnamed: 0,0:00,0:05,0:10,0:15,0:20,0:25,0:30,0:35,0:40,0:45,...,5:15,5:20,5:25,5:30,5:35,5:40,5:45,5:50,5:55,hypo
0,9.66,9.44,9.33,9.21,9.21,9.1,9.21,9.21,9.1,9.1,...,8.44,8.44,8.55,8.55,8.55,8.55,8.55,8.55,8.55,0
1,13.65,13.77,13.65,13.54,13.43,13.21,12.88,12.43,12.1,11.88,...,10.55,10.44,10.44,10.44,10.44,10.32,10.32,10.32,10.32,0
2,9.77,9.99,9.99,9.88,9.66,9.55,9.44,9.33,9.21,9.1,...,6.77,6.88,6.88,6.99,6.99,6.77,6.66,6.44,6.33,0
3,6.61,6.83,7.05,7.1,6.99,6.83,6.66,6.49,6.33,6.22,...,4.61,4.88,5.05,5.16,5.16,5.16,5.22,5.27,5.33,0
4,5.11,4.94,4.83,4.88,5.0,5.05,5.05,5.0,5.05,5.27,...,8.27,8.33,8.27,8.16,8.05,7.94,7.83,7.72,7.55,0


In [5]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Используем только числовые данные для кластеризации
X = patients_no_hypo.drop(columns=['hypo'])

# Иерархическая кластеризация (AgglomerativeClustering)
# Подбираем оптимальное количество кластеров (от 2 до 10)
best_silhouette = -1
best_n_clusters = 2
for n_clusters in range(2, 11):
    clustering = AgglomerativeClustering(n_clusters=n_clusters)
    labels = clustering.fit_predict(X)
    silhouette_avg = silhouette_score(X, labels)
    if silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        best_n_clusters = n_clusters

# Выведем результаты
best_silhouette, best_n_clusters

(0.3613947407704182, 2)

In [6]:
from sklearn.cluster import KMeans

# Подбираем оптимальное количество кластеров для K-means (от 2 до 10)
best_silhouette_kmeans = -1
best_n_clusters_kmeans = 2
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, labels)
    if silhouette_avg > best_silhouette_kmeans:
        best_silhouette_kmeans = silhouette_avg
        best_n_clusters_kmeans = n_clusters

# Выведем результаты
best_silhouette_kmeans, best_n_clusters_kmeans

found 0 physical cores < 1
  File "C:\Users\snytk\miniconda3\envs\python311\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


(0.48705479967977183, 2)

In [11]:
from sklearn.cluster import DBSCAN

# Подбираем оптимальные параметры для DBSCAN (eps, min_samples)
best_silhouette_dbscan = -1
best_eps = 0.1
best_min_samples = 5

# Пробуем различные значения eps и min_samples
for eps in [0.1, 0.2, 0.3, 0.4, 0.5]:
    for min_samples in [3, 4, 5, 6]:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X)
        if len(set(labels)) > 1:  # Только если есть хотя бы 2 кластера
            silhouette_avg = silhouette_score(X, labels)
            if silhouette_avg > best_silhouette_dbscan:
                best_silhouette_dbscan = silhouette_avg
                best_eps = eps
                best_min_samples = min_samples

# Выведем результаты
best_silhouette_dbscan, best_eps, best_min_samples

(-1, 0.1, 5)

In [12]:
from sklearn.preprocessing import scale

# Стандартизация данных для пациентов без гипогликемии
X_no_hypo_scaled = scale(patients_no_hypo.drop(columns=['hypo']))

# Посмотрим на результат стандартизации (первые несколько строк)
pd.DataFrame(X_no_hypo_scaled).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
0,0.207794,0.142675,0.11846,0.088963,0.099031,0.07057,0.115312,0.1211,0.089015,0.094944,...,-0.049572,-0.011179,-0.011215,0.02656,0.027448,0.027539,0.028346,0.031068,0.032587,0.033119
1,1.514872,1.563505,1.546699,1.528408,1.509849,1.452441,1.353628,1.211048,1.107726,1.041475,...,0.709727,0.710506,0.672218,0.67046,0.669003,0.666154,0.624135,0.625475,0.624697,0.622294
2,0.243828,0.32315,0.336663,0.311695,0.249473,0.22187,0.192918,0.161719,0.126368,0.094944,...,-0.583133,-0.58237,-0.544292,-0.542389,-0.502089,-0.499572,-0.570809,-0.603639,-0.673263,-0.705846
3,-0.791351,-0.713761,-0.635333,-0.612475,-0.643153,-0.692653,-0.745098,-0.799602,-0.851595,-0.885634,...,-1.438199,-1.321156,-1.227724,-1.165848,-1.123277,-1.117913,-1.112742,-1.087224,-1.064658,-1.038713
4,-1.282734,-1.333939,-1.369289,-1.350481,-1.308444,-1.291128,-1.288338,-1.303958,-1.286244,-1.209089,...,-0.090615,-0.069324,-0.048803,-0.068833,-0.104936,-0.141407,-0.176983,-0.210725,-0.24507,-0.299748


In [15]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Используем только числовые данные для кластеризации
X = pd.DataFrame(X_no_hypo_scaled)

# Иерархическая кластеризация (AgglomerativeClustering)
# Подбираем оптимальное количество кластеров (от 2 до 10)
best_silhouette = -1
best_n_clusters = 2
for n_clusters in range(2, 11):
    clustering = AgglomerativeClustering(n_clusters=n_clusters)
    labels = clustering.fit_predict(X)
    silhouette_avg = silhouette_score(X, labels)
    if silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        best_n_clusters = n_clusters

# Выведем результаты
best_silhouette, best_n_clusters

(0.5051134895094076, 2)

In [16]:
from sklearn.cluster import KMeans

# Подбираем оптимальное количество кластеров для K-means (от 2 до 10)
best_silhouette_kmeans = -1
best_n_clusters_kmeans = 2
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, labels)
    if silhouette_avg > best_silhouette_kmeans:
        best_silhouette_kmeans = silhouette_avg
        best_n_clusters_kmeans = n_clusters

# Выведем результаты
best_silhouette_kmeans, best_n_clusters_kmeans

(0.48842347347139226, 2)

In [17]:
from sklearn.cluster import DBSCAN

# Подбираем оптимальные параметры для DBSCAN (eps, min_samples)
best_silhouette_dbscan = -1
best_eps = 0.1
best_min_samples = 5

# Пробуем различные значения eps и min_samples
for eps in [0.1, 0.2, 0.3, 0.4, 0.5]:
    for min_samples in [3, 4, 5, 6]:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X)
        if len(set(labels)) > 1:  # Только если есть хотя бы 2 кластера
            silhouette_avg = silhouette_score(X, labels)
            if silhouette_avg > best_silhouette_dbscan:
                best_silhouette_dbscan = silhouette_avg
                best_eps = eps
                best_min_samples = min_samples

# Выведем результаты
best_silhouette_dbscan, best_eps, best_min_samples

(-0.18464492886460662, 0.5, 4)

In [18]:
patients_with_hypo = data_interpolated[data_interpolated['hypo'] == 1]

# Стандартизация данных для пациентов с гипогликемией
X_hypo_scaled = scale(patients_with_hypo.drop(columns=['hypo']))

# Проверим первые несколько строк стандартизированных данных
pd.DataFrame(X_hypo_scaled).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
0,-0.07497,-0.154737,-0.223178,-0.290717,-0.313306,-0.394332,-0.430053,-0.464488,-0.548913,-0.582945,...,0.079046,-0.004414,-0.082084,-0.118024,-0.154794,-0.156247,-0.198203,-0.243481,-0.320234,-0.361497
1,0.07064,0.085282,0.019401,-0.146637,-0.264682,-0.3449,-0.375578,-0.359314,-0.34143,-0.323124,...,-0.927305,-0.919504,-0.909964,-0.864246,-0.854818,-0.806185,-0.798949,-0.719779,-0.635377,-0.555144
2,0.353294,0.226722,-0.03258,-0.242691,-0.361931,-0.448258,-0.529922,-0.565089,-0.599631,-0.685018,...,-0.256404,-0.251939,-0.246168,-0.241151,-0.236278,-0.236577,-0.277343,-0.365241,-0.320234,-0.361497
3,-0.974326,-1.011948,-1.041881,-1.067876,-1.051512,-1.095363,-1.083746,-1.122967,-1.111421,-1.097949,...,-0.339325,-0.334447,-0.32821,-0.368008,-0.402951,-0.441052,-0.439221,-0.483421,-0.518526,-0.477685
4,2.066353,2.035436,2.038004,1.988366,1.941093,1.897498,1.839715,1.771598,1.705733,1.630178,...,-0.757695,-0.709484,-0.537045,-0.491135,-0.525177,-0.521381,-0.478791,-0.483421,-0.479576,-0.555144


In [20]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Используем только числовые данные для кластеризации
X = pd.DataFrame(X_hypo_scaled)

# Иерархическая кластеризация (AgglomerativeClustering)
# Подбираем оптимальное количество кластеров (от 2 до 10)
best_silhouette = -1
best_n_clusters = 2
for n_clusters in range(2, 11):
    clustering = AgglomerativeClustering(n_clusters=n_clusters)
    labels = clustering.fit_predict(X)
    silhouette_avg = silhouette_score(X, labels)
    if silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        best_n_clusters = n_clusters

# Выведем результаты
best_silhouette, best_n_clusters

(0.3613483281343443, 2)

In [21]:
from sklearn.cluster import KMeans

# Подбираем оптимальное количество кластеров для K-means (от 2 до 10)
best_silhouette_kmeans = -1
best_n_clusters_kmeans = 2
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, labels)
    if silhouette_avg > best_silhouette_kmeans:
        best_silhouette_kmeans = silhouette_avg
        best_n_clusters_kmeans = n_clusters

# Выведем результаты
best_silhouette_kmeans, best_n_clusters_kmeans

(0.466879982402296, 2)

In [22]:
from sklearn.cluster import DBSCAN

# Подбираем оптимальные параметры для DBSCAN (eps, min_samples)
best_silhouette_dbscan = -1
best_eps = 0.1
best_min_samples = 5

# Пробуем различные значения eps и min_samples
for eps in [0.1, 0.2, 0.3, 0.4, 0.5]:
    for min_samples in [3, 4, 5, 6]:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X)
        if len(set(labels)) > 1:  # Только если есть хотя бы 2 кластера
            silhouette_avg = silhouette_score(X, labels)
            if silhouette_avg > best_silhouette_dbscan:
                best_silhouette_dbscan = silhouette_avg
                best_eps = eps
                best_min_samples = min_samples

# Выведем результаты
best_silhouette_dbscan, best_eps, best_min_samples

(-1, 0.1, 5)