In [79]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import itertools

In [80]:
def _standardize_and_reduce(group, n_components, random_state):
    """Standardise and PCA-reduce one atlas group.

    The (samples, time steps, features) array is flattened so that every time
    step of every sample is one row, scaled with a freshly fitted
    ``StandardScaler``, projected with a freshly fitted ``PCA`` and reshaped
    back to (samples, time steps, n_components).
    """
    n_samples, n_timesteps, n_features = group.shape
    rows = group.reshape(-1, n_features)
    rows = StandardScaler().fit_transform(rows)
    rows = PCA(n_components=n_components, random_state=random_state).fit_transform(rows)
    return rows.reshape(n_samples, n_timesteps, n_components)


def preprocess_data(data, n_components=120, schaefer_features=200, random_state=42):
    """Bring samples from two atlases to one common feature size.

    Samples are split by the presence of missing values: a sample that has at
    least one NaN anywhere is handled as Schaefer200 and only its first
    ``schaefer_features`` features are kept; every other sample is handled as
    Brainnetome. Each group is standardised and PCA-reduced on its own
    (the two PCA bases are fitted independently), and the results are written
    back to the samples' original positions.

    Parameters
    ----------
    data : np.ndarray
        Array indexed as (sample, time step, feature).
    n_components : int, default 120
        Number of PCA components kept for both groups.
    schaefer_features : int, default 200
        Number of leading features kept for samples that contain NaNs.
    random_state : int or None, default 42
        Seed forwarded to ``PCA``. It only matters when scikit-learn selects a
        randomized SVD solver; fixing it keeps the output reproducible then.

    Returns
    -------
    np.ndarray
        Float array of shape (n_samples, n_timesteps, n_components), rows in
        the same order as ``data``.
    """
    # A sample belongs to the Schaefer200 group as soon as any value is missing.
    is_schaefer = np.isnan(data).any(axis=(1, 2))

    groups = (
        # Samples without gaps (Brainnetome) keep all of their features.
        (~is_schaefer, data[~is_schaefer]),
        # Schaefer200 samples: only the leading features are meaningful.
        (is_schaefer, data[is_schaefer][:, :, :schaefer_features]),
    )

    combined_data = np.zeros((data.shape[0], data.shape[1], n_components))
    for mask, group in groups:
        # Fitting a scaler / PCA on zero rows raises, so an absent atlas is skipped.
        if group.shape[0] == 0:
            continue
        # Boolean-mask assignment puts every sample back at its original index.
        combined_data[mask] = _standardize_and_reduce(group, n_components, random_state)

    return combined_data



In [88]:
def cluster_and_label(data, scaler=None, n=None, init=None, algorithm=None, n_clusters=20,
                      cluster_size=16):
    """Cluster samples with MiniBatchKMeans and hand out size-limited labels.

    Only feature 0 of every time step (``data[:, :, 0]``) is clustered. After
    fitting, clusters are visited in index order and each one claims the
    ``cluster_size`` still-unlabelled samples closest to its centroid. Once no
    more than ``cluster_size`` samples remain, they all go to the cluster being
    visited and the assignment stops.

    The silhouette score of the final labelling (computed on the unscaled
    input) and the run's settings are appended to the module-level ``results``
    dict, which must already exist.

    Parameters
    ----------
    data : np.ndarray
        Array indexed as (sample, time step, feature).
    scaler : transformer or None
        First pipeline step; ``None`` is passed to ``make_pipeline`` unchanged.
    n : any
        Recorded under ``results['n_clusters']``; not used otherwise.
    init : str or None
        ``init`` for ``MiniBatchKMeans``; ``None`` selects ``'random'``.
    algorithm : any
        Recorded under ``results['algorithms']`` only; it is not passed to the
        model.
    n_clusters : int, default 20
        Number of clusters that receive labels.
    cluster_size : int, default 16
        Number of samples each cluster claims.

    Returns
    -------
    tuple
        ``(labeling, results)``: the integer label array (one entry per sample)
        and the module-level ``results`` dict.
    """
    common_brain_region_data = data[:, :, 0]

    # The model has always been fitted with 21 centroids; keep that as the floor
    # so existing outputs do not change, but never fit fewer centroids than the
    # assignment loop reads columns for.
    n_centroids = max(n_clusters, 21)
    model = make_pipeline(
        scaler,
        MiniBatchKMeans(
            n_clusters=n_centroids,
            random_state=42,
            init='random' if init is None else init,
        ),
    )

    model.fit(common_brain_region_data)
    # Column j holds every sample's distance to centroid j.
    cluster_distances = model.transform(common_brain_region_data)
    labeling = np.zeros(len(data), dtype=int)
    leftover_indexes = np.arange(len(data))

    for i in range(n_clusters):
        if len(leftover_indexes) <= cluster_size:
            # Final (possibly short) group. Stop here so later iterations cannot
            # overwrite this label.
            labeling[leftover_indexes] = i
            leftover_indexes = leftover_indexes[:0]
            break
        # Rows of cluster_distances stay aligned with leftover_indexes, so the
        # positions returned here are positions into leftover_indexes.
        closest = np.argpartition(cluster_distances[:, i], cluster_size)[:cluster_size]
        labeling[leftover_indexes[closest]] = i
        cluster_distances = np.delete(cluster_distances, closest, axis=0)
        leftover_indexes = np.delete(leftover_indexes, closest)

    if len(leftover_indexes) > 0 and n_clusters > 0:
        # Every cluster took its share and samples are still unlabelled: give
        # each one the nearest of the labelled clusters instead of leaving the
        # placeholder label 0.
        labeling[leftover_indexes] = np.argmin(cluster_distances[:, :n_clusters], axis=1)

    # Evaluation (silhouette coefficient)
    silhouette_avg = silhouette_score(common_brain_region_data, labeling)
    results['scaler'].append(scaler)
    results['n_clusters'].append(n)
    results['metric'].append(silhouette_avg)
    results['inits'].append(init)
    results['algorithms'].append(algorithm)
    return labeling, results

In [82]:
# Candidate feature scalers for the hyper-parameter sweep, one fresh instance of each.
scalers = [scaler_cls() for scaler_cls in (StandardScaler, MinMaxScaler)]
# Hyper-parameter grid for the sweep: odd cluster counts from 3 to 99,
# centroid initialisation schemes and k-means algorithm variants.
param_model = dict(
    n_clusters=list(range(3, 100, 2)),
    inits=['k-means++', 'random'],
    algorithms=['lloyd', 'elkan'],
)

In [83]:
# Run log filled by cluster_and_label: one list per recorded field, and
# position k across all lists describes the k-th run.
results = {
    field: []
    for field in ('scaler', 'n_clusters', 'metric', 'inits', 'algorithms')
}

In [89]:
# Input / output locations, relative to the notebook's working directory.
DATA_PATH = '../data/ts_cut/ihb.npy'
SUBMISSION_PATH = '../submission_batchkmeans.csv'
# Set to True to run the hyper-parameter sweep before the final fit. It used to
# be disabled by wrapping it in a string literal; an explicit flag keeps the code
# visible to linters and editors while leaving the default behaviour unchanged.
RUN_GRID_SEARCH = False

data = np.load(DATA_PATH)
combined_data = preprocess_data(data)
# Print shape / dtype / memory layout of the preprocessed array as a sanity check.
np.info(combined_data)

if RUN_GRID_SEARCH:
    # Try every scaler / cluster-count / init combination; each call appends its
    # silhouette score to `results`.
    for scaler, n, init in itertools.product(scalers, param_model['n_clusters'], param_model['inits']):
        labeling, results = cluster_and_label(combined_data, scaler, n, init, n_clusters=n)
    # Report the best-scoring configuration.
    index = np.argmax(results['metric'])
    print(f"scaler = {results['scaler'][index]},\nn_clusters = {results['n_clusters'][index]},\nmetric = {results['metric'][index]},\ninit = {results['inits'][index]}\n\n\n")

# Final labelling with the function's default settings, written as a one-column CSV.
labeling, _ = cluster_and_label(combined_data)
pd.DataFrame({'prediction': labeling}).to_csv(SUBMISSION_PATH, index=False)

class:  ndarray
shape:  (320, 10, 120)
strides:  (9600, 960, 8)
itemsize:  8
aligned:  True
contiguous:  True
fortran:  False
data pointer: 0x2382c4e4040
byteorder:  little
byteswap:  False
type: float64
