In [1]:
import numpy as np
from scipy.sparse import load_npz
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Загрузка данных
train = load_npz("data/train.npz")
X = train.toarray()

# Стандартизация
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Тензоры
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
dataset = TensorDataset(X_tensor)
loader = DataLoader(dataset, batch_size=256, shuffle=True)

In [3]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, latent_dim)
        )
        
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_recon = self.decoder(z)
        return z, x_recon

In [4]:
input_dim = X_scaled.shape[1]
latent_dim = 64

autoencoder = Autoencoder(input_dim, latent_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
autoencoder.to(device)

# Предобучение
for epoch in range(50):
    for batch in loader:
        inputs = batch[0].to(device)

        _, reconstructions = autoencoder(inputs)
        loss = criterion(reconstructions, inputs)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Pretraining Epoch [{epoch+1}/50], Loss: {loss.item():.4f}")

Pretraining Epoch [1/50], Loss: 0.7369
Pretraining Epoch [2/50], Loss: 0.7522
Pretraining Epoch [3/50], Loss: 0.8757
Pretraining Epoch [4/50], Loss: 0.7821
Pretraining Epoch [5/50], Loss: 0.7413
Pretraining Epoch [6/50], Loss: 0.7602
Pretraining Epoch [7/50], Loss: 0.7709
Pretraining Epoch [8/50], Loss: 0.7339
Pretraining Epoch [9/50], Loss: 0.7373
Pretraining Epoch [10/50], Loss: 0.7200
Pretraining Epoch [11/50], Loss: 0.7338
Pretraining Epoch [12/50], Loss: 0.7159
Pretraining Epoch [13/50], Loss: 0.7089
Pretraining Epoch [14/50], Loss: 0.7864
Pretraining Epoch [15/50], Loss: 0.7694
Pretraining Epoch [16/50], Loss: 0.7515
Pretraining Epoch [17/50], Loss: 0.7566
Pretraining Epoch [18/50], Loss: 0.7647
Pretraining Epoch [19/50], Loss: 0.7474
Pretraining Epoch [20/50], Loss: 0.6993
Pretraining Epoch [21/50], Loss: 0.6934
Pretraining Epoch [22/50], Loss: 0.7363
Pretraining Epoch [23/50], Loss: 0.7144
Pretraining Epoch [24/50], Loss: 0.7430
Pretraining Epoch [25/50], Loss: 0.7318
Pretraini

In [5]:
autoencoder.eval()
with torch.no_grad():
    latent = autoencoder.encoder(X_tensor).numpy()

# K-Means
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_centers = kmeans.fit(latent).cluster_centers_
labels = kmeans.labels_

# Сохраняем центры как параметр модели
cluster_centers_torch = torch.tensor(cluster_centers, dtype=torch.float32, device=device)

In [6]:
def target_distribution(q):
    weight = q ** 2 / q.sum(0)
    return (weight.t() / weight.sum(1)).t()

In [8]:
import torch.nn.functional as F  # <- добавляем F

# DEC Training
for epoch in range(10):
    for batch in loader:
        inputs = batch[0].to(device)

        z, _ = autoencoder(inputs)

        # Вычисляем мягкое принадлежность к кластерам
        q = 1.0 / (1.0 + torch.sum((z.unsqueeze(1) - cluster_centers_torch) ** 2, dim=2) / 1)
        q = q.pow(2)
        q = q / q.sum(1, keepdim=True)

        p = target_distribution(q)

        # KL Divergence Loss
        loss = F.kl_div(q.log(), p, reduction='batchmean')  # <- ИСПРАВЛЕНО!

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"DEC Epoch [{epoch+1}/10], Loss: {loss.item():.4f}")

DEC Epoch [1/10], Loss: 0.0008
DEC Epoch [2/10], Loss: 0.0003
DEC Epoch [3/10], Loss: 0.0002
DEC Epoch [4/10], Loss: 0.0003
DEC Epoch [5/10], Loss: 0.0001
DEC Epoch [6/10], Loss: 0.0001
DEC Epoch [7/10], Loss: 0.0001
DEC Epoch [8/10], Loss: 0.0001
DEC Epoch [9/10], Loss: 0.0001
DEC Epoch [10/10], Loss: 0.0001


In [9]:
autoencoder.eval()
with torch.no_grad():
    z_final = autoencoder.encoder(X_tensor).numpy()

labels_final = kmeans.fit_predict(z_final)

In [10]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

print(f"Silhouette Score: {silhouette_score(z_final, labels_final):.4f}")
print(f"Calinski-Harabasz Score: {calinski_harabasz_score(z_final, labels_final):.2f}")
print(f"Davies-Bouldin Score: {davies_bouldin_score(z_final, labels_final):.4f}")

Silhouette Score: 0.1355
Calinski-Harabasz Score: 3992.52
Davies-Bouldin Score: 2.0311


In [11]:

import pandas as pd

submission = pd.DataFrame({
    "ID": range(len(labels_final)),
    "TARGET": labels_final
})

submission.to_csv("submission_submit_DEC.csv", index=False)