In [1]:
import numpy as np
from scipy.sparse import load_npz
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [3]:
# Загрузка данных
train = load_npz("data/train.npz")
X = train.toarray()

# Стандартизация
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Преобразование в тензоры
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
dataset = TensorDataset(X_tensor)
loader = DataLoader(dataset, batch_size=256, shuffle=True)

In [4]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, latent_dim)
        )
        
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_recon = self.decoder(z)
        return z, x_recon

In [5]:
input_dim = X_scaled.shape[1]
latent_dim = 64  # Можно менять

autoencoder = Autoencoder(input_dim, latent_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
autoencoder.to(device)

# Обучение
for epoch in range(50):  # Эпохи
    for batch in loader:
        inputs = batch[0].to(device)

        # Forward pass
        _, reconstructions = autoencoder(inputs)
        loss = criterion(reconstructions, inputs)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/50], Loss: {loss.item():.4f}')

Epoch [1/50], Loss: 0.8844
Epoch [2/50], Loss: 0.7551
Epoch [3/50], Loss: 0.7868
Epoch [4/50], Loss: 0.7363
Epoch [5/50], Loss: 0.7402
Epoch [6/50], Loss: 0.7643
Epoch [7/50], Loss: 0.7442
Epoch [8/50], Loss: 0.7541
Epoch [9/50], Loss: 0.7453
Epoch [10/50], Loss: 0.7806
Epoch [11/50], Loss: 0.7542
Epoch [12/50], Loss: 0.7522
Epoch [13/50], Loss: 0.7472
Epoch [14/50], Loss: 0.7155
Epoch [15/50], Loss: 0.7573
Epoch [16/50], Loss: 0.7272
Epoch [17/50], Loss: 0.7421
Epoch [18/50], Loss: 0.7357
Epoch [19/50], Loss: 0.7403
Epoch [20/50], Loss: 0.7552
Epoch [21/50], Loss: 0.7443
Epoch [22/50], Loss: 0.7092
Epoch [23/50], Loss: 0.7025
Epoch [24/50], Loss: 0.7054
Epoch [25/50], Loss: 0.7629
Epoch [26/50], Loss: 0.7118
Epoch [27/50], Loss: 0.6988
Epoch [28/50], Loss: 0.7056
Epoch [29/50], Loss: 0.7318
Epoch [30/50], Loss: 0.6825
Epoch [31/50], Loss: 0.7045
Epoch [32/50], Loss: 0.7429
Epoch [33/50], Loss: 0.7005
Epoch [34/50], Loss: 0.7030
Epoch [35/50], Loss: 0.7017
Epoch [36/50], Loss: 0.7317
E

In [6]:
autoencoder.eval()
with torch.no_grad():
    latent_representations = autoencoder.encoder(X_tensor).numpy()

In [7]:
from sklearn.cluster import KMeans

n_clusters = 5  # На основе sample_submission.csv
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(latent_representations)

In [8]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

print(f"Silhouette Score: {silhouette_score(latent_representations, labels):.4f}")
print(f"Calinski-Harabasz Score: {calinski_harabasz_score(latent_representations, labels):.2f}")
print(f"Davies-Bouldin Score: {davies_bouldin_score(latent_representations, labels):.4f}")

Silhouette Score: 0.0775
Calinski-Harabasz Score: 542.41
Davies-Bouldin Score: 4.0081


In [10]:
import pandas as pd

submission = pd.DataFrame({
    "ID": range(len(labels)),
    "TARGET": labels
})

submission.to_csv("submission_submit_autoencoders_1.csv", index=False)