In [None]:
from scipy import sparse
from sklearn.cluster import DBSCAN
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

In [None]:
!pip install umap

In [None]:
from scipy import sparse

path = '/kaggle/input/hw3-mlda/train.npz'

data = sparse.load_npz(path)

In [None]:
data.shape

In [None]:
# Project the sparse input matrix onto 10000 truncated-SVD components.
# NOTE(review): `data` is overwritten with the reduced array, so re-running
# this cell on its own feeds the already-reduced data back into TruncatedSVD;
# reload the .npz first when re-executing.
tsvd = TruncatedSVD(n_components=10000)
data = tsvd.fit_transform(data)

In [None]:
# scaler = StandardScaler()
# data = scaler.fit_transform(data)


In [None]:
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader, Dataset
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler

**Reduced dimensions with AutoEncoder**

Не хватает оперативной памяти, чтобы снизить размерность исходных данных целиком при помощи автоэнкодера, поэтому данные предварительно сжимаются с помощью TruncatedSVD. Возможно, не получается добиться хорошего качества именно из-за потерь при снижении размерности с помощью TruncatedSVD.

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder/decoder pair.

    The encoder maps ``in_shape -> 5000 -> 2048 -> 1024 -> 512 -> 256 ->
    enc_shape``; the decoder walks the same widths in reverse back to
    ``in_shape``. ReLU follows every hidden linear layer, and the 2048-, 512-
    and 256-wide layers are batch-normalised before the activation.

    Args:
        in_shape: Number of input (and reconstructed output) features.
        enc_shape: Width of the bottleneck produced by ``encode``.
    """

    def __init__(self, in_shape, enc_shape):
        super().__init__()

        # Layers are instantiated encoder-first, in forward order, so that
        # parameter initialisation consumes the RNG in a fixed sequence.
        encoder_layers = [
            nn.Linear(in_shape, 5000),
            nn.ReLU(),
            nn.Linear(5000, 2048),
            nn.BatchNorm1d(num_features=2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(num_features=512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.BatchNorm1d(num_features=256),
            nn.ReLU(),
            nn.Linear(256, enc_shape),  # bottleneck: no activation
        ]
        self.encode = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(enc_shape, 256),
            nn.BatchNorm1d(num_features=256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.BatchNorm1d(num_features=512),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024, 2048),
            nn.BatchNorm1d(num_features=2048),
            nn.ReLU(),
            nn.Linear(2048, 5000),
            nn.ReLU(),
            nn.Linear(5000, in_shape),  # linear reconstruction output
        ]
        self.decode = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.decode(self.encode(x))

In [None]:
in_shape = data.shape[1]

outp_shape = 128

print(f'input shape: {in_shape}\noutput shape: {outp_shape}')

input shape: 10000
output shape: 128


In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = ('cuda' if torch.cuda.is_available() else 'cpu')

# Autoencoder that compresses `in_shape` features down to `outp_shape`.
model = Autoencoder(in_shape, outp_shape).to(device)
# Mean-squared reconstruction error. NOTE: `train()` constructs its own
# MSELoss instance, so this notebook-level object is not the one used there.
loss_function = torch.nn.MSELoss()
# Adam with a small learning rate and light L2 regularisation (weight_decay).
optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-8)

In [None]:
from torch.utils.data import Dataset, DataLoader

class DataBuilder(Dataset):
    """In-memory dataset holding the whole feature matrix as one tensor.

    The array is cast to float32, converted to a tensor and moved to the
    notebook-level ``device`` once, at construction time; indexing then
    returns rows of that tensor directly.

    Args:
        data: NumPy array whose first axis indexes samples.
    """

    def __init__(self, data):
        as_float32 = data.astype(np.float32)
        self.x = torch.from_numpy(as_float32).to(device)
        self.len = self.x.shape[0]

    def __getitem__(self, index):
        """Return the sample (row) stored at ``index``."""
        return self.x[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len

In [None]:
data_set=DataBuilder(data)
trainloader=DataLoader(dataset=data_set, batch_size=256)

In [None]:
# Show the first batch as a sanity check. A dedicated name is used on purpose:
# looping with `data` as the loop variable would rebind the notebook-level
# `data` array to a tensor and break a re-run of the DataBuilder(data) cell.
first_batch = next(iter(trainloader))
print(first_batch)

tensor([[-0.5774, -0.1401, -0.5667,  ...,  1.3953,  0.5620,  0.4986],
        [-0.5866,  0.0187, -0.9178,  ..., -0.3092, -1.6825, -0.6424],
        [-0.7684, -0.0971, -0.4790,  ..., -0.5788, -0.0686,  0.6600],
        ...,
        [-0.3644, -0.4105, -0.0705,  ..., -0.8006, -0.4632, -0.5481],
        [ 1.2024, -2.5246,  1.3206,  ..., -0.1296, -0.1680,  0.4034],
        [-1.3497, -0.1204,  0.3824,  ...,  0.1806,  1.2808, -0.9061]],
       device='cuda:0')


In [None]:
from tqdm import tqdm

def train(epoch):
    """Run one training epoch of the autoencoder over ``trainloader``.

    Relies on the notebook-level ``model``, ``optimizer``, ``trainloader``,
    ``device`` and ``train_losses``. On every 20th epoch the mean per-sample
    reconstruction loss is printed and appended to ``train_losses``.

    Args:
        epoch: Epoch number; only used to decide when to report and in the
            printed message.
    """
    criterion = nn.MSELoss()
    model.train()
    total_loss = 0.0
    for batch in trainloader:
        batch = batch.to(device)
        optimizer.zero_grad()
        reconstruction = model(batch)
        loss = criterion(reconstruction, batch)
        loss.backward()
        optimizer.step()
        # MSELoss already averages over the batch. Weight it by the batch
        # size so that dividing by the dataset size below yields a true
        # per-sample mean (summing batch means and dividing by the number of
        # samples would under-report the loss by roughly the batch size).
        total_loss += loss.item() * batch.size(0)

    if epoch % 20 == 0:
        average_loss = total_loss / len(trainloader.dataset)
        print('====> Epoch: {} Average loss: {:.4f}'.format(
            epoch, average_loss))
        train_losses.append(average_loss)

In [None]:
epochs = 1000

val_losses = []
train_losses = []

for epoch in range(1, epochs + 1):
    train(epoch)

====> Epoch: 20 Average loss: 0.0038
====> Epoch: 40 Average loss: 0.0037
====> Epoch: 60 Average loss: 0.0037
====> Epoch: 80 Average loss: 0.0036
====> Epoch: 100 Average loss: 0.0035
====> Epoch: 120 Average loss: 0.0035
====> Epoch: 140 Average loss: 0.0034
====> Epoch: 160 Average loss: 0.0034
====> Epoch: 180 Average loss: 0.0033
====> Epoch: 200 Average loss: 0.0033
====> Epoch: 220 Average loss: 0.0032
====> Epoch: 240 Average loss: 0.0032
====> Epoch: 260 Average loss: 0.0031
====> Epoch: 280 Average loss: 0.0031
====> Epoch: 300 Average loss: 0.0030
====> Epoch: 320 Average loss: 0.0030
====> Epoch: 340 Average loss: 0.0030
====> Epoch: 360 Average loss: 0.0029
====> Epoch: 380 Average loss: 0.0029
====> Epoch: 400 Average loss: 0.0028
====> Epoch: 420 Average loss: 0.0028
====> Epoch: 440 Average loss: 0.0028
====> Epoch: 460 Average loss: 0.0028
====> Epoch: 480 Average loss: 0.0027
====> Epoch: 500 Average loss: 0.0027
====> Epoch: 520 Average loss: 0.0027
====> Epoch: 540

In [None]:
# Switch to inference mode first: in train mode the BatchNorm layers would
# normalise each batch with its own statistics (and keep updating their
# running estimates), so the embeddings would depend on batch composition.
model.eval()

outp_emb = []
with torch.no_grad():
    # `trainloader` is created without shuffling, so rows stay in dataset order.
    for batch in trainloader:
        outp_emb.append(model.encode(batch.to(device)))

# Concatenate once, after the loop, instead of rebuilding the tensor per batch.
result_emb = torch.cat(outp_emb, dim=0)

In [None]:
result_emb = result_emb.cpu().numpy()

In [None]:
result_emb.shape

(14590, 128)

**AgglomerativeClustering**

In [None]:
from sklearn.cluster import AgglomerativeClustering
from tqdm import tqdm
from time import time
from umap.umap_ import UMAP

# Build the UMAP embedding of the autoencoder codes here, so this cell does
# not depend on `reduced_embedding` being defined by a cell that appears later
# in the notebook (which raises NameError on Restart & Run All).
reduced_embedding = UMAP(n_components=64).fit_transform(result_emb)

# One label vector per linkage strategy, in the order listed below.
subm = []

for linkage in tqdm(['ward', 'average', 'complete', 'single']):
    clustering = AgglomerativeClustering(linkage=linkage, n_clusters=3)
    clustering = clustering.fit(reduced_embedding)
    subm.append(clustering.labels_)

100%|██████████| 4/4 [00:45<00:00, 11.40s/it]


In [None]:
np.unique(subm[0], return_counts=True)

(array([0, 1, 2]), array([5346, 3878, 5366]))

In [None]:
np.unique(subm[0], return_counts=True)

(array([0, 1, 2]), array([6693, 4983, 2914]))

**end clust**

In [None]:
!pip install umap

Collecting umap
  Downloading umap-0.1.1.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: umap
  Building wheel for umap (setup.py) ... [?25ldone
[?25h  Created wheel for umap: filename=umap-0.1.1-py3-none-any.whl size=3541 sha256=fef8b0c7cdae6306a4348b5b8a087da4913f903ca52ed316d1674c82363f001d
  Stored in directory: /root/.cache/pip/wheels/15/f1/28/53dcf7a309118ed35d810a5f9cb995217800f3f269ab5771cb
Successfully built umap
Installing collected packages: umap
Successfully installed umap-0.1.1


In [None]:
from umap.umap_ import UMAP

reducer = UMAP(n_components=64)
reduced_embedding = reducer.fit_transform(result_emb)

In [None]:
np.array(reduced_embedding).shape

(14590, 64)

In [None]:
!pip install hdbscan

Collecting hdbscan
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting cython<3,>=0.27 (from hdbscan)
  Obtaining dependency information for cython<3,>=0.27 from https://files.pythonhosted.org/packages/f8/26/ca0f1bb049b83c25cafa39f3fa5287c826a6ab36e665c906209e07f4deac/Cython-0.29.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata
  Using cached Cython-0.29.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB)
Using cached Cython-0.29.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
Building wheels for collected packages: hdbscan
  Building wh

In [None]:
import hdbscan

# Density-based clustering of the UMAP embedding using the Minkowski metric
# with p=1 (Manhattan distance).
# NOTE(review): the recorded output of the following cell is array([-1]),
# i.e. every point was labelled as noise with min_cluster_size=3500 and
# min_samples=1000 — these thresholds probably need to be lowered.
labels = hdbscan.HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True,
    gen_min_span_tree=False, leaf_size=40,
    metric='minkowski', min_cluster_size=3500, min_samples=1000, p=1).fit_predict(reduced_embedding)

In [None]:
np.unique(labels)

array([-1])

In [None]:
import pandas as pd



# Submission from the first clustering run in `subm` (ward linkage):
# ID is the row position, TARGET the assigned cluster label.
idx = list(range(len(subm[0])))
label = list(subm[0])

df = pd.DataFrame({'ID': idx, 'TARGET': label})
df.to_csv('subm_kirichenko_scaled_640.csv', index=False)

**Spectral Clustering**

In [None]:
from sklearn.cluster import SpectralClustering

In [None]:
from scipy import sparse

path = '/kaggle/input/hw3-mlda/train.npz'

data = sparse.load_npz(path)

In [None]:
tsvd = TruncatedSVD(n_components=15000)
data = tsvd.fit_transform(data)

In [None]:
data

In [None]:
clustering = SpectralClustering(n_clusters=3)

labels = clustering.fit_predict(data)

In [None]:
labels

array([1, 1, 1, ..., 1, 0, 1], dtype=int32)

In [None]:
labels

In [None]:
import pandas as pd

# Submission from the spectral-clustering labels: ID is the row position.
row_ids = list(range(len(labels)))
df = pd.DataFrame({'ID': row_ids, 'TARGET': labels})
df.to_csv('subm_kirichenko_scaled_lastTochnoReduced.csv', index=False)

In [None]:
data

<14590x77888 sparse matrix of type '<class 'numpy.float64'>'
	with 2026769 stored elements in Compressed Sparse Row format>

**Reduce dimensions with UMAP and SpectralClustering**

In [None]:
from scipy import sparse

path = '/kaggle/input/hw3-mlda/train.npz'

data = sparse.load_npz(path)

In [None]:
from umap.umap_ import UMAP

reducer = UMAP(n_components=10000)
reduced_embedding = reducer.fit_transform(data)

In [None]:
clustering = SpectralClustering(n_clusters=3)

labels = clustering.fit_predict(reduced_embedding)