In [1]:
from sklearn.datasets import fetch_rcv1
from sklearn.decomposition import PCA, SparsePCA
from joblib import dump, load
import torch
import numpy
from sklearn.preprocessing import normalize

import sys
sys.path.append('..')

from clustering_tool.model import DeepClusteringModel
from clustering_tool.modules.clusterers.XieClusterer import XieClusterer
from clustering_tool.modules.embedders.PcaEmbedder import PcaEmbedder
from clustering_tool.modules.encoders.feedforward import MyFeedForward
from clustering_tool.modules.metrics.NormalizedMutualInformation import NormalizedMutualInformation

In [2]:
dataset = fetch_rcv1(data_home='../data', subset="train", shuffle=True)

In [3]:
dataset.data.shape

(23149, 47236)

In [4]:
#pca = PCA(2000)
#pca.fit(dataset.data.toarray())
#dump(pca, '../reuters_pca.joblib')

In [5]:
pca = load('../reuters_pca.joblib')
pca.transform(dataset.data[0:1000].toarray()).max()

0.6768940886439831

In [6]:
#dump(pca, '../data/reuters_pca.joblib')

In [7]:
linear = torch.nn.Linear(2000, 200)
dropout = torch.nn.Dropout(p=0.0)
activation = torch.nn.ReLU()

In [8]:
embedder = PcaEmbedder("../data/reuters_pca.joblib")
embedder.requires_grad_(False)
encoder = MyFeedForward(2000, 1, 200, torch.nn.Sigmoid())
clusterer = XieClusterer(103, 200)

model = DeepClusteringModel(None, encoder, clusterer, 103, embedder = embedder)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

In [9]:
x = torch.tensor(dataset.data[0:2].toarray(), dtype=torch.float)
y = torch.tensor(dataset.target[0:2].toarray(), dtype=torch.float)
#x = torch.Tensor(pca.transform(dataset.data[0:10].toarray()))

In [10]:
model.forward(x, y) # Все дело в эмбедере

{'h': tensor([[0.5029, 0.5021, 0.4950, 0.4973, 0.5033, 0.5004, 0.4998, 0.5036, 0.4995,
          0.4967, 0.4971, 0.5041, 0.5047, 0.5002, 0.5034, 0.5018, 0.5072, 0.5058,
          0.5054, 0.4960, 0.4987, 0.4971, 0.4963, 0.5039, 0.5031, 0.4912, 0.4978,
          0.4987, 0.5012, 0.5042, 0.4992, 0.4920, 0.4959, 0.4930, 0.5040, 0.5021,
          0.5068, 0.5073, 0.4968, 0.5014, 0.4986, 0.4971, 0.4952, 0.5000, 0.5009,
          0.4984, 0.5016, 0.4972, 0.5026, 0.5018, 0.5048, 0.5064, 0.5072, 0.4987,
          0.4990, 0.4961, 0.4913, 0.5020, 0.5056, 0.5012, 0.4993, 0.4972, 0.5014,
          0.4923, 0.4986, 0.4952, 0.5064, 0.5015, 0.4945, 0.4945, 0.4957, 0.5011,
          0.4932, 0.4915, 0.4981, 0.5063, 0.4994, 0.4957, 0.4992, 0.5013, 0.4965,
          0.5020, 0.5024, 0.4969, 0.5038, 0.4982, 0.4946, 0.4954, 0.5066, 0.4950,
          0.4951, 0.4925, 0.5018, 0.4969, 0.4995, 0.5023, 0.5037, 0.5006, 0.5008,
          0.4945, 0.5067, 0.4989, 0.4993, 0.5000, 0.4988, 0.4995, 0.5059, 0.4996,
          0

In [11]:
optimizer.zero_grad()
model.forward(x, y)['loss'].backward()
optimizer.step()

In [12]:
model.forward(x, y)

{'h': tensor([[0.5029, 0.5021, 0.4950, 0.4973, 0.5033, 0.5004, 0.4998, 0.5036, 0.4995,
          0.4967, 0.4971, 0.5041, 0.5047, 0.5002, 0.5034, 0.5018, 0.5072, 0.5058,
          0.5054, 0.4960, 0.4987, 0.4971, 0.4963, 0.5039, 0.5031, 0.4912, 0.4978,
          0.4987, 0.5012, 0.5042, 0.4992, 0.4920, 0.4959, 0.4930, 0.5040, 0.5021,
          0.5068, 0.5073, 0.4968, 0.5014, 0.4986, 0.4971, 0.4952, 0.5000, 0.5009,
          0.4984, 0.5016, 0.4972, 0.5026, 0.5018, 0.5048, 0.5064, 0.5072, 0.4987,
          0.4990, 0.4961, 0.4913, 0.5020, 0.5056, 0.5012, 0.4993, 0.4972, 0.5014,
          0.4923, 0.4986, 0.4952, 0.5064, 0.5015, 0.4945, 0.4945, 0.4957, 0.5011,
          0.4932, 0.4915, 0.4981, 0.5063, 0.4994, 0.4957, 0.4992, 0.5013, 0.4965,
          0.5020, 0.5024, 0.4969, 0.5038, 0.4982, 0.4946, 0.4954, 0.5066, 0.4950,
          0.4951, 0.4925, 0.5018, 0.4969, 0.4995, 0.5023, 0.5037, 0.5006, 0.5008,
          0.4945, 0.5067, 0.4989, 0.4993, 0.5000, 0.4988, 0.4995, 0.5059, 0.4996,
          0

In [13]:
list(embedder.modules())

[PcaEmbedder()]

In [14]:
a = torch.tensor([[1, 2, 3], [2, 2, 3]])
b = torch.tensor([1, 2, 3]).unsqueeze(-1)
torch.matmul(a, b)

tensor([[14],
        [15]])

In [15]:
a.unsqueeze(-1).shape

torch.Size([2, 3, 1])

In [16]:
a.unsqueeze(-1).shape

torch.Size([2, 3, 1])

In [17]:
torch.tensor(1e-20) * torch.log(torch.tensor(1e-20))

tensor(-4.6052e-19)

In [366]:
n_cl = 10
nmi = NormalizedMutualInformation(n_cl, n_cl)

In [379]:
x = numpy.random.random((5, n_cl))
y = numpy.random.random((5, n_cl))

y = numpy.zeros_like(x)
y_classes = numpy.argmax(x, axis=1)
for i, cl in enumerate(y_classes):
    y[i][cl] = 1

x = normalize(x, axis=1, norm='l1')
y = normalize(y, axis=1, norm='l1')

x = torch.tensor(x)
y = torch.tensor(y)

nmi(x, x)
nmi.get_metric(True)

2.0048186779022217

In [380]:
joint_distr = torch.matmul(torch.t(x), y)
joint_distr /= torch.sum(joint_distr)
joint_distr

tensor([[0.0000, 0.0000, 0.0700, 0.0000, 0.0153, 0.0000, 0.0000, 0.0176, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0528, 0.0000, 0.0116, 0.0000, 0.0000, 0.0229, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.1210, 0.0000, 0.0099, 0.0000, 0.0000, 0.0140, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0626, 0.0000, 0.0140, 0.0000, 0.0000, 0.0270, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0275, 0.0000, 0.0315, 0.0000, 0.0000, 0.0286, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0595, 0.0000, 0.0198, 0.0000, 0.0000, 0.0144, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0318, 0.0000, 0.0239, 0.0000, 0.0000, 0.0203, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0718, 0.0000, 0.0249, 0.0000, 0.0000, 0.0301, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0353, 0.0000, 0.0213, 0.0000, 0.0000, 0.0132, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0679, 0.0000, 0.0279, 0.0000, 0.0000, 0.0120, 0.0000,
         0.0000]], dtype=tor

In [381]:
x = torch.sum(x, axis=0)
x /= torch.sum(x)
y = torch.sum(y, axis=0)
y /= torch.sum(y)
y[y < 1e-12]=1e-12

In [382]:
xy_distr = torch.matmul(x.unsqueeze(-1), y.unsqueeze(0))
xy_distr

tensor([[1.0287e-13, 1.0287e-13, 6.1720e-02, 1.0287e-13, 2.0573e-02, 1.0287e-13,
         1.0287e-13, 2.0573e-02, 1.0287e-13, 1.0287e-13],
        [8.7250e-14, 8.7250e-14, 5.2350e-02, 8.7250e-14, 1.7450e-02, 8.7250e-14,
         8.7250e-14, 1.7450e-02, 8.7250e-14, 8.7250e-14],
        [1.4483e-13, 1.4483e-13, 8.6895e-02, 1.4483e-13, 2.8965e-02, 1.4483e-13,
         1.4483e-13, 2.8965e-02, 1.4483e-13, 1.4483e-13],
        [1.0364e-13, 1.0364e-13, 6.2182e-02, 1.0364e-13, 2.0727e-02, 1.0364e-13,
         1.0364e-13, 2.0727e-02, 1.0364e-13, 1.0364e-13],
        [8.7540e-14, 8.7540e-14, 5.2524e-02, 8.7540e-14, 1.7508e-02, 8.7540e-14,
         8.7540e-14, 1.7508e-02, 8.7540e-14, 8.7540e-14],
        [9.3612e-14, 9.3612e-14, 5.6167e-02, 9.3612e-14, 1.8722e-02, 9.3612e-14,
         9.3612e-14, 1.8722e-02, 9.3612e-14, 9.3612e-14],
        [7.5920e-14, 7.5920e-14, 4.5552e-02, 7.5920e-14, 1.5184e-02, 7.5920e-14,
         7.5920e-14, 1.5184e-02, 7.5920e-14, 7.5920e-14],
        [1.2677e-13, 1.2677

In [383]:
torch.nn.functional.kl_div(joint_distr, xy_distr, reduction='sum') / (torch.sum(torch.dot(x, torch.log(x))) + torch.sum(torch.dot(y, torch.log(y))))

tensor(1.0145, dtype=torch.float64)

In [384]:
torch.nn.functional.kl_div(joint_distr, xy_distr, reduction='sum')

tensor(-3.2771, dtype=torch.float64)

In [326]:
y

tensor([1.0000e-12, 1.0000e-12, 1.0000e-12, 2.0000e-01, 1.0000e-12, 1.0000e-12,
        2.0000e-01, 4.0000e-01, 2.0000e-01, 1.0000e-12], dtype=torch.float64)