In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import RobertaTokenizer, RobertaModel
import torch
import time
# Show whether CUDA is usable and, if so, the index of the active device.
# The device is only queried when CUDA is available, because
# torch.cuda.current_device() raises on a machine without CUDA.
torch.cuda.is_available(), (torch.cuda.current_device() if torch.cuda.is_available() else None)

In [None]:
import torch
from torch import nn
from sklearn.cluster import KMeans

class ClusteringLayer(nn.Module):
    """
    Soft cluster assignment layer with trainable centroids.

    Each input embedding is compared with every centroid through a
    Student's t kernel and the resulting similarities are normalised per
    sample, so every output row sums to one.
    """

    def __init__(self, initial_centroids, alpha=1.0):
        """
        :param initial_centroids: tensor of shape (n_clusters, embedding_size)
            used as the starting value of the trainable centroids
        :param alpha: degrees of freedom of the Student's t kernel
        """
        super(ClusteringLayer, self).__init__()
        # Copy the centroids: nn.Parameter would otherwise share storage with the
        # caller's tensor and every optimizer step would silently overwrite it.
        self.weights = nn.Parameter(initial_centroids.detach().clone())
        self.alpha = alpha

    def forward(self, inputs):
        """
        Takes a batch of input embeddings of shape (batch_size, embedding_size)
        and computes the soft cluster assignment based on the centroids.

        :return: tensor of shape (batch_size, n_clusters) whose rows sum to one
        """
        # (batch, 1, dim) - (clusters, dim) broadcasts to (batch, clusters, dim).
        squared_distances = torch.sum(
            torch.pow(torch.unsqueeze(inputs, 1) - self.weights, exponent=2), dim=2)
        q = 1.0 / (1.0 + squared_distances / self.alpha)
        q = torch.pow(q, exponent=(self.alpha + 1.0) / 2.0)
        # Normalise per sample so that each row is a probability distribution.
        return q / torch.sum(q, dim=1, keepdim=True)

    def __repr__(self):
        return str(self.weights.shape)

    
class DistilBertForClustering(nn.Module):
    """
    Encoder followed by a soft clustering layer.

    The wrapped encoder is called with the tokenizer output, the hidden state
    of the first token is taken as the document embedding and passed to a
    `ClusteringLayer` initialised with the given centroids.
    """

    def __init__(self, distilbert_model, initial_centroids):
        """
        :param distilbert_model: encoder whose output exposes `last_hidden_state`
        :param initial_centroids: tensor of shape (n_clusters, embedding_size)
        """
        super(DistilBertForClustering, self).__init__()
        self.distilbert_model = distilbert_model
        # Keep an independent snapshot; the clustering layer trains its own copy
        # and must not be able to overwrite the recorded starting point.
        self.initial_centroids = initial_centroids.detach().clone()
        self.clustering_layer = ClusteringLayer(initial_centroids=initial_centroids)

    def target_probability_distribution(self, q):
        """
        Sharpen the soft assignments `q` into the auxiliary target distribution.

        Squares `q`, divides by the per-cluster column sums and renormalises
        each row. The result is detached so that it acts as a fixed target and
        the loss gradient only flows through `q`.

        :param q: soft assignments of shape (batch_size, n_clusters)
        :return: target distribution of the same shape, without gradient
        """
        p = q ** 2 / q.sum(0)
        p = p / p.sum(dim=1, keepdim=True)
        return p.detach()

    def forward(self, inputs):
        """
        :param inputs: mapping of keyword arguments forwarded to the encoder
        :return: tuple (q, p) of soft assignments and target distribution
        """
        distilbert_outputs = self.distilbert_model(**inputs)
        # Hidden state at the first token position serves as the text embedding.
        cls_embeddings = distilbert_outputs.last_hidden_state[:, 0, :]

        q = self.clustering_layer(cls_embeddings)
        p = self.target_probability_distribution(q)
        return q, p

In [None]:
import torch
from torch import nn
from sklearn.cluster import KMeans

class ClusteringLayer2(nn.Module):
    """
    Soft cluster assignment layer with trainable centroids.

    Similarities between inputs and centroids are computed with a Student's t
    kernel and normalised per sample, so every output row sums to one.
    """

    def __init__(self, initial_centroids, alpha=1.0):
        """
        :param initial_centroids: tensor of shape (n_clusters, embedding_size)
            used as the starting value of the trainable centroids
        :param alpha: degrees of freedom of the Student's t kernel
        """
        super(ClusteringLayer2, self).__init__()
        # Copy the centroids: nn.Parameter would otherwise share storage with the
        # caller's tensor and every optimizer step would silently overwrite it.
        self.weights = nn.Parameter(initial_centroids.detach().clone())
        self.alpha = alpha

    def forward(self, inputs):
        """
        Takes a batch of input embeddings of shape (batch_size, embedding_size)
        and computes the soft cluster assignment based on the centroids.

        :return: tensor of shape (batch_size, n_clusters) whose rows sum to one
        """
        norm_squared = torch.sum((inputs.unsqueeze(1) - self.weights) ** 2, 2)
        numerator = 1.0 / (1.0 + (norm_squared / self.alpha))
        power = float(self.alpha + 1) / 2
        numerator = numerator ** power
        # Without this normalisation the rows are raw kernel values rather than
        # probabilities, and the KL-divergence loss is fed a non-distribution.
        return numerator / torch.sum(numerator, dim=1, keepdim=True)

    def __repr__(self):
        return str(self.weights.shape)

    
class DistilBertForClustering2(nn.Module):
    """
    Encoder followed by a `ClusteringLayer2` soft clustering head.

    The hidden state of the first token of the encoder output is used as the
    document embedding that gets assigned to the clusters.
    """

    def __init__(self, distilbert_model, initial_centroids):
        """
        :param distilbert_model: encoder whose output exposes `last_hidden_state`
        :param initial_centroids: tensor of shape (n_clusters, embedding_size)
        """
        super(DistilBertForClustering2, self).__init__()
        self.distilbert_model = distilbert_model
        # Keep an independent snapshot; the clustering layer trains its own copy
        # and must not be able to overwrite the recorded starting point.
        self.initial_centroids = initial_centroids.detach().clone()
        self.clustering_layer = ClusteringLayer2(initial_centroids=initial_centroids)

    def target_probability_distribution(self, q):
        """
        Sharpen the soft assignments `q` into the auxiliary target distribution.

        Squares `q`, divides by the per-cluster column sums and renormalises
        each row. The result is detached so that it acts as a fixed target and
        the loss gradient only flows through `q`.

        :param q: soft assignments of shape (batch_size, n_clusters)
        :return: target distribution of the same shape, without gradient
        """
        weight = (q ** 2) / torch.sum(q, 0)
        return (weight.t() / torch.sum(weight, 1)).t().detach()

    def forward(self, inputs):
        """
        :param inputs: mapping of keyword arguments forwarded to the encoder
        :return: tuple (q, p) of soft assignments and target distribution
        """
        distilbert_outputs = self.distilbert_model(**inputs)
        # Hidden state at the first token position serves as the text embedding.
        cls_embeddings = distilbert_outputs.last_hidden_state[:, 0, :]

        q = self.clustering_layer(cls_embeddings)
        p = self.target_probability_distribution(q)
        return q, p

In [None]:
import numpy as np
import torch
from typing import Optional
from scipy.optimize import linear_sum_assignment


def cluster_accuracy(y_true, y_predicted, cluster_number: Optional[int] = None):
    """
    Calculate clustering accuracy after using the linear_sum_assignment function in SciPy to
    determine reassignments.

    Inputs may be NumPy arrays, lists or anything `np.asarray` understands; label
    values are cast to integers, so float-valued labels such as 1.0 are accepted.

    :param y_true: list of true cluster numbers, an integer array 0-indexed
    :param y_predicted: list of predicted cluster numbers, an integer array 0-indexed
    :param cluster_number: number of clusters, if None then calculated from input
    :return: reassignment dictionary, clustering accuracy
    :raises ValueError: if the inputs are empty or differ in length
    """
    y_true = np.asarray(y_true).astype(np.int64).ravel()
    y_predicted = np.asarray(y_predicted).astype(np.int64).ravel()
    if y_true.size != y_predicted.size:
        raise ValueError(
            f"y_true and y_predicted must have the same length, "
            f"got {y_true.size} and {y_predicted.size}"
        )
    if y_predicted.size == 0:
        raise ValueError("cluster_accuracy requires at least one sample")

    if cluster_number is None:
        # assume labels are 0-indexed
        cluster_number = int(max(y_predicted.max(), y_true.max())) + 1

    # count_matrix[i, j] = number of samples predicted as cluster i with true label j.
    count_matrix = np.zeros((cluster_number, cluster_number), dtype=np.int64)
    # np.add.at accumulates repeated index pairs correctly (plain fancy-index += does not).
    np.add.at(count_matrix, (y_predicted, y_true), 1)

    # linear_sum_assignment minimises cost, so counts are flipped into costs.
    row_ind, col_ind = linear_sum_assignment(count_matrix.max() - count_matrix)
    reassignment = dict(zip(row_ind, col_ind))
    accuracy = count_matrix[row_ind, col_ind].sum() / y_predicted.size
    return reassignment, accuracy

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel

In [None]:
# Load the pretrained tokenizer and encoder and move the encoder to the GPU.
# `return_dict=True` is only passed to the model: it makes the model return an
# output object (read later through `.last_hidden_state`); the tokenizer does
# not take it.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased', return_dict=True)
distilbert_model.to('cuda')

# 20 Newsgroups

In [None]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
# Fetch four of the 20 Newsgroups categories with the 'headers', 'footers' and
# 'quotes' parts removed from every post.
data = fetch_20newsgroups(
    categories=['comp.graphics', 'talk.religion.misc', 'rec.autos', 'sci.med'],
    remove=('headers', 'footers', 'quotes')
)
# `texts` holds the posts, `labels` the matching category targets.
texts, labels = data.data, data.target

In [None]:
#from stop_words import get_stop_words
#from sklearn.datasets import fetch_20newsgroups
#import re
#import spacy
#
#
## create syntethic texts
#data = fetch_20newsgroups(
#    categories=['comp.graphics', 'talk.religion.misc'],
#    remove=('headers', 'footers', 'quotes')
#)
#texts, labels = data.data , data.target
#
#cleaned_texts = []
#stopwords = get_stop_words('en')
#
#nlp = spacy.load('en_core_web_sm')
#
#for text in texts:
#    #tokens = [token for token in re.findall(r'\w+', text) if token not in stopwords]
#    
#    doc = nlp(text)
#    noun_phrases = [str(np) for np in doc.noun_chunks]
#    
#    cleaned_texts.append(" ".join(noun_phrases))
#texts = cleaned_texts

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the classes by random under-sampling. The fixed seed keeps the
# selected subset identical across kernel restarts.
under_sampler = RandomUnderSampler(random_state=42)
# The sampler works on a 2-D feature matrix, so each text is wrapped in a
# one-element row and unwrapped again afterwards.
texts, labels = under_sampler.fit_resample([[t] for t in texts], labels)
texts = [t[0] for t in texts]



# IMDB Reviews

In [None]:
#import tensorflow_datasets as tfds
#from itertools import chain
#
#train_ds = tfds.load('imdb_reviews', split='train', shuffle_files=True)
#test_ds = tfds.load('imdb_reviews', split='test', shuffle_files=True)
#
#texts, labels = [], []
#
#for ds in (train_ds, test_ds):
#    for example in tfds.as_numpy(ds):
#        text, label = example['text'], example['label']
#        texts.append(str(text))
#        labels.append(label)
#
#labels = np.array(labels)

In [None]:
len(texts), len(labels)

In [None]:
# Debug
#from sklearn.model_selection import train_test_split
#texts, _, labels, _ = train_test_split(texts, labels, test_size=0.75)
#len(texts), np.unique(labels, return_counts=True)

In [None]:
from sklearn.model_selection import train_test_split


# train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=, random_state=42)
# The complete data set is used for training; the test split is a single
# placeholder sample so that a test dataset and loader can still be built.
train_texts, train_labels = texts, labels
test_texts, test_labels = ["DUMMY"], [0]

In [None]:
from tqdm import tqdm
# Embed every training text with the pretrained encoder, one text per forward pass.
embeddings = []
# Inference only: disabling autograd avoids building a graph for every text.
with torch.no_grad():
    for text in tqdm(train_texts):
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        inputs = inputs.to('cuda')
        outputs = distilbert_model(**inputs)
        # Hidden state at the first token position is used as the text embedding.
        cls_embedding = outputs.last_hidden_state[:, 0, :].flatten().cpu().detach().numpy()
        embeddings.append(cls_embedding)

In [None]:
from sklearn.cluster import KMeans
# One cluster per distinct label; the fixed seed makes the initial centroids
# (and therefore the whole fine-tuning run) reproducible.
kmeans = KMeans(n_clusters=np.unique(labels).shape[0], n_init=20, random_state=42)

kmeans_cluster = kmeans.fit_predict(embeddings)
# The fitted centres become the starting centroids of the clustering layer.
kmeans_centroids = torch.from_numpy(kmeans.cluster_centers_)
kmeans_centroids = kmeans_centroids.to('cuda')
kmeans_centroids, kmeans_centroids.shape

In [None]:
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score

# Baseline quality of the plain k-means assignment against the true labels:
# clustering accuracy, normalized mutual information and adjusted Rand index.
print(cluster_accuracy(train_labels, kmeans_cluster)[1])
print(normalized_mutual_info_score(train_labels, kmeans_cluster))
print(adjusted_rand_score(train_labels, kmeans_cluster))

In [None]:
# Wrap the encoder with a clustering head initialised from the k-means centroids.
cluster_model = DistilBertForClustering2(distilbert_model=distilbert_model, initial_centroids=kmeans_centroids)
# NOTE(review): train() sets requires_grad to False on the encoder parameters at
# the start of every call, which overrides this line — confirm which is intended.
cluster_model.distilbert_model.requires_grad_(True)
cluster_model.to('cuda')

In [None]:
from torch.utils.data import Dataset, DataLoader


class TextDataset(Dataset):
    """
    Pairs raw texts with their class labels for use with a DataLoader.

    Tokenisation is not done here; items are returned as (text, label) and the
    caller tokenises whole batches.
    """

    def __init__(self, texts, labels, tokenizer):
        """
        :param texts: sequence of strings
        :param labels: sequence of integer class labels, same length as `texts`
        :param tokenizer: stored for callers; not used by the dataset itself
        :raises ValueError: if `texts` and `labels` differ in length
        """
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.tokenizer = tokenizer
        # Integer dtype: the labels are used as indices when scoring the clustering.
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, index):
        # Read from the instance, not from a notebook-level `labels` variable,
        # so every dataset returns its own labels.
        return self.texts[index], self.labels[index]

    def __len__(self):
        return len(self.texts)

In [None]:
# Datasets over the training texts and the placeholder test sample.
train_data = TextDataset(train_texts, train_labels, tokenizer=tokenizer)
test_data = TextDataset(test_texts, test_labels, tokenizer=tokenizer)

In [None]:
# Batches of 16 samples, reshuffled on every pass over the loader.
train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=16, shuffle=True)

In [None]:
# AdamW over every parameter of cluster_model (encoder and centroids); the SGD
# variants below are earlier alternatives that are currently disabled.
#optimizer = torch.optim.SGD(params=cluster_model.parameters(), lr=1e-03)
#optimizer = torch.optim.SGD(params=cluster_model.parameters(), lr=0.01, momentum=0.9)
optimizer = torch.optim.AdamW(params=cluster_model.parameters(), lr=0.00001)

In [None]:
from tqdm import tqdm
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score

def kld(target, pred):
    """
    Row-wise KL divergence between `target` and `pred`, averaged over the rows.

    A small constant (1e-6) is added to `pred` before the division.
    """
    smoothed_pred = pred + 1e-6
    per_row = torch.sum(target * torch.log(target / smoothed_pred), dim=1)
    return torch.mean(per_row)
    
def kl_divergence(target, pred):
    """Row-wise KL divergence between `target` and `pred`, averaged over the rows."""
    log_ratio = torch.log(target / pred)
    per_row = (target * log_ratio).sum(dim=1)
    return per_row.mean()

# KL-divergence loss summed over all elements; train() calls it with q.log() as
# the input and p as the target.
loss_fn = nn.KLDivLoss(reduction='sum')


def train(epoch):
    """
    Run one training epoch over `train_loader`, then score the clustering on the same data.

    Relies on the notebook-level objects `cluster_model`, `tokenizer`,
    `train_loader`, `optimizer` and `loss_fn`. The encoder parameters are
    frozen at the start of every call.

    :param epoch: zero-based epoch index, only used for the progress-bar text
    :return: array with the predicted cluster of every sample, in the order of
        `train_loader.dataset` (not in shuffled batch order)
    """
    # Follow the model's device instead of hard-coding one.
    device = next(cluster_model.parameters()).device

    cluster_model.train()  # training mode: layers such as dropout are active
    for param in cluster_model.distilbert_model.parameters():
        param.requires_grad = False

    train_pbar = tqdm(train_loader)
    for texts, _ in train_pbar:
        inputs = tokenizer(
            texts,
            return_tensors='pt',
            padding=True,
            truncation=True
        )
        inputs = inputs.to(device)
        q, p = cluster_model(inputs)
        # The target distribution is a fixed target: gradients must only flow through q.
        loss = loss_fn(q.log(), p.detach())
        optimizer.zero_grad()  # clear gradients from the previous step
        loss.backward()  # backpropagate through the model
        optimizer.step()  # update the weights using the computed gradients
        train_pbar.set_description(f'Epoch {epoch + 1} | Loss {loss.item()}')

    # Evaluate in dataset order. Iterating the shuffled training loader would
    # return predictions in a random order that no longer lines up with
    # `train_texts` or the embeddings computed from it.
    eval_loader = DataLoader(
        dataset=train_loader.dataset,
        batch_size=train_loader.batch_size,
        shuffle=False
    )

    true_labels = []
    predicted_labels = []
    with torch.no_grad():
        cluster_model.eval()
        for texts, labels in tqdm(eval_loader):
            inputs = tokenizer(
                texts,
                return_tensors='pt',
                padding=True,
                truncation=True
            )
            inputs = inputs.to(device)
            q, _ = cluster_model(inputs)

            # Hard assignment: the cluster with the highest soft assignment.
            predicted_labels.extend(q.argmax(dim=1).cpu().detach().numpy())
            true_labels.extend(labels.cpu().detach().numpy())

    true_labels = np.array(true_labels).flatten()
    predicted_labels = np.array(predicted_labels).flatten()

    print('#'*60)
    print(normalized_mutual_info_score(true_labels, predicted_labels))
    print(cluster_accuracy(true_labels, predicted_labels))
    print(adjusted_rand_score(true_labels, predicted_labels))
    return predicted_labels

In [None]:
# Snapshot of the encoder weights before training. A plain list is used because
# the parameters do not all share one shape, which np.array cannot pack into a
# regular array; .copy() guarantees the snapshot never aliases the live weights.
start_params = [param.detach().cpu().numpy().copy() for param in distilbert_model.parameters()]

In [None]:
# Train for five epochs and keep the predictions of every epoch.
predictions = []
for epoch in range(5):
    # Call train() once per epoch; calling it again to fill `predictions` would
    # run a second, unrecorded training pass for the same epoch number.
    predicted_labels = train(epoch)
    predictions.append(predicted_labels)

In [None]:
# Persist the complete module objects (not just state dicts). time.time() is
# evaluated separately for each file, so the two timestamps differ slightly.
torch.save(cluster_model, f"cluster_model_{time.time()}.bin")
torch.save(distilbert_model, f"distilbert_model_{time.time()}.bin")

In [None]:
from tqdm import tqdm
# Embed every training text again, now with the encoder as it is after training.
tuned_embeddings = []
# Inference only: disabling autograd avoids building a graph for every text.
with torch.no_grad():
    for text in tqdm(train_texts):
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        inputs = inputs.to('cuda')
        outputs = distilbert_model(**inputs)
        # Hidden state at the first token position is used as the text embedding.
        cls_embedding = outputs.last_hidden_state[:, 0, :].flatten().cpu().detach().numpy()
        tuned_embeddings.append(cls_embedding)

In [None]:
from sklearn.decomposition import PCA
# Project both embedding sets to two dimensions for plotting.
pca = PCA(n_components=2)
Xr = pca.fit_transform(embeddings)
# NOTE(review): fit_transform refits the PCA on the tuned embeddings, so Xrt is
# not expressed in the same axes as Xr — confirm that this is intended.
Xrt = pca.fit_transform(tuned_embeddings)

In [None]:
import seaborn as sns

untuned_plot = sns.scatterplot(x=Xr[:,0], y=Xr[:,1], hue=[f'C{i}' for i in predicted_labels])

In [None]:
untuned_plot.get_figure().savefig(f'untuned_plot_{time.time()}.png')

In [None]:
tuned_plot = sns.scatterplot(x=Xrt[:,0], y=Xrt[:,1], hue=[f'C{i}' for i in predicted_labels])

In [None]:
tuned_plot.get_figure().savefig(f'tuned_plot_{time.time()}.png')

In [None]:
raise Exception("Stop here!")

In [None]:
# Snapshot of the encoder weights after training, for comparison with the
# earlier snapshot. A plain list is used because the parameters do not all
# share one shape, which np.array cannot pack into a regular array.
end_params = [param.detach().cpu().numpy().copy() for param in distilbert_model.parameters()]

In [None]:
start_params[0] == end_params[0]

In [None]:
end_params[0][start_params[0] == end_params[0]]

In [None]:
cluster_model.distilbert_model.requires_grad_(True)

In [None]:
predicted_labels[predicted_labels != 0]

In [None]:
# Run a single text through the clustering model to inspect its q and p.
index = 100
print(texts[index])
inputs = tokenizer(texts[index], return_tensors='pt', padding=True, truncation=True)
inputs = inputs.to('cuda')
q, p = cluster_model(inputs)

In [None]:
q, p

In [None]:
loss_fn(q.log(), p)

In [None]:
p.sum()

In [None]:
np.array(embeddings)

In [None]:
from scipy.spatial.distance import pdist, euclidean, squareform

distances = squareform(pdist(embeddings))

In [None]:
distances[0, :100]

In [None]:
from sklearn.datasets import make_blobs, make_classification, make_circles

# Synthetic 3-class data set for testing the clustering layer without the
# encoder; the fixed seed makes the generated samples reproducible.
X, y = make_classification(n_samples=100000, n_classes=3, n_clusters_per_class=3, n_features=20, n_informative=10, random_state=42)


In [None]:
from sklearn.cluster import KMeans

# k-means on the synthetic data; the fixed seed makes the centroids reproducible.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_pred = kmeans.fit_predict(X)

# NumPy array of cluster centres (rebinds the name used earlier for the text centroids).
kmeans_centroids = kmeans.cluster_centers_

In [None]:
cluster_accuracy(y, kmeans_pred)

In [None]:
import seaborn as sns
from sklearn.decomposition import PCA
# Two-dimensional PCA view of the synthetic data, coloured by the true class.
pca = PCA(n_components=2)

Xr = pca.fit_transform(X)
sns.scatterplot(x=Xr[:,0], y=Xr[:,1], hue=[f'C{i}' for i in y])

In [None]:
def train_clusteringlayer(X, y, clustering_layer, optimizer, loss_fn):
    """
    Perform one full-batch optimisation step of a clustering layer.

    :param X: tensor of samples passed to `clustering_layer`
    :param y: true labels, only used to report the clustering accuracy
    :param clustering_layer: module mapping `X` to soft assignments q
    :param optimizer: optimizer over the parameters of `clustering_layer`
    :param loss_fn: loss called as `loss_fn(q.log(), p)`
    :return: tuple (p, q, loss value, clustering accuracy)
    """

    def target_probability_distribution(q):
        # Square q, divide by the per-cluster column sums and renormalise each row.
        weight = (q ** 2) / torch.sum(q, 0)
        return (weight.t() / torch.sum(weight, 1)).t()

    clustering_layer = clustering_layer.train()
    q = clustering_layer(X)

    # The target distribution is a fixed target: detaching it keeps the gradient
    # of the loss restricted to q.
    p = target_probability_distribution(q).detach()

    loss = loss_fn(q.log(), p)

    optimizer.zero_grad()  # clear gradients from the previous step
    loss.backward()  # backpropagate through the layer
    optimizer.step()

    predicted_label = q.argmax(dim=1).cpu().detach().numpy()
    _, accuracy = cluster_accuracy(y, predicted_label)

    return p, q, loss.item(), accuracy

In [None]:
import torch
from torch import nn
from sklearn.cluster import KMeans

class ClusteringLayer3(nn.Module):
    """
    Soft cluster assignment layer whose centroids are projected by a linear layer.

    The trainable centroids have `in_features` dimensions and are mapped to
    `hidden_dim` dimensions before being compared with the inputs, so the
    inputs must already have `hidden_dim` features. The linear layer is in
    double precision, so the centroids must be float64 as well.
    """

    def __init__(self, initial_centroids, alpha=1.0, in_features=768, hidden_dim=100):
        """
        :param initial_centroids: tensor of shape (n_clusters, in_features)
        :param alpha: degrees of freedom of the Student's t kernel
        :param in_features: dimensionality of the centroids
        :param hidden_dim: dimensionality of the projected centroids and of the inputs
        """
        super(ClusteringLayer3, self).__init__()
        self.linear_layer = nn.Linear(in_features, hidden_dim).double()
        # Copy the centroids: nn.Parameter would otherwise share storage with the
        # caller's tensor and every optimizer step would silently overwrite it.
        self.weights = nn.Parameter(initial_centroids.detach().clone())
        self.alpha = alpha

    def forward(self, inputs):
        """
        Takes a batch of input embeddings of shape (batch_size, hidden_dim)
        and computes the soft cluster assignment based on the projected centroids.

        :return: tensor of shape (batch_size, n_clusters) whose rows sum to one
        """
        embedded_weights = self.linear_layer(self.weights)

        norm_squared = torch.sum((inputs.unsqueeze(1) - embedded_weights) ** 2, 2)
        numerator = 1.0 / (1.0 + (norm_squared / self.alpha))
        power = float(self.alpha + 1) / 2
        numerator = numerator ** power
        # Normalise per sample so that each row is a probability distribution.
        return numerator / torch.sum(numerator, dim=1, keepdim=True)

    def __repr__(self):
        return str(self.weights.shape)

In [None]:
class TestModel(nn.Module):
    """
    Small test network: linear projection, ReLU, then `ClusteringLayer3`.

    The model is created on the CPU in double precision; move it with
    `.cuda()` / `.to(device)` after construction.
    """

    def __init__(self, initial_centroids, in_features=768, hidden_dim=100):
        """
        :param initial_centroids: tensor of shape (n_clusters, in_features)
        :param in_features: dimensionality of the input samples and of the centroids
        :param hidden_dim: dimensionality after the linear projection
        """
        super(TestModel, self).__init__()
        # No device is chosen here: the clustering layer is not moved in its own
        # constructor either, so a working model is always moved as a whole.
        self.linear_layer = nn.Linear(in_features, hidden_dim).double()
        self.relu = nn.ReLU()
        # Forward the sizes so non-default dimensions stay consistent between
        # the projection of the inputs and the projection of the centroids.
        self.clustering_layer = ClusteringLayer3(
            initial_centroids=initial_centroids,
            in_features=in_features,
            hidden_dim=hidden_dim
        )

    def forward(self, X):
        """Project `X`, apply ReLU and return the soft cluster assignments."""
        return self.clustering_layer(self.relu(self.linear_layer(X)))

In [None]:
from tqdm import tqdm



initial_centroids = torch.from_numpy(kmeans_centroids).cuda()
clustering_layer = ClusteringLayer2(initial_centroids=initial_centroids).cuda()
#test_model = TestModel(initial_centroids=initial_centroids).cuda()

loss_fn = nn.KLDivLoss(reduction='sum')
optimizer = torch.optim.AdamW(params=clustering_layer.parameters(), lr=0.001)

# Keep the NumPy array `X` untouched and use a separate name for the GPU copy.
# Rebinding `X` to a tensor would make this cell fail when it is run a second time.
X_gpu = torch.from_numpy(X).cuda()

losses = []
acc = []
pbar = tqdm(range(2500))
for i in pbar:
    p, q, loss, accuracy = train_clusteringlayer(X=X_gpu, y=y,
                                                 clustering_layer=clustering_layer,
                                                 optimizer=optimizer,
                                                 loss_fn=loss_fn)

    losses.append(loss)
    acc.append(accuracy)
    pbar.set_description(f'Accuracy {accuracy} | Loss {loss}')

In [None]:
np.unique(q.argmax(dim=1).cpu().detach().numpy(), return_counts=True)

In [None]:
import matplotlib.pyplot as plt
plt.plot(losses)

In [None]:
plt.plot(acc)

In [None]:
raise Exception("Stop here")

In [21]:
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
import torch

# Minimal masked-language-model example: the input ids are also passed as
# labels, so the output carries a loss in addition to the prediction logits.
# Note that this rebinds the notebook-level `tokenizer`.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased', return_dict=True)

input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]

outputs = model(input_ids, labels=input_ids)
loss = outputs.loss
prediction_logits = outputs.logits

In [28]:
model.base_model(input_ids)

BaseModelOutput(last_hidden_state=tensor([[[-1.8296e-01, -7.4054e-02,  5.0267e-02,  ..., -1.1261e-01,
           4.4493e-01,  4.0941e-01],
         [ 7.0632e-04,  1.4825e-01,  3.4328e-01,  ..., -8.6040e-02,
           6.9475e-01,  4.3353e-02],
         [-5.0721e-01,  5.3086e-01,  3.7163e-01,  ..., -5.6287e-01,
           1.3756e-01,  2.8475e-01],
         ...,
         [-4.2251e-01,  5.7314e-02,  2.4338e-01,  ..., -1.5223e-01,
           2.4462e-01,  6.4155e-01],
         [-4.9384e-01, -1.8895e-01,  1.2641e-01,  ...,  6.3241e-02,
           3.6913e-01, -5.8252e-02],
         [ 8.3269e-01,  2.4948e-01, -4.5440e-01,  ...,  1.1998e-01,
          -3.9257e-01, -2.7785e-01]]], grad_fn=<NativeLayerNormBackward>), hidden_states=None, attentions=None)