# **Imports**

In [57]:
import os
import torch
from umap import UMAP
import glob
import torch
from sklearn.cluster import DBSCAN, AgglomerativeClustering, KMeans
import umap
import matplotlib.pyplot as plt
import numpy as np
from yellowbrick.cluster import KElbowVisualizer
import warnings
import torchvision.transforms as T
warnings.filterwarnings('ignore')

# **Values**

In [30]:
folder_path = "./vectorized_torch_emb_disp_up/vectorized_torch_emb/"

# **Clusterization**

In [66]:
word_embeddings = []
with torch.no_grad():
    for filename in os.listdir(folder_path):
        if filename.endswith('.pt'):
            embeddings = torch.load(os.path.join(folder_path, filename), weights_only=True)
            embeddings = T.transforms.F.resize(embeddings.unsqueeze(0), (34, 768))
            word_embeddings.append(embeddings)

    x = [i.shape[1] for i in word_embeddings]
    print(min(x), max(x))
    word_embeddings = torch.cat(word_embeddings, dim=0)
    embeddings_list = word_embeddings.squeeze().detach().numpy().mean(axis=1)
    umap_embeddings = umap.UMAP(n_neighbors=15, min_dist=0.000001, n_components=2, metric = "cosine").fit_transform(embeddings_list)

34 34


In [67]:
from sklearn.model_selection import train_test_split

c = AgglomerativeClustering(n_clusters=2).fit(umap_embeddings)
X_train, X_test, y_train, y_test = train_test_split(
    word_embeddings, 
    torch.tensor(c.labels_), 
    test_size=0.3, 
    random_state=42, 
    stratify=c.labels_
)

# Create dataset

In [68]:
from torch.utils.data import DataLoader, TensorDataset

In [69]:
dataset_train = TensorDataset(
    X_train, 
    torch.nn.functional.one_hot(y_train, 2)
)
dataloader_train = DataLoader(dataset_train, batch_size=32)


# Create model

In [70]:
torch.nn.Conv1d(768, 768, kernel_size=2, padding=0)(word_embeddings.transpose(1,2)).shape

torch.Size([218, 768, 33])

In [72]:
word_embeddings[0].shape

torch.Size([34, 768])

In [96]:
class Model(torch.nn.Module):
    def __init__(self, seq_len=2, emb_size=768):
        super().__init__()
        self.model = torch.nn.Sequential(
            torch.nn.Conv1d(34, 20, kernel_size=3, padding=1),
            torch.nn.Conv1d(20, 10, kernel_size=3, padding=1),
            torch.nn.Conv1d(10, 1, kernel_size=3, padding=1),
            torch.nn.Flatten(),
            torch.nn.Linear(768, 250),
            torch.nn.Linear(250, 128),
            torch.nn.Linear(128, 2),
            torch.nn.Softmax(dim=1),
        )
    def forward(self, x):
        return self.model(x)
    


In [103]:
model = Model()
loss = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [98]:
torch.nn.Conv1d(34, 20, kernel_size=3, padding=1)(x).shape

torch.Size([24, 20, 768])

In [102]:
lm = []
for e in range(20):
    for x, y in dataloader_train:
        x.detach().requir_grad = True
        y.detach().requir_grad = True
        o = model(x.float())
        l = loss(o, y.float())
        l.backward()
        lm.append(l.item())
        optimizer.zero_grad()
        optimizer.step()
    print(np.mean(lm))


0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495
0.25741976499557495


In [100]:
model(x)

tensor([[0.4974, 0.5026],
        [0.4970, 0.5030],
        [0.4971, 0.5029],
        [0.4974, 0.5026],
        [0.4974, 0.5026],
        [0.4974, 0.5026],
        [0.4971, 0.5029],
        [0.4974, 0.5026],
        [0.4956, 0.5044],
        [0.4974, 0.5026],
        [0.4974, 0.5026],
        [0.4974, 0.5026],
        [0.4937, 0.5063],
        [0.4950, 0.5050],
        [0.4974, 0.5026],
        [0.4950, 0.5050],
        [0.4974, 0.5026],
        [0.4950, 0.5050],
        [0.4961, 0.5039],
        [0.4937, 0.5063],
        [0.4937, 0.5063],
        [0.4971, 0.5029],
        [0.4950, 0.5050],
        [0.4974, 0.5026]], grad_fn=<SoftmaxBackward0>)