In [1]:
import os
import numpy as np
import pandas as pd
import random
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

from sklearn.metrics import roc_auc_score

import sys
sys.path.append("/home/sondors/Documents/price/ColBERT")

from interface import load_model#, get_query_emb_batch
from typing import List, Dict
from colbert.modeling.checkpoint import Checkpoint

import wandb

from transformers import AdamW, get_linear_schedule_with_warmup

class Encoder(nn.Module):
    def __init__(self, input_dim=(32, 768), latent_dim=128):
        super(Encoder, self).__init__()
        self.fc1 = nn.Linear(input_dim[0] * input_dim[1], 1024)  # Flatten input
        self.fc2 = nn.Linear(1024, 256)
        self.fc31 = nn.Linear(256, latent_dim)
        self.fc32 = nn.Linear(256, latent_dim)

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten the matrix to a vector
        h = F.relu(self.fc1(x))
        h = F.relu(self.fc2(h))
        return self.fc31(h), self.fc32(h)

class Decoder(nn.Module):
    def __init__(self, latent_dim=128, output_dim=(32, 768)):
        super(Decoder, self).__init__()
        self.fc1 = nn.Linear(latent_dim, 256)
        self.fc2 = nn.Linear(256, 1024)
        self.fc3 = nn.Linear(1024, output_dim[0] * output_dim[1])
        self.output_dim = output_dim

    def forward(self, z):
        h = F.relu(self.fc1(z))
        h = F.relu(self.fc2(h))
        return torch.sigmoid(self.fc3(h)).view(-1, self.output_dim[0], self.output_dim[1])  # Reshape to original

class VAE(nn.Module):
    def __init__(self):
        super(VAE, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5*log_var)
        eps = torch.randn_like(std)
        return mu + eps*std

    def forward(self, x):
        mu, log_var = self.encoder(x)
        z = self.reparameterize(mu, log_var)
        return self.decoder(z), mu, log_var

class SiameseNetwork(nn.Module):
    def __init__(self, latent_dim=128):
        super(SiameseNetwork, self).__init__()
        self.fc1 = nn.Linear(latent_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, z1, z2):
        h1 = F.relu(self.fc1(z1))
        h2 = F.relu(self.fc1(z2))
        diff = torch.abs(h1 - h2)
        out = self.fc2(diff)
        return self.fc3(out).squeeze(1)  # Ensure this line produces a shape of [batch_size]

def get_query_emb(sentences: List[str], checkpoint: Checkpoint, batch_size: int) -> torch.Tensor:
    with torch.no_grad():
        return checkpoint.queryFromText(sentences, bsize=batch_size)#.to("cpu").numpy()

def get_query_emb_batch(sentences: List[str], checkpoint: Checkpoint, batch_size: int, batch_size2: int) -> torch.Tensor:
    embeddings_list = []
    
    for i in range(0, len(sentences), batch_size2):
        batch_sentences = sentences[i:i+batch_size2]
        with torch.no_grad():
            embeddings = torch.tensor(get_query_emb(batch_sentences, checkpoint, batch_size), dtype=torch.float32)
        embeddings_list.append(embeddings)
    
    combined_embeddings = torch.cat(embeddings_list, dim=0)
    return combined_embeddings

def vae_loss(recon_x, x, mu, log_var):
    BCE = F.binary_cross_entropy_with_logits(recon_x, x, reduction='sum')
    KLD = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
    return BCE + KLD
def save_model(vae, siamese, optimizer, epoch, path='./models'):
    if not os.path.exists(path):
        os.makedirs(path)
    torch.save({
        'epoch': epoch,
        'vae_state_dict': vae.state_dict(),
        'siamese_state_dict': siamese.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, os.path.join(path, f'checkpoint_epoch_{epoch}.pth'))


In [2]:

ckpt_pth = "/home/sondors/Documents/ColBERT_weights/2801_lr04_bsize_210_apple/none/2024-04/18/09.16.10/checkpoints/colbert-187-finish"
experiment = "colbert-187-finish"

doc_maxlen = 300
nbits = 2
nranks = 1
kmeans_niters = 4

device = "cuda"
checkpoint = load_model(ckpt_pth, doc_maxlen, nbits, kmeans_niters, device)

pth_models = "/home/sondors/Documents/price/BERT_data/data/17-04-2024_Timofey/2801_offers_models_Apple.csv"
pth_offers = '/home/sondors/Documents/price/BERT_data/data/17-04-2024_Timofey/2801_Apple_triplets_offer_model_train.csv'



In [3]:

class MyDataset(Dataset):
    def __init__(self, offers: List[str], true_matches: List[str], false_matches: List[str], checkpoint: Checkpoint, batch_size: int, batch_size2: int):
        self.offers = offers
        self.true_matches = true_matches
        self.false_matches = false_matches
        self.checkpoint = checkpoint
        self.batch_size = batch_size
        self.batch_size2 = batch_size2

    def __len__(self):
        return len(self.offers)

    def __getitem__(self, idx):
        offer_embs = get_query_emb_batch([self.offers[idx]], self.checkpoint, batch_size=self.batch_size, batch_size2=self.batch_size2)
        true_match_embs = get_query_emb_batch([self.true_matches[idx]], self.checkpoint, batch_size=self.batch_size, batch_size2=self.batch_size2)
        false_match_embs = get_query_emb_batch([self.false_matches[idx]], self.checkpoint, batch_size=self.batch_size, batch_size2=self.batch_size2)

        y_true = torch.ones(len(true_match_embs))
        y_false = torch.zeros(len(false_match_embs))

        X = torch.cat([offer_embs, offer_embs], dim=0)
        X_pair = torch.cat([true_match_embs, false_match_embs], dim=0)
        y = torch.cat([y_true, y_false], dim=0)

        return X, X_pair, y
    
def load_data(pth_models, pth_offers, checkpoint, batch_size, batch_size2):
    id_category = {
        2801: 'мобильные телефоны'
    }
    
    df_models = pd.read_csv(pth_models, sep=';')
    df_offers = pd.read_csv(pth_offers, sep=';')
    df_offers = df_offers[:1000]

    df_models = df_models[df_models['category_id'].isin(id_category.keys())].reset_index(drop=True)
    df_offers = df_offers[df_offers['category_id'].isin(id_category.keys())].reset_index(drop=True)

    df_offers_shuffled = df_offers.sample(frac=1, random_state=42)

    test_size = int(0.15 * len(df_offers_shuffled))

    df_train = df_offers_shuffled.iloc[:-test_size]
    df_test = df_offers_shuffled.iloc[-test_size:]

    print("Размер тренировочной выборки:", len(df_train))
    print("Размер тестовой выборки:", len(df_test))

    offer_batch = list(df_train['name'])
    true_match_batch = list(df_train['true_match'])
    false_match_batch = list(df_train['false_match'])
    train_dataset = MyDataset(offer_batch, true_match_batch, false_match_batch, checkpoint, batch_size, batch_size2)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    offer_batch = list(df_test['name'])
    true_match_batch = list(df_test['true_match'])
    false_match_batch = list(df_test['false_match'])
    test_dataset = MyDataset(offer_batch, true_match_batch, false_match_batch, checkpoint, batch_size, batch_size2)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    return train_dataloader, test_dataloader

batch_size = 100
batch_size2 = 100000#3000
train_dataloader, test_dataloader = load_data(pth_models, pth_offers, checkpoint, batch_size, batch_size2)

Размер тренировочной выборки: 850
Размер тестовой выборки: 150


In [4]:

vae = VAE().to(device)
siamese = SiameseNetwork().to(device)


epochs = 15
initial_lr = 0.01
n_batches = len(train_dataloader)

params = list(vae.parameters()) + list(siamese.parameters())
optimizer = AdamW(params, lr=initial_lr, eps=1e-8)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)  # Scheduler definition
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=epochs*n_batches)

scaler = torch.cuda.amp.GradScaler()
criterion = nn.BCEWithLogitsLoss()




In [5]:

def validate(vae, siamese, dataloader, device):
    vae.eval()
    siamese.eval()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    mean_vae_time = 0
    mean_seamese_time = 0

    all_predictions = []
    all_targets = []

    criterion = nn.BCEWithLogitsLoss()

    with torch.no_grad():
        for f, g, similarity in dataloader:
            f, g, similarity = f.to(device).reshape(-1, 32, 768), g.to(device).reshape(-1, 32, 768), similarity.to(device).reshape(-1)

            recon_f, mu_f, log_var_f = vae(f)
            recon_g, mu_g, log_var_g = vae(g)

            vae_time_start = time.time()
            loss_vae_f = vae_loss(recon_f, f, mu_f, log_var_f)
            loss_vae_g = vae_loss(recon_g, g, mu_g, log_var_g)
            mean_vae_time += time.time() - vae_time_start

            siamese_time_start = time.time()
            similarity_score = siamese(vae.reparameterize(mu_f, log_var_f), vae.reparameterize(mu_g, log_var_g))
            mean_seamese_time += time.time() - siamese_time_start

            loss_siamese = criterion(similarity_score, similarity.view(-1))

            loss = loss_vae_f + loss_vae_g + loss_siamese

            total_loss += loss.item()

            predicted_labels = (similarity_score > 0).float()
            correct_predictions += (predicted_labels == similarity.view(-1)).sum().item()
            total_samples += similarity.size(0)

            all_predictions.extend(similarity_score.detach().cpu().numpy())
            all_targets.extend(similarity.cpu().numpy())

    average_loss = total_loss / len(dataloader)
    accuracy = correct_predictions / total_samples
    roc_auc = roc_auc_score(all_targets, all_predictions)
    mean_vae_time = mean_vae_time / total_samples
    mean_seamese_time = mean_seamese_time / total_samples

    return average_loss, accuracy, roc_auc, mean_vae_time, mean_seamese_time


In [6]:
batch_size

100

In [8]:


vae.train()
siamese.train()
total_train_loss = 0

batch_time = time.time()
for batch_index, (f, g, similarity) in enumerate(train_dataloader):
    # print(f"f = {f}")
    print(f"np.shape(f) = {np.shape(f)}")
    # print(f"g = {g}")
    print(f"np.shape(g) = {np.shape(g)}")
    # print(f"similarity = {similarity}")
    print(f"np.shape(similarity) = {np.shape(similarity)}")

    f, g, similarity = f.to(device).reshape(-1, 32, 768), g.to(device).reshape(-1, 32, 768), similarity.to(device).reshape(-1)
    
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        recon_f, mu_f, log_var_f = vae(f)
        recon_g, mu_g, log_var_g = vae(g)

        # print(f"f = {f}")
        print(f"np.shape(f) = {np.shape(f)}")
        # print(f"g = {g}")
        print(f"np.shape(g) = {np.shape(g)}")
        # print(f"similarity = {similarity}")
        print(f"np.shape(similarity) = {np.shape(similarity)}")

        loss_vae_f = vae_loss(recon_f, f, mu_f, log_var_f)
        loss_vae_g = vae_loss(recon_g, g, mu_g, log_var_g)

        similarity_score = siamese(vae.reparameterize(mu_f, log_var_f), vae.reparameterize(mu_g, log_var_g))
        loss_siamese = criterion(similarity_score, similarity.view(-1))

        total_loss = loss_vae_f + loss_vae_g + loss_siamese
        break

  embeddings = torch.tensor(get_query_emb(batch_sentences, checkpoint, batch_size), dtype=torch.float32)


np.shape(f) = torch.Size([100, 2, 32, 768])
np.shape(g) = torch.Size([100, 2, 32, 768])
np.shape(similarity) = torch.Size([100, 2])
np.shape(f) = torch.Size([200, 32, 768])
np.shape(g) = torch.Size([200, 32, 768])
np.shape(similarity) = torch.Size([200])


In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# class SiameseNetwork(nn.Module):
#     def __init__(self, latent_dim=768):
#         super(SiameseNetwork, self).__init__()
#         self.fc1 = nn.Linear(latent_dim, 64)
#         self.fc2 = nn.Linear(64, 32)
#         self.fc3 = nn.Linear(32, 1)
#         self.fc4 = nn.Linear(32, 1)
#         # self.fc3 = nn.Linear(1)

#     def forward(self, z1, z2):
#         h1 = F.relu(self.fc1(z1))
#         h2 = F.relu(self.fc1(z2))
#         diff = torch.abs(h1 - h2)
#         token_by_token = self.fc2(diff)
#         all_tokens_together = self.fc3(token_by_token.squeeze(1))
#         # return out
#         return self.fc4(all_tokens_together)#.squeeze(1)  # Ensure this line produces a shape of [batch_size]
#         # return F.sigmoid(self.fc3(out).squeeze(1))  # Ensure this line produces a shape of [batch_size]
    
class SiameseNetwork(nn.Module):
    def __init__(self, latent_dim=768):
        super(SiameseNetwork, self).__init__()
        self.fc1 = nn.Linear(latent_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, z1, z2):
        print(f"z1 shape (input): {z1.shape}")  # [batch_size, sequence_length, embedding_dim]
        h1 = F.relu(self.fc1(z1))
        print(f"h1 shape: {h1.shape}")  # [batch_size, sequence_length, 64]
        
        print(f"z2 shape (input): {z2.shape}")  # [batch_size, sequence_length, embedding_dim]
        h2 = F.relu(self.fc1(z2))
        print(f"h2 shape: {h2.shape}")  # [batch_size, sequence_length, 64]
        
        diff = torch.abs(h1 - h2)
        print(f"diff shape: {diff.shape}")  # [batch_size, sequence_length, 64]
        
        out = F.relu(self.fc2(diff))
        print(f"out shape (after fc2): {out.shape}")  # [batch_size, sequence_length, 32]
        
        out = self.fc3(out)
        print(f"out shape (after fc3): {out.shape}")  # [batch_size, sequence_length, 1]
        
        out = out.squeeze(2)  # Squeeze the last dimension
        print(f"out shape (final output): {out.shape}")  # [batch_size, sequence_length]
        return out
    
# Инициализация модели, функции потерь и оптимизатора
similarity_model = SiameseNetwork().to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(similarity_model.parameters(), lr=0.001)


# Обучающий цикл
num_epochs = 10
for epoch in range(num_epochs):
    for batch_index, (f, g, similarity) in enumerate(train_dataloader):
        # print(f"f = {f}")
        # print(f"np.shape(f) = {np.shape(f)}")
        # # print(f"g = {g}")
        # print(f"np.shape(g) = {np.shape(g)}")
        # # print(f"similarity = {similarity}")
        # print(f"np.shape(similarity) = {np.shape(similarity)}")

        f, g, similarity = f.to(device).reshape(-1, 32, 768), g.to(device).reshape(-1, 32, 768), similarity.to(device).reshape(-1)
        # print(f"f = {f}")
        # print(f"np.shape(f) = {np.shape(f)}")
        # # print(f"g = {g}")
        # print(f"np.shape(g) = {np.shape(g)}")
        # # print(f"similarity = {similarity}")
        # print(f"np.shape(similarity) = {np.shape(similarity)}")

        optimizer.zero_grad()
        outputs = similarity_model(f, g)
        # print(f"outputs = {outputs}")
        print(f"np.shape(outputs) = {np.shape(outputs)}")
        loss = criterion(outputs, similarity)
        loss.backward()
        optimizer.step()
        break
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    break

  embeddings = torch.tensor(get_query_emb(batch_sentences, checkpoint, batch_size), dtype=torch.float32)


z1 shape (input): torch.Size([200, 32, 768])
h1 shape: torch.Size([200, 32, 64])
z2 shape (input): torch.Size([200, 32, 768])
h2 shape: torch.Size([200, 32, 64])
diff shape: torch.Size([200, 32, 64])
out shape (after fc2): torch.Size([200, 32, 32])
out shape (after fc3): torch.Size([200, 32, 1])
out shape (final output): torch.Size([200, 32])
np.shape(outputs) = torch.Size([200, 32])
