In [2]:
from transformers import BertModel, BertTokenizer, AutoTokenizer, AutoModel
# Romanian cased BERT checkpoint on the Hugging Face Hub; the tokenizer and
# the encoder weights must come from the same checkpoint.
checkpoint = "dumitrescustefan/bert-base-romanian-cased-v1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)
# The sentence-level demo that uses `tokenizer` and `model` lives in a later cell.


tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 2.62kB/s]
config.json: 100%|██████████| 385/385 [00:00<00:00, 32.9kB/s]
vocab.txt: 100%|██████████| 397k/397k [00:00<00:00, 1.78MB/s]
pytorch_model.bin: 100%|██████████| 500M/500M [00:55<00:00, 8.99MB/s] 


NameError: name 'torch' is not defined

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np

# NOTE(review): the frame used for training below is read from "test.csv" --
# confirm this is the intended split and not a held-out test set.
df = pd.read_csv("test.csv")

# Tokenizer and encoder are loaded from the same Romanian BERT checkpoint.
checkpoint = "dumitrescustefan/bert-base-romanian-cased-v1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)


class SatireDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes the title and content of each row.

    Every item is a dict with the padded token ids and attention masks of the
    row's ``title`` and ``content`` columns, plus its ``label`` as a float tensor.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        """Keep references to the frame, the tokenizer and the padding length.

        Args:
            dataframe: frame indexed positionally; rows must provide the
                'title', 'content' and 'label' columns.
            tokenizer: callable invoked as ``tokenizer(text, padding=...,
                truncation=..., max_length=..., return_tensors='pt')``.
            max_length: length every sequence is padded / truncated to.
        """
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors keyed by field."""
        row = self.data.iloc[idx]

        # Title and content are encoded independently with identical settings.
        tokenize_kwargs = dict(
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        title_enc = self.tokenizer(row['title'], **tokenize_kwargs)
        content_enc = self.tokenizer(row['content'], **tokenize_kwargs)

        # The tokenizer returns a leading batch axis of size 1; drop it so the
        # DataLoader can stack samples into a batch itself.
        sample = {}
        sample['input_ids_title'] = title_enc['input_ids'].squeeze(0)
        sample['attention_mask_title'] = title_enc['attention_mask'].squeeze(0)
        sample['input_ids_content'] = content_enc['input_ids'].squeeze(0)
        sample['attention_mask_content'] = content_enc['attention_mask'].squeeze(0)
        sample['label'] = torch.tensor(row['label'], dtype=torch.float)
        return sample

# Define the Siamese Network
class SiameseNetwork(nn.Module):
    """Siamese classifier that scores a pair of sequences with one shared encoder.

    Both inputs go through the same encoder. The first-token hidden states of
    the two sequences are compared with cosine similarity, and a one-feature
    linear layer followed by a sigmoid maps that similarity to a probability.

    Because ``forward`` already applies the sigmoid, its output is a probability
    and should be trained with ``nn.BCELoss`` (not ``nn.BCEWithLogitsLoss``).
    """

    def __init__(self, model):
        """
        Args:
            model: encoder module called as ``model(input_ids, attention_mask=...)``
                whose result exposes ``last_hidden_state``.
        """
        super().__init__()
        self.bert = model  # shared encoder used for both branches
        # The head consumes the scalar cosine similarity, so it has exactly one
        # input feature. A 768-wide layer cannot be applied to the
        # [batch_size, 1] similarity tensor built in forward().
        self.fc = nn.Linear(1, 1)

    def forward(self, input_ids_1, attention_mask_1, input_ids_2, attention_mask_2):
        """Return the predicted probability for each pair, shape [batch_size, 1].

        Args:
            input_ids_1, attention_mask_1: token ids and mask of the first sequence.
            input_ids_2, attention_mask_2: token ids and mask of the second sequence.
        """
        # Encode both inputs with the same weights
        output_1 = self.bert(input_ids_1, attention_mask=attention_mask_1)
        output_2 = self.bert(input_ids_2, attention_mask=attention_mask_2)

        # Hidden state at position 0 (the [CLS] slot for BERT-style encoders)
        emb_1 = output_1.last_hidden_state[:, 0, :]
        emb_2 = output_2.last_hidden_state[:, 0, :]

        # Cosine similarity over the feature axis -> [batch_size]
        cosine_sim = torch.nn.functional.cosine_similarity(emb_1, emb_2)

        # Learnable scale + bias on the similarity -> [batch_size, 1]
        logits = self.fc(cosine_sim.unsqueeze(1))

        return torch.sigmoid(logits)

# Initialize dataset and dataloaders
dataset = SatireDataset(df, tokenizer)
train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Pick the device once and reuse it for the model and for every batch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
siamese_model = SiameseNetwork(model)
siamese_model.to(device)

# Loss function and optimizer.
# SiameseNetwork.forward already ends with a sigmoid, so the loss has to accept
# probabilities: nn.BCELoss. nn.BCEWithLogitsLoss would apply a second sigmoid
# on top of the model's own and distort the loss and its gradients.
criterion = nn.BCELoss()
optimizer = optim.Adam(siamese_model.parameters(), lr=1e-5)

# Number of epochs
epochs = 3

# Training loop: one pass over train_dataloader per epoch, reporting the mean
# batch loss and the accuracy of the thresholded predictions.
for epoch in range(epochs):
    siamese_model.train()  # Set model to training mode
    running_loss = 0.0
    all_preds = []
    all_labels = []

    for batch in train_dataloader:
        # Move data to the selected device
        input_ids_title = batch['input_ids_title'].to(device)
        attention_mask_title = batch['attention_mask_title'].to(device)
        input_ids_content = batch['input_ids_content'].to(device)
        attention_mask_content = batch['attention_mask_content'].to(device)
        labels = batch['label'].to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass -> [batch_size, 1]
        output = siamese_model(input_ids_title, attention_mask_title, input_ids_content, attention_mask_content)

        # Drop only the trailing axis so a final batch of size 1 stays 1-D
        # (a bare squeeze() would collapse it to a 0-d tensor).
        probs = output.squeeze(1)

        # Compute the loss
        loss = criterion(probs, labels)

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        # Track loss and predictions. The predictions still carry autograd
        # history, so they must be detached before converting to NumPy.
        running_loss += loss.item()
        all_preds.extend(probs.detach().cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    # Compute training accuracy and loss for this epoch
    epoch_loss = running_loss / len(train_dataloader)
    all_preds = np.array(all_preds) > 0.5  # Convert to binary predictions (0 or 1)
    all_labels = np.array(all_labels)
    epoch_accuracy = accuracy_score(all_labels, all_preds)

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy*100:.2f}%")

# Save the trained model
torch.save(siamese_model.state_dict(), "siamese_model.pth")


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import torch


# Encode one sentence (special tokens included) and wrap the id list in an
# outer list so the tensor has a leading batch axis of size 1.
input_ids = torch.tensor(
    [tokenizer.encode("Acesta este un test.", add_special_tokens=True)]
)
outputs = model(input_ids)
# The first element of the model output is the last hidden state.
last_hidden_states = outputs[0]
print(last_hidden_states)

tensor([[[ 0.4894, -0.6289,  0.4323,  ...,  0.5619,  0.1866,  0.0627],
         [ 1.3910, -0.1289, -0.6514,  ...,  1.0870, -1.3180, -0.4085],
         [ 0.4701,  0.0223,  0.8850,  ...,  1.4155, -0.0656, -0.5369],
         ...,
         [-0.0217, -0.7029,  0.0935,  ...,  1.1168,  0.4694, -0.6496],
         [ 0.5665, -0.0897,  0.4867,  ...,  0.4900,  0.6538, -0.4164],
         [ 0.9089, -0.3590,  0.4672,  ...,  1.0321,  0.7096, -0.5094]]],
       grad_fn=<NativeLayerNormBackward0>)
