# Import toolkit


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
os.chdir("/content/drive/MyDrive/2_PBL7/")

In [None]:
!pip install torch transformers networkx node2vec

Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidi

In [8]:
import torch
from transformers import BertTokenizer, BertModel
import networkx as nx
from node2vec import Node2Vec
import numpy as np
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Dummy dataset class
class KeyphraseDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        inputs['labels'] = torch.tensor(label)
        return inputs

# Example data
texts = ["This is a sample document.", "Another document for keyphrase extraction."]
labels = [[0, 0, 0, 1, 0], [0, 0, 0, 1, 1]]  # B-keyphrase: 1, I-keyphrase: 1, O: 0

dataset = KeyphraseDataset(texts, labels)

def collate_fn(batch):
    input_ids = pad_sequence([item['input_ids'].squeeze(0) for item in batch], batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence([item['attention_mask'].squeeze(0) for item in batch], batch_first=True, padding_value=0)
    labels = pad_sequence([item['labels'] for item in batch], batch_first=True, padding_value=-100)
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

dataloader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)

# Define the model
class Phraseformer(nn.Module):
    def __init__(self, bert_model, graph_embedding_dim, num_labels):
        super(Phraseformer, self).__init__()
        self.bert = bert_model
        self.graph_embedding_dim = graph_embedding_dim
        self.num_labels = num_labels
        self.fc = nn.Linear(bert_model.config.hidden_size + graph_embedding_dim, num_labels)

    def forward(self, input_ids, attention_mask, graph_embeddings):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        combined = torch.cat((sequence_output, graph_embeddings), dim=-1)
        logits = self.fc(combined)
        return logits

# Initialize the model
model = Phraseformer(bert_model, graph_embedding_dim=128, num_labels=2)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# Training loop
for epoch in range(10):  # Number of epochs
    for batch in dataloader:
        inputs = batch
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        labels = inputs['labels']

        # Dummy graph embeddings (should be computed based on your graph)
        batch_size, seq_len = input_ids.size()
        graph_embeddings = torch.randn(batch_size, seq_len, 128)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, graph_embeddings)

        # Compute loss, ignoring the padding tokens
        active_loss = attention_mask.view(-1) == 1
        active_logits = torch.masked_select(outputs.view(-1, model.num_labels), active_loss.unsqueeze(-1)).view(-1, model.num_labels)
        active_labels = torch.masked_select(labels.view(-1), active_loss)

        loss = criterion(active_logits, active_labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

print("Training complete")


RuntimeError: The size of tensor a (20) must match the size of tensor b (10) at non-singleton dimension 0