In [2]:
!pip install torch torchvision
!pip install conllu

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [33]:
# Imported here so this cell runs on a fresh kernel; `nn` is otherwise only
# brought into scope by a cell that appears further down the notebook.
import torch.nn as nn

# Cross-entropy loss; positions whose target equals -1 are left out of the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-1)

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

class DependencyParser(nn.Module):
    """Per-token feed-forward classifier over word and POS-tag embeddings.

    Every token is described by a ``(word_id, pos_id)`` pair. The two ids are
    embedded separately, the embeddings are concatenated, and a two-layer MLP
    turns the result into ``output_dim`` unnormalised scores for that token.

    Args:
        vocab_size: Number of rows in the word embedding table.
        pos_size: Number of rows in the POS-tag embedding table.
        embedding_dim: Size of each word embedding and of each POS embedding.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of scores produced per token.
    """

    def __init__(self, vocab_size, pos_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embeddings = nn.Embedding(pos_size, embedding_dim)
        # The MLP input is the word embedding and the POS embedding side by side.
        self.fc1 = nn.Linear(2 * embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)  # SHIFT, LEFT-ARC, RIGHT-ARC

    def forward(self, x):
        """Score every token in ``x``.

        Args:
            x: Integer tensor whose last dimension holds ``(word_id, pos_id)``.
                Any number of leading dimensions is accepted, e.g.
                ``(batch, seq, 2)`` or an unbatched ``(seq, 2)``.

        Returns:
            Float tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``output_dim``.
        """
        # Index and concatenate on the last axis so that the leading dimensions
        # (batched or not) pass through untouched.
        word_embeds = self.word_embeddings(x[..., 0])
        pos_embeds = self.pos_embeddings(x[..., 1])
        embeds = torch.cat((word_embeds, pos_embeds), dim=-1)
        hidden = self.relu(self.fc1(embeds))
        return self.fc2(hidden)

# Model, optimizer and loss setup
# Table sizes come from the `vocab` / `tag_vocab` mappings, which are the
# attributes DependencyParsingDataset defines in this notebook (it has no
# `word_index` / `pos_index`). Ids start at 0, so the +1 leaves one spare
# embedding row past the largest id assigned so far.
vocab_size = len(train_dataset.vocab) + 1
pos_size = len(train_dataset.tag_vocab) + 1
model = DependencyParser(vocab_size, pos_size, embedding_dim=100, hidden_dim=200, output_dim=3)
optimizer = optim.Adam(model.parameters())
# Positions whose target equals -1 are left out of the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-1)

# Training Function
def train(model, train_loader, optimizer, criterion, epochs=10):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Prints the mean batch loss after every epoch.

    Args:
        model: Module called as ``model(features)``.
        train_loader: Iterable yielding ``(features, labels)`` batches. It does
            not have to support ``len()``.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, labels)`` after dims 1
            and 2 of the model output have been swapped.
        epochs: Number of passes over ``train_loader``.

    Returns:
        list[float]: Mean batch loss of each epoch, in order. An epoch in which
        the loader yields no batch is recorded as ``nan``.
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0  # counted here so loaders without __len__ also work
        for features, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(features)
            # Swap dims 1 and 2 - presumably (batch, seq, classes) ->
            # (batch, classes, seq), the layout nn.CrossEntropyLoss takes for
            # per-position targets; confirm if another criterion is passed.
            outputs = outputs.transpose(1, 2)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # An empty loader would otherwise divide by zero.
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        epoch_losses.append(mean_loss)
        print(f'Epoch {epoch+1}, Loss: {mean_loss}')
    return epoch_losses

# Run the training loop with the default of 10 epochs.
# NOTE(review): the loss logged below falls to ~1e-6, and the dataset sample
# printed later in this notebook shows all-zero label tensors - presumably the
# model is only learning to emit one constant class. Confirm that real
# SHIFT / LEFT-ARC / RIGHT-ARC targets are wired in before trusting this run.
train(model, train_loader, optimizer, criterion)


Epoch 1, Loss: 0.016724249358661354
Epoch 2, Loss: 8.294955272021424e-05
Epoch 3, Loss: 2.9474748474485193e-05
Epoch 4, Loss: 1.4661742574278425e-05
Epoch 5, Loss: 8.548370660765632e-06
Epoch 6, Loss: 5.451718092727992e-06
Epoch 7, Loss: 3.6755631188089883e-06
Epoch 8, Loss: 2.575452059000336e-06
Epoch 9, Loss: 1.8546004304894547e-06
Epoch 10, Loss: 1.372853632534891e-06


In [35]:
# Build the training and development datasets from their CoNLL-U files.
# NOTE(review): each DependencyParsingDataset instance creates its own fresh
# vocab / tag_vocab, so the same word can receive different ids in
# train_dataset and dev_dataset - confirm this is intended before evaluating
# a model trained on one with the other.
train_dataset = DependencyParsingDataset('/content/en_ewt-ud-train.conllu')
dev_dataset = DependencyParsingDataset('/content/en_ewt-ud-dev.conllu')

In [37]:
import torch
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from conllu import parse_incr
import os

class DependencyParsingDataset(Dataset):
    """Sentences of a CoNLL-U file as lists of ``(word_id, tag_id)`` pairs.

    Word forms are lower-cased before lookup. Ids are handed out in order of
    first appearance, starting at 0.

    Each item is ``(indexed_tokens, [])``; the empty list is a placeholder for
    targets that this class does not populate.

    Args:
        file_path: Path of the ``.conllu`` file to read.
        vocab: Optional existing word -> id mapping to reuse; unseen words are
            appended to it. Pass another dataset's ``vocab`` so that both
            datasets share word ids. A fresh mapping is created when omitted.
        tag_vocab: Optional existing POS-tag -> id mapping, used like ``vocab``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """

    # Keys under which a token may carry its universal POS tag.
    # NOTE(review): recent conllu releases appear to expose the field as
    # 'upos' rather than 'upostag' - confirm against the installed version.
    _TAG_KEYS = ('upostag', 'upos')

    def __init__(self, file_path, vocab=None, tag_vocab=None):
        if vocab is None:
            vocab = defaultdict(lambda: len(self.vocab))  # next free id on first lookup
        if tag_vocab is None:
            tag_vocab = defaultdict(lambda: len(self.tag_vocab))  # next free id on first lookup
        self.vocab = vocab
        self.tag_vocab = tag_vocab
        self.data = self.load_data(file_path)

    @staticmethod
    def _index_of(mapping, key):
        """Return the id of ``key``, assigning the next free id if it is new."""
        # setdefault works for plain dicts as well as the defaultdicts above.
        return mapping.setdefault(key, len(mapping))

    @classmethod
    def _tag_key(cls, token):
        """Return the key holding the POS tag in ``token``, or None if absent."""
        for key in cls._TAG_KEYS:
            if key in token:
                return key
        return None

    def load_data(self, file_path):
        """Read ``file_path`` and return one ``(indexed_tokens, [])`` per sentence.

        Tokens lacking a form or a POS tag are skipped. Reading extends
        ``self.vocab`` and ``self.tag_vocab`` with every new word / tag.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The file {file_path} does not exist.")
        data = []
        with open(file_path, 'r', encoding='utf-8') as file:
            for tokenlist in parse_incr(file):
                indexed_tokens = []
                for token in tokenlist:
                    tag_key = self._tag_key(token)
                    if 'form' not in token or tag_key is None:
                        continue
                    word_id = self._index_of(self.vocab, token['form'].lower())
                    tag_id = self._index_of(self.tag_vocab, token[tag_key])
                    indexed_tokens.append((word_id, tag_id))
                data.append((indexed_tokens, []))  # Second element is placeholder for targets
        return data

    def __len__(self):
        """Number of sentences loaded."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(indexed_tokens, [])`` pair stored at position ``idx``."""
        return self.data[idx]

# Ensure that the file path is correct and the .conllu file is accessible
train_dataset = DependencyParsingDataset('/content/en_ewt-ud-train.conllu')
# One sentence per batch, visited in shuffled order.
# NOTE(review): batch_size=1 is presumably used because sentences differ in
# length and no collate_fn pads them - confirm.
# NOTE(review): dataset items are (list of (word_id, tag_id) tuples, []) -
# plain Python lists, not tensors. Check that what this loader yields matches
# what the model's forward() indexes as x[:, :, 0].
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

print("Loaded training dataset with", len(train_dataset), "items.")

Training Dataset Samples:
Sample 1 - Features: tensor([[ 0,  0],
        [ 1,  1],
        [ 2,  0],
        [ 3,  1],
        [ 4,  2],
        [ 5,  3],
        [ 6,  4],
        [ 7,  0],
        [ 8,  0],
        [ 0,  0],
        [ 1,  1],
        [ 9,  0],
        [10,  1],
        [11,  5],
        [12,  3],
        [13,  6],
        [11,  5],
        [14,  3],
        [15,  6],
        [11,  5],
        [16,  3],
        [17,  6],
        [18,  0],
        [10,  1],
        [19,  6],
        [11,  5],
        [20,  2],
        [21,  3],
        [22,  1]]), Labels: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0])
Sample 2 - Features: tensor([[23,  1],
        [24,  5],
        [25,  3],
        [17,  6],
        [26,  5],
        [27,  2],
        [28,  3],
        [29,  7],
        [30,  7],
        [31,  4],
        [32,  8],
        [33,  3],
        [34,  6],
        [35,  3],
        [36,  9],
        [37,  4],
        