In [None]:
import pandas as pd

# Load the labelled corpus from the CSV next to the notebook. The preview
# below shows two text columns: `sentence` and a space-separated `labels`
# string of tags.
df = pd.read_csv(filepath_or_buffer="dataset.csv")

# Plain-text preview of the first five rows.
print(df.head(n=5))


                                   sentence          labels
0           O, qırmızı, böyük maşını sürdü.  un ad ad nn un
1  Bu, isti, qəhvə, səhərimi gözəlləşdirdi.  un ad nn un un
2        Bu, kiçik, taxta qutu, xəzinəmdir.  un ad ad nn nn
3          O, sakit, dəniz sahilində gəzdi.  un ad nn un un
4        Böyük, dəmir qapı, qalanı qoruyur.  ad ad nn nn un


In [42]:
from sklearn.model_selection import train_test_split

def tokenize(sentence):
    """Split a sentence into whitespace-separated tokens after deleting punctuation.

    Only the characters ``"``, ``.``, ``,`` and ``?`` are deleted; every other
    character is kept unchanged (no lower-casing, no other punctuation
    stripped).

    Args:
        sentence: The raw sentence string.

    Returns:
        A list of token strings; empty when nothing but whitespace remains.
    """
    # One deletion table replaces the chain of four `.replace(..., '')` calls.
    without_punct = sentence.translate(str.maketrans('', '', '".,?'))
    return without_punct.split()

# Tokenise every sentence and split its space-separated label string, so each
# row carries two parallel lists: words and their tags.
df['tokens'] = df['sentence'].apply(tokenize)
df['pos'] = df['labels'].apply(lambda x: x.split())

# Flatten all rows to collect the full word and tag inventories.
all_tokens = [token for sentence in df['tokens'] for token in sentence]
all_tags = [tag for tags in df['pos'] for tag in tags]

# Sort before enumerating: iterating a bare set of strings yields an order that
# can differ between interpreter runs (string hash randomisation), which would
# silently reassign every id after a kernel restart.
# Ids 0 and 1 are reserved for padding and unknown words, hence the +2 offset.
word2idx = {word: i+2 for i, word in enumerate(sorted(set(all_tokens)))}
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1

tag2idx = {tag: i for i, tag in enumerate(sorted(set(all_tags)))}
idx2tag = {i: tag for tag, i in tag2idx.items()}

# NOTE: the vocabulary is built from every row of `df`, including rows that
# land in `val_data` below, so no row in either split is ever encoded as <UNK>.
df['token_ids'] = df['tokens'].apply(lambda x: [word2idx.get(word, word2idx["<UNK>"]) for word in x])
df['tag_ids'] = df['pos'].apply(lambda x: [tag2idx[tag] for tag in x])

# 80/20 split with a fixed seed so the same rows are held out on every run.
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)


In [43]:
import torch
from torch.utils.data import Dataset, DataLoader

class PosDataset(Dataset):
    """Fixed-length view over a DataFrame of encoded sentences.

    Each item is a pair of 1-D ``torch.long`` tensors of length ``max_len``:
    the token ids (right-padded with 0) and the tag ids (right-padded with -1).
    Sequences longer than ``max_len`` are truncated so every item has the same
    length and can be stacked into a batch.

    Args:
        data: DataFrame with ``token_ids`` and ``tag_ids`` columns, each cell
            holding a list of ints.
        max_len: Length every returned sequence is padded or truncated to.
    """

    def __init__(self, data, max_len=20):
        self.data = data
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (sentences) in the underlying frame."""
        return len(self.data)

    def _fit(self, ids, pad_value):
        """Truncate or right-pad ``ids`` to exactly ``max_len`` entries."""
        fitted = list(ids)[:self.max_len]
        return fitted + [pad_value] * (self.max_len - len(fitted))

    def __getitem__(self, idx):
        """Return ``(token_tensor, tag_tensor)`` for the row at position ``idx``.

        Raises:
            ValueError: If the row's token and tag lists differ in length.
        """
        # Positional lookup: the frame keeps its original index after the split.
        row = self.data.iloc[idx]
        tokens = row['token_ids']
        tags = row['tag_ids']

        # Fail with a clear message instead of producing tensors of different
        # lengths that only break later, when a batch is stacked.
        if len(tokens) != len(tags):
            raise ValueError(
                f"row {idx}: {len(tokens)} tokens but {len(tags)} tags"
            )

        # Without truncation a sentence longer than max_len gets a negative pad
        # length, stays over-long, and cannot be batched with the others.
        token_tensor = torch.tensor(self._fit(tokens, 0), dtype=torch.long)
        tag_tensor = torch.tensor(self._fit(tags, -1), dtype=torch.long)
        return token_tensor, tag_tensor

# Wrap both splits in PosDataset, using its default max_len.
train_dataset, val_dataset = PosDataset(train_data), PosDataset(val_data)

# Batches of 32 sentences. Only the training loader shuffles; the validation
# loader keeps row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)


In [44]:
import torch.nn as nn

class BiLSTMTagger(nn.Module):
    """Embedding -> single-layer bidirectional LSTM -> per-token linear tag scorer.

    Args:
        vocab_size: Number of rows in the embedding table (id 0 is padding).
        tagset_size: Number of output tags scored per token.
        embedding_dim: Size of each word embedding.
        hidden_dim: Requested width of the concatenated BiLSTM output. Each
            direction gets ``hidden_dim // 2`` units, so an odd value yields
            an effective width of ``hidden_dim - 1``.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim=64, hidden_dim=128):
        super().__init__()
        # padding_idx=0 keeps the <PAD> embedding fixed at zeros.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        per_direction = hidden_dim // 2
        self.lstm = nn.LSTM(embedding_dim, per_direction, num_layers=1, bidirectional=True, batch_first=True)
        # Size the classifier from the real BiLSTM output width (2 directions).
        # Using `hidden_dim` directly breaks forward() when hidden_dim is odd.
        self.fc = nn.Linear(2 * per_direction, tagset_size)

    def forward(self, x):
        """Score every tag for every token position.

        Args:
            x: Integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Float tensor of unnormalised tag scores, shape
            (batch, seq_len, tagset_size).
        """
        emb = self.embedding(x)
        lstm_out, _ = self.lstm(emb)
        tag_space = self.fc(lstm_out)
        return tag_space


In [45]:
# Seed torch's global RNG before the model is built, so the initial weights and
# the DataLoader shuffle order repeat when this cell is re-run (same seed value
# as the train/validation split).
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTMTagger(len(word2idx), len(tag2idx)).to(device)

# ignore_index=-1 matches the tag padding value, so padded positions add
# nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    model.train()
    total_loss = 0
    for tokens, tags in train_loader:
        tokens, tags = tokens.to(device), tags.to(device)

        optimizer.zero_grad()
        outputs = model(tokens)
        # Flatten (batch, seq_len, n_tags) -> (batch*seq_len, n_tags) and
        # (batch, seq_len) -> (batch*seq_len,), the layout the criterion takes.
        outputs = outputs.view(-1, len(tag2idx))
        tags = tags.view(-1)

        loss = criterion(outputs, tags)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # The reported value is the SUM of per-batch losses for the epoch, not an
    # average, so it scales with the number of batches.
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 660.4577
Epoch 2, Loss: 300.7637
Epoch 3, Loss: 228.2632
Epoch 4, Loss: 184.7726
Epoch 5, Loss: 151.9566
Epoch 6, Loss: 125.0630
Epoch 7, Loss: 103.1242
Epoch 8, Loss: 85.1534
Epoch 9, Loss: 70.1089
Epoch 10, Loss: 57.4520


In [46]:
def predict(model, sentence, max_len=20):
    """Tag each word of a raw sentence with the trained model.

    The sentence is tokenised with ``tokenize``, mapped to ids through
    ``word2idx`` (unknown words become id 1, ``<UNK>``) and right-padded with
    0 up to ``max_len``. A sentence with more than ``max_len`` tokens is not
    truncated; it is passed to the model at its full length.

    Args:
        model: The tagger to run; it is switched to eval mode.
        sentence: Raw input sentence.
        max_len: Length short inputs are padded to. The default matches the
            padding length used for the training data.

    Returns:
        A list of ``(token, tag)`` pairs, one per token; empty if the sentence
        contains no tokens.
    """
    model.eval()
    tokens = tokenize(sentence)
    if not tokens:
        return []

    token_ids = [word2idx.get(tok, 1) for tok in tokens]
    padded = token_ids + [0]*(max_len - len(token_ids))

    # Send the input to wherever this model's weights live, rather than relying
    # on a module-level `device` that may not match the model passed in.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    input_tensor = torch.tensor([padded], dtype=torch.long).to(target_device)

    with torch.no_grad():
        output = model(input_tensor)
    # Highest-scoring tag per position; keep only the real (non-padding) tokens.
    preds = torch.argmax(output, dim=-1)[0][:len(tokens)]
    return list(zip(tokens, [idx2tag[int(i)] for i in preds]))

In [65]:
# Spot-check the tagger on a few example sentences, one prediction per line.
for example in (
    "Vaxtımız azdır, bu məsələni bitirək.",
    "Kompüterimdə problemlər var.",
    "Sabah dərsə getməliyik.",
    "Bu işi xoşlamıram.",
):
    print(predict(model, example))

[('Vaxtımız', 'nn'), ('azdır', 'ad'), ('bu', 'un'), ('məsələni', 'nn'), ('bitirək', 'un')]
[('Kompüterimdə', 'nn'), ('problemlər', 'nn'), ('var', 'un')]
[('Sabah', 'un'), ('dərsə', 'nn'), ('getməliyik', 'un')]
[('Bu', 'un'), ('işi', 'nn'), ('xoşlamıram', 'un')]
