In [None]:
!pip install torch torchtext torchcrf pytorch-crf

In [None]:
#!pip install datasets
from datasets import load_dataset, load_from_disk

In [None]:
dataset = load_from_disk('/content/wikiann_tr')

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchcrf import CRF

devide = torch.device("cuda")

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchcrf import CRF



class NERDataset(Dataset):
    def __init__(self, texts, labels, vocab, tag_to_idx):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tag_to_idx = tag_to_idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = [self.vocab.get(word, self.vocab['<UNK>']) for word in self.texts[idx].split()]
        labels = [self.tag_to_idx[tag] for tag in self.labels[idx]]
        return torch.tensor(text, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_idx, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_idx = tag_to_idx
        self.tagset_size = len(tag_to_idx)

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=2, bidirectional=True, dropout=0.5)
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def forward(self, sentence, tags=None):
        embedded = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embedded)
        tag_scores = self.hidden2tag(lstm_out)

        if tags is not None:
            mask = (sentence != 0).bool()
            loss = -self.crf(tag_scores, tags, mask=mask)
            return loss
        else:
            best_tags = self.crf.decode(tag_scores)
            return best_tags

def prepare_data(texts, labels):
    vocab = {'<PAD>': 0, '<UNK>': 1}
    tag_to_idx = {'O': 0, 'entity': 1}

    for text in texts:
        for word in text.split():
            if word not in vocab:
                vocab[word] = len(vocab)

    return vocab, tag_to_idx

def collate_fn(batch):
    texts, labels = zip(*batch)
    max_len = max(len(x) for x in texts)
    padded_texts = [torch.cat([x, torch.zeros(max_len - len(x), dtype=torch.long)]) for x in texts]
    padded_labels = [torch.cat([y, torch.zeros(max_len - len(y), dtype=torch.long)]) for y in labels]
    return torch.stack(padded_texts), torch.stack(padded_labels)

texts = [
    "Ahmet Yılmaz 15 Mart 1990 tarihinde doğdu ve şu an 33 yaşında",
    "istanbul'un nüfusu yaklaşık 15 milyon kisidir",
    "Mustafa Kemal Atatürk 29 Ekim 1923'te Türkiye Cumhuriyeti'ni kurdu",
    "Entity X'in müşteri hizmetleri hızlı ve etkili Entity Y'nin ürün kalitesi çok kötü",
    "Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır @Twitch @Kick_turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim @Turkcell"
] + text_dataset

labels = [
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'O', 'O', 'O', 'O','O'],
    ['entity', 'entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'entity', 'entity', 'O','O','O','O'],
    ['O','O','entity','O','O','O','O','entity','entity','O','O','O','O','O','O','O','O','O','entity','entity','entity','entity','O','O','O','O','O','O','O','O','O','O','entity']
] + label_dataset




# Check and align lengths of texts and labels
for i in range(len(texts)):
    text_length = len(texts[i].split())
    label_length = len(labels[i])
    if text_length != label_length:
        raise ValueError(f"Text and label lengths do not match at index {i}: {text_length} != {label_length}")

vocab, tag_to_idx = prepare_data(texts, labels)
dataset = NERDataset(texts, labels, vocab, tag_to_idx)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

model = BiLSTM_CRF(len(vocab), tag_to_idx, embedding_dim=100, hidden_dim=200)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 300
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        loss = model(inputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

In [None]:
!pip install neattext

In [None]:
pip install pytorch-crf

In [None]:
from datasets import load_dataset, load_from_disk


# Later, load the dataset from disk
dataset = load_dataset("wikiann", "tr")

# Verify that the dataset is loaded correctly
print(dataset)


In [None]:
import pandas as pd

In [None]:
print(dataset["train"][0])

In [None]:
ner_encoding = {0: "O", 1: "entity", 2: "entity", 3: "entity", 4: "entity", 5: "entity", 6: "entity"}


train_tokens = []
train_tags = []
for sample in dataset["train"]:
  train_tokens.append(' '.join(sample["tokens"]))
  train_tags.append(' '.join([ner_encoding[a] for a in sample["ner_tags"]]))

test_tokens = []
test_tags = []
for sample in dataset["train"]:
  test_tokens.append(' '.join(sample["tokens"]))
  test_tags.append(' '.join([ner_encoding[a] for a in sample["ner_tags"]]))

df_train = pd.DataFrame({"sentence": train_tokens, "tags": train_tags})
df_test = pd.DataFrame({"sentence": test_tokens, "tags": test_tags})

In [None]:
df_train.head()

In [None]:
ner_encoding = {0: "O", 1: "entity", 2: "entity", 3: "entity", 4: "entity", 5: "entity", 6: "entity"}

text_dataset = []
for sample in dataset["train"]:
  text_dataset.append(' '.join(sample["tokens"]))
for sample in dataset["test"]:
  text_dataset.append(' '.join(sample["tokens"]))

label_dataset = []
for sample in dataset["train"]:
  label_dataset.append([ner_encoding[a] for a in sample["ner_tags"]])
for sample in dataset["test"]:
  label_dataset.append([ner_encoding[a] for a in sample["ner_tags"]])

In [None]:
text_dataset

In [None]:
pip install TorchCRF

In [None]:
pip install torchcrf

In [None]:
from TorchCRF import CRF

In [None]:
device = torch.device("cuda")

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchcrf import CRF

device = torch.device("cuda")

class NERDataset(Dataset):
    def __init__(self, texts, labels, vocab, tag_to_idx):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tag_to_idx = tag_to_idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = [self.vocab.get(word, self.vocab['<UNK>']) for word in self.texts[idx].split()]
        labels = [self.tag_to_idx[tag] for tag in self.labels[idx]]
        return torch.tensor(text, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_idx, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_idx = tag_to_idx
        self.tagset_size = len(tag_to_idx)

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=2, bidirectional=True, dropout=0.5)
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def forward(self, sentence, tags=None):
        embedded = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embedded)
        tag_scores = self.hidden2tag(lstm_out)

        if tags is not None:
            mask = (sentence != 0).bool()
            loss = -self.crf(tag_scores, tags, mask=mask)
            return loss
        else:
            best_tags = self.crf.decode(tag_scores)
            return best_tags

def prepare_data(texts, labels):
    vocab = {'<PAD>': 0, '<UNK>': 1}
    tag_to_idx = {'O': 0, 'entity': 1}

    for text in texts:
        for word in text.split():
            if word not in vocab:
                vocab[word] = len(vocab)

    return vocab, tag_to_idx

def collate_fn(batch):
    texts, labels = zip(*batch)
    max_len = max(len(x) for x in texts)
    padded_texts = [torch.cat([x, torch.zeros(max_len - len(x), dtype=torch.long)]) for x in texts]
    padded_labels = [torch.cat([y, torch.zeros(max_len - len(y), dtype=torch.long)]) for y in labels]
    return torch.stack(padded_texts), torch.stack(padded_labels)

texts = [
    "Ahmet Yılmaz 15 Mart 1990 tarihinde doğdu ve şu an 33 yaşında",
    "istanbul'un nüfusu yaklaşık 15 milyon kisidir",
    "Mustafa Kemal Atatürk 29 Ekim 1923'te Türkiye Cumhuriyeti'ni kurdu",
    "Entity X'in müşteri hizmetleri hızlı ve etkili Entity Y'nin ürün kalitesi çok kötü",
    "Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır @Twitch @Kick_turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim @Turkcell"
] + text_dataset

labels = [
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'O', 'O', 'O', 'O','O'],
    ['entity', 'entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'entity', 'entity', 'O','O','O','O'],
    ['O','O','entity','O','O','O','O','entity','entity','O','O','O','O','O','O','O','O','O','entity','entity','entity','entity','O','O','O','O','O','O','O','O','O','O','entity']
] + label_dataset




# Check and align lengths of texts and labels
for i in range(len(texts)):
    text_length = len(texts[i].split())
    label_length = len(labels[i])
    if text_length != label_length:
        raise ValueError(f"Text and label lengths do not match at index {i}: {text_length} != {label_length}")

vocab, tag_to_idx = prepare_data(texts, labels)
dataset = NERDataset(texts, labels, vocab, tag_to_idx)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

model = BiLSTM_CRF(len(vocab), tag_to_idx, embedding_dim=300, hidden_dim=600)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 3000
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        loss = model(inputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

In [17]:
test_sentence = "benim şirketim kazanacaktır onun adı da turkcell"
test_tensor = torch.tensor([vocab.get(word, vocab['<UNK>']) for word in test_sentence.split()], dtype=torch.long)
test_tensor = test_tensor.to(device)
with torch.no_grad():
    best_tags = model(test_tensor.unsqueeze(0))[0]

best_tags
idx_to_tag = {i: tag for tag, i in tag_to_idx.items()}
predicted_labels = [idx_to_tag[i] for i in best_tags]
print("Test cümlesi:", test_sentence)
print("Tahmin edilen etiketler:", predicted_labels)

NameError: name 'vocab' is not defined

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchcrf import CRF

device = torch.device("cuda")

class NERDataset(Dataset):
    def __init__(self, texts, labels, vocab, tag_to_idx):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tag_to_idx = tag_to_idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = [self.vocab.get(word, self.vocab['<UNK>']) for word in self.texts[idx].split()]
        labels = [self.tag_to_idx[tag] for tag in self.labels[idx]]
        return torch.tensor(text, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_idx, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_idx = tag_to_idx
        self.tagset_size = len(tag_to_idx)

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=2, bidirectional=True, dropout=0.5)
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def forward(self, sentence, tags=None):
        embedded = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embedded)
        tag_scores = self.hidden2tag(lstm_out)

        if tags is not None:
            mask = (sentence != 0).bool()
            loss = -self.crf(tag_scores, tags, mask=mask)
            return loss
        else:
            best_tags = self.crf.decode(tag_scores)
            return best_tags

def prepare_data(texts, labels):
    vocab = {'<PAD>': 0, '<UNK>': 1}
    tag_to_idx = {'O': 0, 'entity': 1}

    for text in texts:
        for word in text.split():
            if word not in vocab:
                vocab[word] = len(vocab)

    return vocab, tag_to_idx

def collate_fn(batch):
    texts, labels = zip(*batch)
    max_len = max(len(x) for x in texts)
    padded_texts = [torch.cat([x, torch.zeros(max_len - len(x), dtype=torch.long)]) for x in texts]
    padded_labels = [torch.cat([y, torch.zeros(max_len - len(y), dtype=torch.long)]) for y in labels]
    return torch.stack(padded_texts), torch.stack(padded_labels)

texts = [
    "Ahmet Yılmaz 15 Mart 1990 tarihinde doğdu ve şu an 33 yaşında",
    "istanbul'un nüfusu yaklaşık 15 milyon kisidir",
    "Mustafa Kemal Atatürk 29 Ekim 1923'te Türkiye Cumhuriyeti'ni kurdu",
    "Entity X'in müşteri hizmetleri hızlı ve etkili Entity Y'nin ürün kalitesi çok kötü",
    "Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır @Twitch @Kick_turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim @Turkcell"
] + text_dataset

labels = [
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'O', 'O', 'O', 'O','O'],
    ['entity', 'entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'entity', 'entity', 'O','O','O','O'],
    ['O','O','entity','O','O','O','O','entity','entity','O','O','O','O','O','O','O','O','O','entity','entity','entity','entity','O','O','O','O','O','O','O','O','O','O','entity']
] + label_dataset




# Check and align lengths of texts and labels
for i in range(len(texts)):
    text_length = len(texts[i].split())
    label_length = len(labels[i])
    if text_length != label_length:
        raise ValueError(f"Text and label lengths do not match at index {i}: {text_length} != {label_length}")


In [18]:
ner_encoding = {0: "O", 1: "entity", 2: "entity", 3: "entity", 4: "entity", 5: "entity", 6: "entity"}

text_dataset = []
for sample in dataset["train"]:
  text_dataset.append(' '.join(sample["tokens"]))
for sample in dataset["test"]:
  text_dataset.append(' '.join(sample["tokens"]))

label_dataset = []
for sample in dataset["train"]:
  label_dataset.append([ner_encoding[a] for a in sample["ner_tags"]])
for sample in dataset["test"]:
  label_dataset.append([ner_encoding[a] for a in sample["ner_tags"]])

NameError: name 'dataset' is not defined

In [None]:
from datasets import load_dataset, load_from_disk


# Later, load the dataset from disk
dataset = load_dataset("wikiann", "tr")

# Verify that the dataset is loaded correctly
print(dataset)


In [None]:
ner_encoding = {0: "O", 1: "entity", 2: "entity", 3: "entity", 4: "entity", 5: "entity", 6: "entity"}

text_dataset = []
for sample in dataset["train"]:
  text_dataset.append(' '.join(sample["tokens"]))
for sample in dataset["test"]:
  text_dataset.append(' '.join(sample["tokens"]))

label_dataset = []
for sample in dataset["train"]:
  label_dataset.append([ner_encoding[a] for a in sample["ner_tags"]])
for sample in dataset["test"]:
  label_dataset.append([ner_encoding[a] for a in sample["ner_tags"]])

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchcrf import CRF

device = torch.device("cuda")

class NERDataset(Dataset):
    def __init__(self, texts, labels, vocab, tag_to_idx):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tag_to_idx = tag_to_idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = [self.vocab.get(word, self.vocab['<UNK>']) for word in self.texts[idx].split()]
        labels = [self.tag_to_idx[tag] for tag in self.labels[idx]]
        return torch.tensor(text, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_idx, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_idx = tag_to_idx
        self.tagset_size = len(tag_to_idx)

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=2, bidirectional=True, dropout=0.5)
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def forward(self, sentence, tags=None):
        embedded = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embedded)
        tag_scores = self.hidden2tag(lstm_out)

        if tags is not None:
            mask = (sentence != 0).bool()
            loss = -self.crf(tag_scores, tags, mask=mask)
            return loss
        else:
            best_tags = self.crf.decode(tag_scores)
            return best_tags

def prepare_data(texts, labels):
    vocab = {'<PAD>': 0, '<UNK>': 1}
    tag_to_idx = {'O': 0, 'entity': 1}

    for text in texts:
        for word in text.split():
            if word not in vocab:
                vocab[word] = len(vocab)

    return vocab, tag_to_idx

def collate_fn(batch):
    texts, labels = zip(*batch)
    max_len = max(len(x) for x in texts)
    padded_texts = [torch.cat([x, torch.zeros(max_len - len(x), dtype=torch.long)]) for x in texts]
    padded_labels = [torch.cat([y, torch.zeros(max_len - len(y), dtype=torch.long)]) for y in labels]
    return torch.stack(padded_texts), torch.stack(padded_labels)

texts = [
    "Ahmet Yılmaz 15 Mart 1990 tarihinde doğdu ve şu an 33 yaşında",
    "istanbul'un nüfusu yaklaşık 15 milyon kisidir",
    "Mustafa Kemal Atatürk 29 Ekim 1923'te Türkiye Cumhuriyeti'ni kurdu",
    "Entity X'in müşteri hizmetleri hızlı ve etkili Entity Y'nin ürün kalitesi çok kötü",
    "Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır @Twitch @Kick_turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim @Turkcell"
] + text_dataset

labels = [
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'O', 'O', 'O', 'O','O'],
    ['entity', 'entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'entity', 'entity', 'O','O','O','O'],
    ['O','O','entity','O','O','O','O','entity','entity','O','O','O','O','O','O','O','O','O','entity','entity','entity','entity','O','O','O','O','O','O','O','O','O','O','entity']
] + label_dataset




# Check and align lengths of texts and labels
for i in range(len(texts)):
    text_length = len(texts[i].split())
    label_length = len(labels[i])
    if text_length != label_length:
        raise ValueError(f"Text and label lengths do not match at index {i}: {text_length} != {label_length}")

vocab, tag_to_idx = prepare_data(texts, labels)
dataset = NERDataset(texts, labels, vocab, tag_to_idx)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

model = BiLSTM_CRF(len(vocab), tag_to_idx, embedding_dim=300, hidden_dim=600)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 3000
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        loss = model(inputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

NameError: name 'text_dataset' is not defined

In [19]:
import pickle
model = pickle.load(open("entity_cikarici.ai","rb"))

In [20]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchcrf import CRF

devide = torch.device("cuda")

In [21]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchcrf import CRF

device = torch.device("cuda")

class NERDataset(Dataset):
    def __init__(self, texts, labels, vocab, tag_to_idx):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tag_to_idx = tag_to_idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = [self.vocab.get(word, self.vocab['<UNK>']) for word in self.texts[idx].split()]
        labels = [self.tag_to_idx[tag] for tag in self.labels[idx]]
        return torch.tensor(text, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_idx, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_idx = tag_to_idx
        self.tagset_size = len(tag_to_idx)

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=2, bidirectional=True, dropout=0.5)
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def forward(self, sentence, tags=None):
        embedded = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embedded)
        tag_scores = self.hidden2tag(lstm_out)

        if tags is not None:
            mask = (sentence != 0).bool()
            loss = -self.crf(tag_scores, tags, mask=mask)
            return loss
        else:
            best_tags = self.crf.decode(tag_scores)
            return best_tags

def prepare_data(texts, labels):
    vocab = {'<PAD>': 0, '<UNK>': 1}
    tag_to_idx = {'O': 0, 'entity': 1}

    for text in texts:
        for word in text.split():
            if word not in vocab:
                vocab[word] = len(vocab)

    return vocab, tag_to_idx

def collate_fn(batch):
    texts, labels = zip(*batch)
    max_len = max(len(x) for x in texts)
    padded_texts = [torch.cat([x, torch.zeros(max_len - len(x), dtype=torch.long)]) for x in texts]
    padded_labels = [torch.cat([y, torch.zeros(max_len - len(y), dtype=torch.long)]) for y in labels]
    return torch.stack(padded_texts), torch.stack(padded_labels)

texts = [
    "Ahmet Yılmaz 15 Mart 1990 tarihinde doğdu ve şu an 33 yaşında",
    "istanbul'un nüfusu yaklaşık 15 milyon kisidir",
    "Mustafa Kemal Atatürk 29 Ekim 1923'te Türkiye Cumhuriyeti'ni kurdu",
    "Entity X'in müşteri hizmetleri hızlı ve etkili Entity Y'nin ürün kalitesi çok kötü",
    "Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır @Twitch @Kick_turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim @Turkcell"
] + text_dataset

labels = [
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'O', 'O', 'O', 'O','O'],
    ['entity', 'entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'entity', 'entity', 'O','O','O','O'],
    ['O','O','entity','O','O','O','O','entity','entity','O','O','O','O','O','O','O','O','O','entity','entity','entity','entity','O','O','O','O','O','O','O','O','O','O','entity']
] + label_dataset




# Check and align lengths of texts and labels
for i in range(len(texts)):
    text_length = len(texts[i].split())
    label_length = len(labels[i])
    if text_length != label_length:
        raise ValueError(f"Text and label lengths do not match at index {i}: {text_length} != {label_length}")

vocab, tag_to_idx = prepare_data(texts, labels)
dataset = NERDataset(texts, labels, vocab, tag_to_idx)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

model = BiLSTM_CRF(len(vocab), tag_to_idx, embedding_dim=300, hidden_dim=600)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 3000
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        loss = model(inputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")



test_sentence = "benim şirketim kazanacaktır , onun adı da turkcell"
test_tensor = torch.tensor([vocab.get(word, vocab['<UNK>']) for word in test_sentence.split()], dtype=torch.long)
test_tensor = test_tensor.to(device)
with torch.no_grad():
    best_tags = model(test_tensor.unsqueeze(0))[0]

best_tags
idx_to_tag = {i: tag for tag, i in tag_to_idx.items()}
predicted_labels = [idx_to_tag[i] for i in best_tags]
print("Test cümlesi:", test_sentence)
print("Tahmin edilen etiketler:", predicted_labels)

NameError: name 'label_dataset' is not defined

In [22]:
test_sentence = "benim şirketim kazanacaktır , onun adı da turkcell"
test_tensor = torch.tensor([vocab.get(word, vocab['<UNK>']) for word in test_sentence.split()], dtype=torch.long)
test_tensor = test_tensor.to(device)
with torch.no_grad():
    best_tags = model(test_tensor.unsqueeze(0))[0]

best_tags
idx_to_tag = {i: tag for tag, i in tag_to_idx.items()}
predicted_labels = [idx_to_tag[i] for i in best_tags]
print("Test cümlesi:", test_sentence)
print("Tahmin edilen etiketler:", predicted_labels)

NameError: name 'vocab' is not defined