In [None]:
#!pip install datasets
from datasets import load_dataset, load_from_disk

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchcrf import CRF

# Named `device` (not `devide`) because the other cells call `.to(device)`.
# Falls back to the CPU when CUDA is not available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [1]:
from datasets import load_dataset, load_from_disk


# Load the "tr" configuration of the "wikiann" dataset through `load_dataset`.
# NOTE(review): `load_from_disk` is imported above but is not used here.
dataset = load_dataset("wikiann", "tr")

# Print the loaded object to inspect its splits, features and row counts.
print(dataset)


DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})


In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchcrf import CRF



class NERDataset(Dataset):
    """Torch dataset pairing whitespace-tokenised sentences with per-token tags.

    Args:
        texts: Sequence of sentences; each one is split on whitespace when indexed.
        labels: Sequence of tag-name lists, one list per sentence.
        vocab: Mapping from word to integer id; '<UNK>' is looked up for each token
            as the fallback id.
        tag_to_idx: Mapping from tag name to integer id.
    """

    def __init__(self, texts, labels, vocab, tag_to_idx):
        self.texts, self.labels = texts, labels
        self.vocab, self.tag_to_idx = vocab, tag_to_idx

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` for sentence ``idx`` as two long tensors."""
        word_ids = []
        for token in self.texts[idx].split():
            # Words missing from the vocabulary fall back to the <UNK> id.
            word_ids.append(self.vocab.get(token, self.vocab['<UNK>']))

        tag_ids = []
        for tag_name in self.labels[idx]:
            tag_ids.append(self.tag_to_idx[tag_name])

        return (
            torch.tensor(word_ids, dtype=torch.long),
            torch.tensor(tag_ids, dtype=torch.long),
        )

class BiLSTM_CRF(nn.Module):
    """Embedding -> 2-layer bidirectional LSTM -> linear projection -> CRF tagger.

    Args:
        vocab_size: Number of rows in the embedding table.
        tag_to_idx: Mapping from tag name to tag index; its length sets the
            number of CRF tags.
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the concatenated forward+backward LSTM output
            (each direction uses ``hidden_dim // 2`` units).
    """

    def __init__(self, vocab_size, tag_to_idx, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_idx = tag_to_idx
        self.tagset_size = len(tag_to_idx)

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: the collate function and the inference cells feed
        # (batch, seq_len) index tensors and the CRF is batch-first as well.
        # Without it the LSTM treats the batch axis as the time axis.
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim // 2,
            num_layers=2,
            bidirectional=True,
            dropout=0.5,
            batch_first=True,
        )
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def forward(self, sentence, tags=None):
        """Score a batch of sentences.

        Args:
            sentence: Long tensor of word ids, batch-first; id 0 is treated as padding.
            tags: Optional long tensor of gold tag ids with the same layout.

        Returns:
            The negated CRF output (used as the training loss) when ``tags`` is
            given, otherwise the result of ``self.crf.decode``.
        """
        embedded = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embedded)
        tag_scores = self.hidden2tag(lstm_out)

        # Positions holding the padding id are excluded from both the loss and decoding.
        mask = sentence != 0
        if tags is not None:
            return -self.crf(tag_scores, tags, mask=mask)
        return self.crf.decode(tag_scores, mask=mask)

def prepare_data(texts, labels):
    """Build the word vocabulary and the fixed two-tag index.

    Args:
        texts: Iterable of sentences; words are obtained with ``str.split()``.
        labels: Accepted for symmetry with the call sites; not read here.

    Returns:
        ``(vocab, tag_to_idx)``. ``vocab`` starts with '<PAD>' -> 0 and
        '<UNK>' -> 1, then assigns ids to new words in order of first appearance.
    """
    tag_to_idx = {tag: index for index, tag in enumerate(('O', 'entity'))}
    vocab = {'<PAD>': 0, '<UNK>': 1}

    for sentence in texts:
        for token in sentence.split():
            # setdefault only inserts when the token has no id yet.
            vocab.setdefault(token, len(vocab))

    return vocab, tag_to_idx

def collate_fn(batch):
    """Right-pad every sequence in the batch with zeros and stack them.

    Args:
        batch: Sequence of ``(word_ids, tag_ids)`` 1-D long tensor pairs.

    Returns:
        Two stacked tensors ``(padded_word_ids, padded_tag_ids)``. The padded
        width is the longest ``word_ids`` sequence in the batch.
    """
    token_seqs, tag_seqs = zip(*batch)
    width = max(len(seq) for seq in token_seqs)

    def _right_pad(seq):
        filler = torch.zeros(width - len(seq), dtype=torch.long)
        return torch.cat([seq, filler])

    token_rows = [_right_pad(seq) for seq in token_seqs]
    tag_rows = [_right_pad(seq) for seq in tag_seqs]
    return torch.stack(token_rows), torch.stack(tag_rows)

# Five hand-labelled sentences followed by the sentences collected in `text_dataset`.
texts = [
    "Ahmet Yılmaz 15 Mart 1990 tarihinde doğdu ve şu an 33 yaşında",
    "istanbul'un nüfusu yaklaşık 15 milyon kisidir",
    "Mustafa Kemal Atatürk 29 Ekim 1923'te Türkiye Cumhuriyeti'ni kurdu",
    "Entity X'in müşteri hizmetleri hızlı ve etkili Entity Y'nin ürün kalitesi çok kötü",
    "Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır @Twitch @Kick_turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim @Turkcell"
] + text_dataset

# One tag per whitespace-separated token of the matching entry in `texts`.
labels = [
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'O', 'O', 'O', 'O','O'],
    ['entity', 'entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'entity', 'entity', 'O','O','O','O'],
    ['O','O','entity','O','O','O','O','entity','entity','O','O','O','O','O','O','O','O','O','entity','entity','entity','entity','O','O','O','O','O','O','O','O','O','O','entity']
] + label_dataset


# Fail fast if any sentence and its tag list disagree in length.
for i in range(len(texts)):
    text_length = len(texts[i].split())
    label_length = len(labels[i])
    if text_length != label_length:
        raise ValueError(f"Text and label lengths do not match at index {i}: {text_length} != {label_length}")

vocab, tag_to_idx = prepare_data(texts, labels)
# Keep the torch dataset under its own name: rebinding `dataset` would replace the
# Hugging Face DatasetDict that the cells indexing dataset["train"] / dataset["test"] need.
ner_dataset = NERDataset(texts, labels, vocab, tag_to_idx)
dataloader = DataLoader(ner_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

model = BiLSTM_CRF(len(vocab), tag_to_idx, embedding_dim=100, hidden_dim=200)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 300
model.train()  # explicit training mode so the LSTM dropout is active
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        loss = model(inputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

KeyboardInterrupt: 

In [3]:
import pandas as pd

In [4]:
# Collapse every non-zero tag id (1-6) into a single "entity" label.
ner_encoding = {0: "O", 1: "entity", 2: "entity", 3: "entity", 4: "entity", 5: "entity", 6: "entity"}


# Space-joined tokens and tags of the training split.
train_tokens = []
train_tags = []
for sample in dataset["train"]:
  train_tokens.append(' '.join(sample["tokens"]))
  train_tags.append(' '.join([ner_encoding[a] for a in sample["ner_tags"]]))

# Space-joined tokens and tags of the held-out split.
# This must read "test"; iterating "train" again made df_test a copy of df_train.
test_tokens = []
test_tags = []
for sample in dataset["test"]:
  test_tokens.append(' '.join(sample["tokens"]))
  test_tags.append(' '.join([ner_encoding[a] for a in sample["ner_tags"]]))

df_train = pd.DataFrame({"sentence": train_tokens, "tags": train_tags})
df_test = pd.DataFrame({"sentence": test_tokens, "tags": test_tags})

In [5]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,sentence,tags
0,3.lük maçında Slovenya Millî Basketbol Takımı'...,O O entity entity entity entity O O O O O O O ...
1,' '' Denizlispor '' ',O O entity O O
2,"Hami Mandıralı 36 , Orhan Çıkırıkçı 46 , 48 , ...",entity entity O O entity entity O O O O entity...
3,San Antonio Spurs ( Milwaukee'den ),entity entity entity O O O
4,Divandere ( Dîwandere ),entity O O O


In [6]:
# Tag id 0 stays "O"; ids 1-6 are all folded into one generic "entity" label.
ner_encoding = {tag_id: ("O" if tag_id == 0 else "entity") for tag_id in range(7)}

# Sentences (space-joined tokens) and their tag-name lists, train split first, then test.
text_dataset = []
label_dataset = []
for split_name in ("train", "test"):
  for sample in dataset[split_name]:
    text_dataset.append(' '.join(sample["tokens"]))
    label_dataset.append([ner_encoding[a] for a in sample["ner_tags"]])

In [None]:
# The model cells build `CRF(num_tags, batch_first=True)` and call `.decode(...)`.
# That API comes from the lowercase `torchcrf` module, which the other cells import.
from torchcrf import CRF

In [9]:
import torch
# Use the GPU when one is available; otherwise fall back to the CPU so that the
# `.to(device)` calls in the other cells still work.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
test_sentence = "benim şirketim kazanacaktır , onun adı da turkcell"
# Map each whitespace token to its vocabulary id; unseen words fall back to <UNK>.
test_tensor = torch.tensor([vocab.get(word, vocab['<UNK>']) for word in test_sentence.split()], dtype=torch.long)
test_tensor = test_tensor.to(device)
# Switch to inference mode so the LSTM dropout does not randomise the prediction.
model.eval()
with torch.no_grad():
    # Add a batch axis of size 1 and take the tag sequence of that single sentence.
    best_tags = model(test_tensor.unsqueeze(0))[0]

idx_to_tag = {i: tag for tag, i in tag_to_idx.items()}
predicted_labels = [idx_to_tag[i] for i in best_tags]
print("Test cümlesi:", test_sentence)
print("Tahmin edilen etiketler:", predicted_labels)

NameError: name 'vocab' is not defined

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchcrf import CRF

# Use the GPU when one is available; otherwise fall back to the CPU so that the
# `.to(device)` calls in the other cells still work.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class NERDataset(Dataset):
    """Torch dataset pairing whitespace-tokenised sentences with per-token tags.

    Args:
        texts: Sequence of sentences; each one is split on whitespace when indexed.
        labels: Sequence of tag-name lists, one list per sentence.
        vocab: Mapping from word to integer id; '<UNK>' is looked up for each token
            as the fallback id.
        tag_to_idx: Mapping from tag name to integer id.
    """

    def __init__(self, texts, labels, vocab, tag_to_idx):
        self.texts, self.labels = texts, labels
        self.vocab, self.tag_to_idx = vocab, tag_to_idx

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` for sentence ``idx`` as two long tensors."""
        word_ids = []
        for token in self.texts[idx].split():
            # Words missing from the vocabulary fall back to the <UNK> id.
            word_ids.append(self.vocab.get(token, self.vocab['<UNK>']))

        tag_ids = []
        for tag_name in self.labels[idx]:
            tag_ids.append(self.tag_to_idx[tag_name])

        return (
            torch.tensor(word_ids, dtype=torch.long),
            torch.tensor(tag_ids, dtype=torch.long),
        )

class BiLSTM_CRF(nn.Module):
    """Embedding -> 2-layer bidirectional LSTM -> linear projection -> CRF tagger.

    Args:
        vocab_size: Number of rows in the embedding table.
        tag_to_idx: Mapping from tag name to tag index; its length sets the
            number of CRF tags.
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the concatenated forward+backward LSTM output
            (each direction uses ``hidden_dim // 2`` units).
    """

    def __init__(self, vocab_size, tag_to_idx, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_idx = tag_to_idx
        self.tagset_size = len(tag_to_idx)

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: the collate function and the inference cells feed
        # (batch, seq_len) index tensors and the CRF is batch-first as well.
        # Without it the LSTM treats the batch axis as the time axis.
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim // 2,
            num_layers=2,
            bidirectional=True,
            dropout=0.5,
            batch_first=True,
        )
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def forward(self, sentence, tags=None):
        """Score a batch of sentences.

        Args:
            sentence: Long tensor of word ids, batch-first; id 0 is treated as padding.
            tags: Optional long tensor of gold tag ids with the same layout.

        Returns:
            The negated CRF output (used as the training loss) when ``tags`` is
            given, otherwise the result of ``self.crf.decode``.
        """
        embedded = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embedded)
        tag_scores = self.hidden2tag(lstm_out)

        # Positions holding the padding id are excluded from both the loss and decoding.
        mask = sentence != 0
        if tags is not None:
            return -self.crf(tag_scores, tags, mask=mask)
        return self.crf.decode(tag_scores, mask=mask)

def prepare_data(texts, labels):
    """Build the word vocabulary and the fixed two-tag index.

    Args:
        texts: Iterable of sentences; words are obtained with ``str.split()``.
        labels: Accepted for symmetry with the call sites; not read here.

    Returns:
        ``(vocab, tag_to_idx)``. ``vocab`` starts with '<PAD>' -> 0 and
        '<UNK>' -> 1, then assigns ids to new words in order of first appearance.
    """
    tag_to_idx = {tag: index for index, tag in enumerate(('O', 'entity'))}
    vocab = {'<PAD>': 0, '<UNK>': 1}

    for sentence in texts:
        for token in sentence.split():
            # setdefault only inserts when the token has no id yet.
            vocab.setdefault(token, len(vocab))

    return vocab, tag_to_idx

def collate_fn(batch):
    """Right-pad every sequence in the batch with zeros and stack them.

    Args:
        batch: Sequence of ``(word_ids, tag_ids)`` 1-D long tensor pairs.

    Returns:
        Two stacked tensors ``(padded_word_ids, padded_tag_ids)``. The padded
        width is the longest ``word_ids`` sequence in the batch.
    """
    token_seqs, tag_seqs = zip(*batch)
    width = max(len(seq) for seq in token_seqs)

    def _right_pad(seq):
        filler = torch.zeros(width - len(seq), dtype=torch.long)
        return torch.cat([seq, filler])

    token_rows = [_right_pad(seq) for seq in token_seqs]
    tag_rows = [_right_pad(seq) for seq in tag_seqs]
    return torch.stack(token_rows), torch.stack(tag_rows)

# Five hand-labelled sentences followed by the sentences collected in `text_dataset`
# (that name is not defined in this cell).
texts = [
    "Ahmet Yılmaz 15 Mart 1990 tarihinde doğdu ve şu an 33 yaşında",
    "istanbul'un nüfusu yaklaşık 15 milyon kisidir",
    "Mustafa Kemal Atatürk 29 Ekim 1923'te Türkiye Cumhuriyeti'ni kurdu",
    "Entity X'in müşteri hizmetleri hızlı ve etkili Entity Y'nin ürün kalitesi çok kötü",
    "Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır @Twitch @Kick_turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim @Turkcell"
] + text_dataset

# Tag lists for the five sentences above, in the same order, followed by
# `label_dataset`; each list is meant to hold one tag per whitespace token.
labels = [
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'O', 'O', 'O', 'O','O'],
    ['entity', 'entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'entity', 'entity', 'O','O','O','O'],
    ['O','O','entity','O','O','O','O','entity','entity','O','O','O','O','O','O','O','O','O','entity','entity','entity','entity','O','O','O','O','O','O','O','O','O','O','entity']
] + label_dataset






In [None]:
# Tag id 0 stays "O"; ids 1-6 are all folded into one generic "entity" label.
ner_encoding = {tag_id: ("O" if tag_id == 0 else "entity") for tag_id in range(7)}

# Sentences (space-joined tokens) and their tag-name lists, train split first, then test.
text_dataset = []
label_dataset = []
for split_name in ("train", "test"):
  for sample in dataset[split_name]:
    text_dataset.append(' '.join(sample["tokens"]))
    label_dataset.append([ner_encoding[a] for a in sample["ner_tags"]])

In [None]:
from datasets import load_dataset, load_from_disk


# Load the "tr" configuration of the "wikiann" dataset through `load_dataset`.
# NOTE(review): `load_from_disk` is imported above but is not used here.
dataset = load_dataset("wikiann", "tr")

# Print the loaded object to inspect its splits, features and row counts.
print(dataset)


In [None]:
# Tag id 0 stays "O"; ids 1-6 are all folded into one generic "entity" label.
ner_encoding = {tag_id: ("O" if tag_id == 0 else "entity") for tag_id in range(7)}

# Sentences (space-joined tokens) and their tag-name lists, train split first, then test.
text_dataset = []
label_dataset = []
for split_name in ("train", "test"):
  for sample in dataset[split_name]:
    text_dataset.append(' '.join(sample["tokens"]))
    label_dataset.append([ner_encoding[a] for a in sample["ner_tags"]])

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchcrf import CRF

# Use the GPU when one is available; otherwise fall back to the CPU so that the
# `.to(device)` calls further down this cell still work.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class NERDataset(Dataset):
    """Torch dataset pairing whitespace-tokenised sentences with per-token tags.

    Args:
        texts: Sequence of sentences; each one is split on whitespace when indexed.
        labels: Sequence of tag-name lists, one list per sentence.
        vocab: Mapping from word to integer id; '<UNK>' is looked up for each token
            as the fallback id.
        tag_to_idx: Mapping from tag name to integer id.
    """

    def __init__(self, texts, labels, vocab, tag_to_idx):
        self.texts, self.labels = texts, labels
        self.vocab, self.tag_to_idx = vocab, tag_to_idx

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` for sentence ``idx`` as two long tensors."""
        word_ids = []
        for token in self.texts[idx].split():
            # Words missing from the vocabulary fall back to the <UNK> id.
            word_ids.append(self.vocab.get(token, self.vocab['<UNK>']))

        tag_ids = []
        for tag_name in self.labels[idx]:
            tag_ids.append(self.tag_to_idx[tag_name])

        return (
            torch.tensor(word_ids, dtype=torch.long),
            torch.tensor(tag_ids, dtype=torch.long),
        )

class BiLSTM_CRF(nn.Module):
    """Embedding -> 2-layer bidirectional LSTM -> linear projection -> CRF tagger.

    Args:
        vocab_size: Number of rows in the embedding table.
        tag_to_idx: Mapping from tag name to tag index; its length sets the
            number of CRF tags.
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the concatenated forward+backward LSTM output
            (each direction uses ``hidden_dim // 2`` units).
    """

    def __init__(self, vocab_size, tag_to_idx, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_idx = tag_to_idx
        self.tagset_size = len(tag_to_idx)

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: the collate function and the inference cells feed
        # (batch, seq_len) index tensors and the CRF is batch-first as well.
        # Without it the LSTM treats the batch axis as the time axis.
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim // 2,
            num_layers=2,
            bidirectional=True,
            dropout=0.5,
            batch_first=True,
        )
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def forward(self, sentence, tags=None):
        """Score a batch of sentences.

        Args:
            sentence: Long tensor of word ids, batch-first; id 0 is treated as padding.
            tags: Optional long tensor of gold tag ids with the same layout.

        Returns:
            The negated CRF output (used as the training loss) when ``tags`` is
            given, otherwise the result of ``self.crf.decode``.
        """
        embedded = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embedded)
        tag_scores = self.hidden2tag(lstm_out)

        # Positions holding the padding id are excluded from both the loss and decoding.
        mask = sentence != 0
        if tags is not None:
            return -self.crf(tag_scores, tags, mask=mask)
        return self.crf.decode(tag_scores, mask=mask)

def prepare_data(texts, labels):
    """Build the word vocabulary and the fixed two-tag index.

    Args:
        texts: Iterable of sentences; words are obtained with ``str.split()``.
        labels: Accepted for symmetry with the call sites; not read here.

    Returns:
        ``(vocab, tag_to_idx)``. ``vocab`` starts with '<PAD>' -> 0 and
        '<UNK>' -> 1, then assigns ids to new words in order of first appearance.
    """
    tag_to_idx = {tag: index for index, tag in enumerate(('O', 'entity'))}
    vocab = {'<PAD>': 0, '<UNK>': 1}

    for sentence in texts:
        for token in sentence.split():
            # setdefault only inserts when the token has no id yet.
            vocab.setdefault(token, len(vocab))

    return vocab, tag_to_idx

def collate_fn(batch):
    """Right-pad every sequence in the batch with zeros and stack them.

    Args:
        batch: Sequence of ``(word_ids, tag_ids)`` 1-D long tensor pairs.

    Returns:
        Two stacked tensors ``(padded_word_ids, padded_tag_ids)``. The padded
        width is the longest ``word_ids`` sequence in the batch.
    """
    token_seqs, tag_seqs = zip(*batch)
    width = max(len(seq) for seq in token_seqs)

    def _right_pad(seq):
        filler = torch.zeros(width - len(seq), dtype=torch.long)
        return torch.cat([seq, filler])

    token_rows = [_right_pad(seq) for seq in token_seqs]
    tag_rows = [_right_pad(seq) for seq in tag_seqs]
    return torch.stack(token_rows), torch.stack(tag_rows)

# Five hand-labelled sentences followed by the sentences collected in `text_dataset`.
texts = [
    "Ahmet Yılmaz 15 Mart 1990 tarihinde doğdu ve şu an 33 yaşında",
    "istanbul'un nüfusu yaklaşık 15 milyon kisidir",
    "Mustafa Kemal Atatürk 29 Ekim 1923'te Türkiye Cumhuriyeti'ni kurdu",
    "Entity X'in müşteri hizmetleri hızlı ve etkili Entity Y'nin ürün kalitesi çok kötü",
    "Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır @Twitch @Kick_turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim @Turkcell"
] + text_dataset

# One tag per whitespace-separated token of the matching entry in `texts`.
labels = [
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'O', 'O', 'O', 'O','O'],
    ['entity', 'entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'O'],
    ['entity', 'entity', 'O', 'O', 'O', 'O', 'O', 'entity', 'entity', 'O','O','O','O'],
    ['O','O','entity','O','O','O','O','entity','entity','O','O','O','O','O','O','O','O','O','entity','entity','entity','entity','O','O','O','O','O','O','O','O','O','O','entity']
] + label_dataset


# Fail fast if any sentence and its tag list disagree in length.
for i in range(len(texts)):
    text_length = len(texts[i].split())
    label_length = len(labels[i])
    if text_length != label_length:
        raise ValueError(f"Text and label lengths do not match at index {i}: {text_length} != {label_length}")

vocab, tag_to_idx = prepare_data(texts, labels)
# Keep the torch dataset under its own name: rebinding `dataset` would replace the
# Hugging Face DatasetDict that the cells indexing dataset["train"] / dataset["test"] need.
ner_dataset = NERDataset(texts, labels, vocab, tag_to_idx)
dataloader = DataLoader(ner_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

model = BiLSTM_CRF(len(vocab), tag_to_idx, embedding_dim=300, hidden_dim=600)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 3000
model.train()  # explicit training mode so the LSTM dropout is active
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        loss = model(inputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

In [None]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code stored in the file. Only load
# "entity_cikarici.ai" when it comes from a trusted source.
# The `with` block closes the file handle, which the bare open(...) call never did.
with open("entity_cikarici.ai", "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchcrf import CRF

# Named `device` (not `devide`) because the other cells call `.to(device)`.
# Falls back to the CPU when CUDA is not available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
test_sentence = "turkcell çok iyi bir hizmet ancak vodafone kotu"
# Map each whitespace token to its vocabulary id; unseen words fall back to <UNK>.
test_tensor = torch.tensor([vocab.get(word, vocab['<UNK>']) for word in test_sentence.split()], dtype=torch.long)
test_tensor = test_tensor.to(device)
# Switch to inference mode so the LSTM dropout does not randomise the prediction.
model.eval()
with torch.no_grad():
    # Add a batch axis of size 1 and take the tag sequence of that single sentence.
    best_tags = model(test_tensor.unsqueeze(0))[0]

idx_to_tag = {i: tag for tag, i in tag_to_idx.items()}
predicted_labels = [idx_to_tag[i] for i in best_tags]
print("Test cümlesi:", test_sentence)
print("Tahmin edilen etiketler:", predicted_labels)

In [None]:
Merhaba, ben Kerem Ali. İstanbul Teknik Üniversitesi Mesleki ve Teknik Anadolu Lisesi olarak katılıyoruz.
Danışman hocam ile buraya geldim. Okulumun yapay zeka takımının kaptanı olarak katılıyorum. 
Sloganımız: Sorunlar bizim için şikâyet değil, proje konusudur.

In [None]:
import torch

In [None]:
test_sentence = "türkcell çok iyi hizmet vodafone kötü"
# Map each whitespace token to its vocabulary id; unseen words fall back to <UNK>.
test_tensor = torch.tensor([vocab.get(word, vocab['<UNK>']) for word in test_sentence.split()], dtype=torch.long).to(device)
# NOTE(review): `model2` is not defined in any visible cell — confirm where it comes from.
# Switch to inference mode so the LSTM dropout does not randomise the prediction.
model2.eval()
with torch.no_grad():
    # Add a batch axis of size 1 and take the tag sequence of that single sentence.
    best_tags = model2(test_tensor.unsqueeze(0))[0]

idx_to_tag = {i: tag for tag, i in tag_to_idx.items()}
predicted_labels = [idx_to_tag[i] for i in best_tags]
print("Test cümlesi:", test_sentence)
print("Tahmin edilen etiketler:", predicted_labels)

In [None]:
# Show how many sentences `texts` currently holds.
len(texts)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchcrf import CRF

class NERDataset(Dataset):
    """Torch dataset pairing whitespace-tokenised sentences with per-token tags.

    Args:
        texts: Sequence of sentences; each one is split on whitespace when indexed.
        labels: Sequence of tag-name lists, one list per sentence.
        vocab: Mapping from word to integer id; '<UNK>' is looked up for each token
            as the fallback id.
        tag_to_idx: Mapping from tag name to integer id.
    """

    def __init__(self, texts, labels, vocab, tag_to_idx):
        self.texts, self.labels = texts, labels
        self.vocab, self.tag_to_idx = vocab, tag_to_idx

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` for sentence ``idx`` as two long tensors."""
        word_ids = []
        for token in self.texts[idx].split():
            # Words missing from the vocabulary fall back to the <UNK> id.
            word_ids.append(self.vocab.get(token, self.vocab['<UNK>']))

        tag_ids = []
        for tag_name in self.labels[idx]:
            tag_ids.append(self.tag_to_idx[tag_name])

        return (
            torch.tensor(word_ids, dtype=torch.long),
            torch.tensor(tag_ids, dtype=torch.long),
        )

class BiLSTM_CRF(nn.Module):
    """Embedding -> 2-layer bidirectional LSTM -> linear projection -> CRF tagger.

    Args:
        vocab_size: Number of rows in the embedding table.
        tag_to_idx: Mapping from tag name to tag index; its length sets the
            number of CRF tags.
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the concatenated forward+backward LSTM output
            (each direction uses ``hidden_dim // 2`` units).
    """

    def __init__(self, vocab_size, tag_to_idx, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_idx = tag_to_idx
        self.tagset_size = len(tag_to_idx)

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: the collate function and the inference code feed
        # (batch, seq_len) index tensors and the CRF is batch-first as well.
        # Without it the LSTM treats the batch axis as the time axis.
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim // 2,
            num_layers=2,
            bidirectional=True,
            dropout=0.5,
            batch_first=True,
        )
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def forward(self, sentence, tags=None):
        """Score a batch of sentences.

        Args:
            sentence: Long tensor of word ids, batch-first; id 0 is treated as padding.
            tags: Optional long tensor of gold tag ids with the same layout.

        Returns:
            The negated CRF output (used as the training loss) when ``tags`` is
            given, otherwise the result of ``self.crf.decode``.
        """
        embedded = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embedded)
        tag_scores = self.hidden2tag(lstm_out)

        # Positions holding the padding id are excluded from both the loss and decoding.
        mask = sentence != 0
        if tags is not None:
            return -self.crf(tag_scores, tags, mask=mask)
        return self.crf.decode(tag_scores, mask=mask)

def prepare_data(texts, labels):
    """Build the word vocabulary and the fixed seven-tag BIO index.

    Args:
        texts: Iterable of sentences; words are obtained with ``str.split()``.
        labels: Accepted for symmetry with the call sites; not read here.

    Returns:
        ``(vocab, tag_to_idx)``. ``vocab`` starts with '<PAD>' -> 0 and
        '<UNK>' -> 1, then assigns ids to new words in order of first appearance.
    """
    tag_names = ('O', 'B-SAYI', 'I-SAYI', 'B-TARIH', 'I-TARIH', 'B-KISI', 'I-KISI')
    tag_to_idx = {tag: index for index, tag in enumerate(tag_names)}
    vocab = {'<PAD>': 0, '<UNK>': 1}

    for sentence in texts:
        for token in sentence.split():
            # setdefault only inserts when the token has no id yet.
            vocab.setdefault(token, len(vocab))

    return vocab, tag_to_idx

def collate_fn(batch):
    """Right-pad every sequence in the batch with zeros and stack them.

    Args:
        batch: Sequence of ``(word_ids, tag_ids)`` 1-D long tensor pairs.

    Returns:
        Two stacked tensors ``(padded_word_ids, padded_tag_ids)``. The padded
        width is the longest ``word_ids`` sequence in the batch.
    """
    token_seqs, tag_seqs = zip(*batch)
    width = max(len(seq) for seq in token_seqs)

    def _right_pad(seq):
        filler = torch.zeros(width - len(seq), dtype=torch.long)
        return torch.cat([seq, filler])

    token_rows = [_right_pad(seq) for seq in token_seqs]
    tag_rows = [_right_pad(seq) for seq in tag_seqs]
    return torch.stack(token_rows), torch.stack(tag_rows)

# Hand-written sample sentences.
# NOTE(review): `texts` and `labels` are reassigned to empty lists further down this
# cell before the synthetic samples are generated, so these values are never used
# for training.
texts = [
    "Ahmet Yılmaz 15 Mart 1990 tarihinde doğdu ve şu an 33 yaşında.",
    "İstanbulun nüfusu yaklaşık 15 milyon kişidir.",
    "Mustafa Kemal Atatürk 29 Ekim 1923'te Türkiye Cumhuriyeti'ni kurdu.",
    "Everest Dağının yüksekliği 8848 metredir.",
    "2023 yılında Türkiye'nin nüfusu 84,6 milyon olarak tahmin ediliyor."
]

# BIO tags for the sentences above, one tag per whitespace-separated token.
labels = [
    ['B-KISI', 'I-KISI', 'B-SAYI', 'I-SAYI', 'I-SAYI', 'O', 'O', 'O', 'O', 'B-SAYI', 'O', 'O'],  # Adjusted for length
    ['O', 'O', 'O', 'B-SAYI', 'O', 'O'],  # Adjusted for length
    ['B-KISI', 'I-KISI', 'I-KISI', 'B-TARIH', 'I-TARIH', 'I-TARIH', 'O', 'O', 'O'],  # Already correct
    ['O', 'O', 'O', 'B-SAYI', 'O'],  # Adjusted for length
    ['B-TARIH', 'O', 'O', 'O', 'B-SAYI', 'I-SAYI', 'O', 'O', 'O']  # Already correct
]


import random
from datetime import datetime, timedelta

def generate_random_date():
    """Return a random datetime from 1900-01-01 up to, but not including, 2023-12-31."""
    first_day = datetime(1900, 1, 1)
    span_in_days = (datetime(2023, 12, 31) - first_day).days
    # randrange excludes its upper bound, so the last possible offset is span_in_days - 1.
    offset = random.randrange(span_in_days)
    return first_day + timedelta(days=offset)

def generate_random_name():
    """Return "<first name> <last name>", each part drawn at random from a fixed list."""
    first_names = ("Ahmet", "Mehmet", "Ali", "Ayşe", "Fatma", "Mustafa", "Emine", "Hüseyin", "Hatice", "İbrahim")
    last_names = ("Yılmaz", "Kaya", "Demir", "Çelik", "Şahin", "Yıldız", "Öztürk", "Aydın", "Özdemir", "Arslan")
    # Draw the first name before the last name, in that order.
    given = random.choice(first_names)
    family = random.choice(last_names)
    return given + " " + family

def generate_random_text():
    """Build one synthetic Turkish sentence from a randomly chosen template.

    Returns:
        A sentence string. Depending on the template it contains a random name,
        a random date written as "DD <Turkish month> YYYY", and/or random numbers.
    """
    # strftime("%B") follows the process locale, so it can emit non-Turkish month
    # names. The month is therefore looked up explicitly.
    months_tr = ["Ocak", "Şubat", "Mart", "Nisan", "Mayıs", "Haziran",
                 "Temmuz", "Ağustos", "Eylül", "Ekim", "Kasım", "Aralık"]
    # Genitive forms are spelled out because one fixed suffix ("nin") does not fit
    # every city name.
    city_genitives = ["İstanbul'un", "Ankara'nın", "İzmir'in", "Bursa'nın", "Antalya'nın"]

    def format_date(value):
        # Zero-padded day, Turkish month name, four-digit year.
        return f"{value.day:02d} {months_tr[value.month - 1]} {value.year}"

    templates = [
        "{} {} tarihinde doğdu ve şu an {} yaşında.",
        "{} nüfusu yaklaşık {} milyon kişidir.",
        "{} {} tarihinde {} kurdu.",
        "{} Dağının yüksekliği {} metredir.",
        "{} yılında Türkiye'nin nüfusu {} milyon olarak tahmin ediliyor."
    ]

    template = random.choice(templates)
    kind = templates.index(template)

    if kind == 0:
        name = generate_random_name()
        birth_date = generate_random_date()
        age = datetime.now().year - birth_date.year
        return template.format(name, format_date(birth_date), age)
    if kind == 1:
        return template.format(random.choice(city_genitives), random.randint(1, 20))
    if kind == 2:
        events = ["Türkiye Cumhuriyeti'ni", "TBMM'yi", "Türk Dil Kurumu'nu"]
        return template.format(generate_random_name(), format_date(generate_random_date()), random.choice(events))
    if kind == 3:
        mountains = ["Everest", "K2", "Kangchenjunga", "Lhotse", "Makalu"]
        return template.format(random.choice(mountains), random.randint(5000, 9000))

    # One decimal place with a decimal comma (e.g. "84,6") instead of a raw float repr.
    population = f"{random.uniform(80, 100):.1f}".replace(".", ",")
    return template.format(random.randint(2023, 2050), population)

def generate_labels(text):
    """Assign a heuristic BIO tag to every whitespace-separated word of ``text``.

    Rules, applied per word in order:
      * A Turkish month name is part of a date (``I-TARIH`` after a date token,
        otherwise ``B-TARIH``).
      * A number is ``I-TARIH`` right after a tagged month, ``B-TARIH`` when it
        is followed by a month name or sits next to "tarihinde"/"yılında",
        otherwise ``B-SAYI``.
      * A capitalised word following another capitalised word extends a person
        span (``B-KISI`` then ``I-KISI`` for every further word).

    The capitalisation rule is only a heuristic: any run of capitalised words is
    tagged as a person, including non-person proper nouns.

    Args:
        text: Sentence to label.

    Returns:
        A list of tag names with one entry per word (empty for empty input).
    """
    months = {"Ocak", "Şubat", "Mart", "Nisan", "Mayıs", "Haziran",
              "Temmuz", "Ağustos", "Eylül", "Ekim", "Kasım", "Aralık"}
    date_markers = {"tarihinde", "yılında"}
    date_tags = ('B-TARIH', 'I-TARIH')
    person_tags = ('B-KISI', 'I-KISI')

    def is_number(token):
        # Accept plain digits plus one decimal comma and/or point ("84,6", "84.6").
        return token.replace(",", "", 1).replace(".", "", 1).isdigit()

    words = text.split()
    labels = ['O'] * len(words)

    for i, word in enumerate(words):
        prev_word = words[i - 1] if i > 0 else None
        prev_label = labels[i - 1] if i > 0 else None
        next_word = words[i + 1] if i + 1 < len(words) else None

        if word in months:
            # Months are checked before the capitalisation rule so they are never
            # absorbed into a person span.
            labels[i] = 'I-TARIH' if prev_label in date_tags else 'B-TARIH'
        elif is_number(word):
            if prev_word in months and prev_label in date_tags:
                # Year that follows "<day> <month>".
                labels[i] = 'I-TARIH'
            elif next_word in months or next_word in date_markers or prev_word in date_markers:
                # The markers follow the date in the templates ("2023 yılında"), so the
                # next word is checked as well as the previous one.
                labels[i] = 'B-TARIH'
            else:
                labels[i] = 'B-SAYI'
        elif word[0].isupper() and prev_word is not None and prev_word[0].isupper():
            if prev_label == 'O':
                labels[i - 1] = 'B-KISI'
                labels[i] = 'I-KISI'
            elif prev_label in person_tags:
                # Keep the existing B-KISI so three-word names yield B, I, I (not B, B, I).
                labels[i] = 'I-KISI'

    return labels

# Replace the hand-written samples with 100 synthetic sentences and heuristic labels.
texts = []
labels = []

for _ in range(100):
    text = generate_random_text()
    texts.append(text)
    labels.append(generate_labels(text))

# Print the results
for i in range(100):
    print(f"Text: {texts[i]}")
    print(f"Labels: {labels[i]}")
    print()


# Fail fast if any sentence and its tag list disagree in length.
for i in range(len(texts)):
    text_length = len(texts[i].split())
    label_length = len(labels[i])
    if text_length != label_length:
        raise ValueError(f"Text and label lengths do not match at index {i}: {text_length} != {label_length}")

vocab, tag_to_idx = prepare_data(texts, labels)
# Keep the torch dataset under its own name: rebinding `dataset` would replace the
# Hugging Face DatasetDict that the cells indexing dataset["train"] / dataset["test"] need.
ner_dataset = NERDataset(texts, labels, vocab, tag_to_idx)
dataloader = DataLoader(ner_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

model = BiLSTM_CRF(len(vocab), tag_to_idx, embedding_dim=100, hidden_dim=200)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 50
model.train()  # explicit training mode so the LSTM dropout is active
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        loss = model(inputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

test_sentence = "Ayşe Kaya 10 Nisan 2000'de 25 yaşına girecek."
# Map each whitespace token to its vocabulary id; unseen words fall back to <UNK>.
test_tensor = torch.tensor([vocab.get(word, vocab['<UNK>']) for word in test_sentence.split()], dtype=torch.long)
# Switch to inference mode so the LSTM dropout does not randomise the prediction.
model.eval()
with torch.no_grad():
    # Add a batch axis of size 1 and take the tag sequence of that single sentence.
    best_tags = model(test_tensor.unsqueeze(0))[0]

idx_to_tag = {i: tag for tag, i in tag_to_idx.items()}
predicted_labels = [idx_to_tag[i] for i in best_tags]
print("Test cümlesi:", test_sentence)
print("Tahmin edilen etiketler:", predicted_labels)
