In [1]:
from __init__ import *
from dataset import IdiomDataset
from collate import collate
from model import IdiomExtractor
from bert_embedder import BERTEmbedder
from hparams import HParams
from trainer import Trainer
from utils import *
import torch.nn.functional as F


SEED = 2
# set seeds to get reproducible results
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# gpuda bazen randomluk olabiliyormuş onu kaldırmak için
torch.backends.cudnn.deterministic = True

# create bert
it_model_name = 'bert-base-multilingual-cased'
# output hidden states -> it helps to get hidden states from bert
it_config = BertConfig.from_pretrained(it_model_name, output_hidden_states=True)
it_tokenizer = BertTokenizer.from_pretrained(it_model_name)
# get bert weights
hf_it_model = BertModel.from_pretrained(it_model_name, config=it_config)


# Türkçe BERT
tr_model_name = "dbmdz/bert-base-turkish-128k-cased"
tr_config = BertConfig.from_pretrained(tr_model_name, output_hidden_states=True)
tr_tokenizer = BertTokenizer.from_pretrained(tr_model_name)
hf_tr_model = BertModel.from_pretrained(tr_model_name, config=tr_config)

# train, update or test mode selection
#mode = input("Do you want to train or test the model? (train, update, test): ").strip().lower()
mode = "train"
assert mode in ['train', 'update', 'test'], "Mode must be one of train, update, test"
# select the dataset
#dataset_selection = input("Select the dataset (ID10M, ITU, PARSEME, ALL_COMBINED): ").strip().upper()
dataset_selection = "ITU"
assert dataset_selection in ['ID10M', 'ITU', 'PARSEME', 'COMBINED'], "Dataset must be one of ID10M, ITU, PARSEME, COMBINED"

# check dataset path
tr_path = r"./src/checkpoints/tr/"
it_path = r"./src/checkpoints/it/"
os.makedirs(tr_path, exist_ok=True)
os.makedirs(it_path, exist_ok=True)

if mode in ["test","update"]:
    # list available checkpoints
    print("Available tr checkpoints:")
    checkpoints = os.listdir(tr_path)
    for i, checkpoint in enumerate(checkpoints):
        print(f"{i+1}. {checkpoint}")
    print("none")
    # load the model
    checkpoint = input("Enter the checkpoint (without .pt): ").strip()
    if checkpoint == "none":
        tr_path = None
    else:
        tr_path = tr_path + checkpoint + ".pt"
        assert os.path.exists(tr_path), "Model path does not exist"

    print("\n")

    print("Available it checkpoints:")
    checkpoints = os.listdir(it_path)
    for i, checkpoint in enumerate(checkpoints):
        print(f"{i+1}. {checkpoint}")
    print("none")
    # load the model
    checkpoint = input("Enter the checkpoint (without .pt): ").strip()
    if checkpoint == "none":
        it_path = None
    else:
        it_path = it_path + checkpoint + ".pt"
        assert os.path.exists(it_path), "Model path does not exist"

model_name = None
if mode in ["train", "update"]:
    #model_name = input("Enter the model name (without .pt): ").strip()
    model_name = "deneme"

elif mode == "test":
    model_name = checkpoint

# get stanza tagger for both languages
tagger_dict = initialize(use_gpu=True)

# get the path for the dataset
main_path = r"../resources/"+dataset_selection+"/"
train_file = main_path + "train.tsv"
dev_file = main_path + "dev.tsv"
test_file = main_path + "test.tsv"

labels_vocab = {"<pad>":0, "B-IDIOM":1, "I-IDIOM":2, "O":3}

# initialize the dataset
train_dataset, dev_dataset, test_dataset = None, None, None
if mode in ["train", "update"]:
    train_dataset = IdiomDataset(train_file, labels_vocab, tagger_dict)
    dev_dataset = IdiomDataset(dev_file, labels_vocab, tagger_dict)
    print(f"train sentences: {len(train_dataset)}")
    print(f"dev sentences: {len(dev_dataset)}")
    print("-" * 50 + "\n")
else:
    test_dataset = IdiomDataset(test_file, labels_vocab, tagger_dict) 
    print(f"test sentences: {len(test_dataset)}")
    print("-" * 50 + "\n")

#dataloader

if mode in ["train", "update"]:
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate)
    dev_dataloader = DataLoader(dev_dataset, batch_size=16, collate_fn=collate)
    print(f"length of train dataloader: {len(train_dataloader)}")
    print(f"length of dev dataloader: {len(dev_dataloader)}")
else:
    test_dataloader = DataLoader(test_dataset, batch_size=1, collate_fn=collate)
    print(f"length of test dataloader: {len(test_dataloader)}")


#instantiate the hyperparameters
params = HParams()

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#instantiate the model
it_model = IdiomExtractor(hf_it_model,
                    params).cuda()

it_model.freeze_bert()

tr_model = IdiomExtractor(hf_tr_model,
                    params).cuda()

tr_model.freeze_bert()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

it_embedder =  BERTEmbedder(hf_it_model, it_tokenizer, device)
tr_embedder =  BERTEmbedder(hf_tr_model, tr_tokenizer, device)


  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


KeyboardInterrupt: 

In [None]:
example_sentence = next(iter(train_dataloader))

In [None]:
words, labels, langs = example_sentence

# embedding'i aynı size a getirmek için lazım
# labellar beraber padleniyor embeddingler ayrı
# yani en uzun tr cümle 14, it cümle 12 uzunluğundaysa
# it label 14, it embedding 12 boyutunda oluyor hata veriyor.
global_max = labels.size(1)

print(f"shape of words: {len(words)}, example: {words[0]}")
print(f"shape of labels: {len(labels)}, example: {labels[0]}")
print(f"shape of langs: {len(langs)}, example: {langs}")
print(global_max)

shape of words: 16, example: ['Zaman', 'kazanmak', 'için', 'yaptığın', 'entrikalar', 'seni', 'kurtarmayacak', ',', 'eninde', 'sonunda', 'yakalayacak', 'seni', 'polis', '!']
shape of labels: 16, example: tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
shape of langs: 16, example: tensor([0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1], device='cuda:0')
14


In [None]:
tr_indices = (langs == 0).nonzero(as_tuple=True)[0]
it_indices = (langs == 1).nonzero(as_tuple=True)[0]
print(f"tr_indices: {tr_indices}")
print(f"it_indices: {it_indices}")

tr_indices: tensor([ 0,  1,  5,  7, 10, 11, 12], device='cuda:0')
it_indices: tensor([ 2,  3,  4,  6,  8,  9, 13, 14, 15], device='cuda:0')


In [None]:
print(f"labels: {labels}")

labels: tensor([[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0],
        [1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0],
        [2, 3, 1, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [3, 1, 2, 2, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 1, 2, 3, 0, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0],
        [3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0],
        [3, 3, 3, 1, 3, 2, 2, 2, 0, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 1, 2, 2, 0, 0, 0]], device='cuda:0')


In [None]:
tr_words = [words[i] for i in tr_indices.cpu().numpy()]
tr_labels = labels[tr_indices] # 0 1 5 7 10 11 12. labellar tr dilinin labelları

print(f"length of tr_words: {len(tr_words)}, example: {tr_words[0]}")
print(f"shape of tr_labels: {tr_labels.shape}, example: {tr_labels}")

length of tr_words: 7, example: ['Zaman', 'kazanmak', 'için', 'yaptığın', 'entrikalar', 'seni', 'kurtarmayacak', ',', 'eninde', 'sonunda', 'yakalayacak', 'seni', 'polis', '!']
shape of tr_labels: torch.Size([7, 14]), example: tensor([[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0],
        [2, 3, 1, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 1, 2, 3, 0, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0]], device='cuda:0')


In [None]:
# embed tr list of embeddings herbirisi 10,seq_length,768 -> [tr_batch_size, seq_length, 768]
tr_embedded = tr_embedder.embed_sentences(tr_words)
print(f"shape of embedded_tr: {len(tr_embedded)}, example: {tr_embedded[0].shape}")
print(f"shape of embedded_tr: {len(tr_embedded)}, example: {tr_embedded[1].shape}")

shape of embedded_tr: 7, example: torch.Size([14, 768])
shape of embedded_tr: 7, example: torch.Size([10, 768])


In [None]:
# tr embedded'i -1le padleyerek seq_lengthi eşitliyoruz. -> [tr_batch_size, max_seq_length, 768]
tr_embs = pad_sequence(tr_embedded, batch_first=True, padding_value=0).to(device)

# check embedding size to match the labels
if tr_embs.size(1) < global_max:
    # ekstra pad lazım
    pad_size = global_max - tr_embs.size(1)

    tr_embs = F.pad(tr_embs, (0, 0, 0, pad_size), "constant", 0)


print(f"shape of tr_embs: {tr_embs.shape}, example: {tr_embs[0]}")

shape of tr_embs: torch.Size([7, 14, 768]), example: tensor([[ 0.3040, -1.6375,  0.9913,  ...,  0.6738,  1.7467,  0.5538],
        [-0.4647,  0.5055, -0.9381,  ...,  3.0271,  3.8738, -0.3686],
        [ 3.6571, -1.3875, -0.9558,  ...,  1.5002,  2.6590,  1.7305],
        ...,
        [-2.0102, -1.6403, -0.8219,  ..., -1.0591,  1.9849, -4.5420],
        [-0.4976, -2.3514,  1.0389,  ...,  1.2063,  2.7370, -4.2313],
        [ 0.6042, -0.8540,  0.2265,  ..., -1.0613,  1.4576, -5.0178]],
       device='cuda:0')


In [None]:
it_words = [words[i] for i in it_indices.cpu().numpy()]
it_labels = labels[it_indices]

print(f"shape of it_labels: {it_labels.shape}, example: {it_labels}")
print(f"shape of it_words: {len(it_words)}, example: {it_words[0]}")

shape of it_labels: torch.Size([9, 14]), example: tensor([[1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0],
        [3, 1, 2, 2, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0],
        [3, 3, 3, 1, 3, 2, 2, 2, 0, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 1, 2, 2, 0, 0, 0]], device='cuda:0')
shape of it_words: 9, example: ['Prendi', 'con', 'le', 'pinze', 'quello', 'che', 'non', 'è', 'detto', 'da', 'esperti']


In [None]:
# embed tr list of embeddings herbirisi 10,seq_length,768 -> [tr_batch_size, seq_length, 768]
it_embedded = it_embedder.embed_sentences(it_words)
print(f"shape of embedded_it: {len(it_embedded)}, example: {it_embedded[0].shape}")
print(f"shape of embedded_it: {len(it_embedded)}, example: {it_embedded[1].shape}")

shape of embedded_it: 9, example: torch.Size([11, 768])
shape of embedded_it: 9, example: torch.Size([9, 768])


In [None]:
it_embs = pad_sequence(it_embedded, batch_first=True, padding_value=0).to(device)

# check embedding size to match the labels
if it_embs.size(1) < global_max:
    # ekstra pad lazım
    pad_size = global_max - it_embs.size(1)

    it_embs = F.pad(it_embs, (0, 0, 0, pad_size), "constant", 0)

print(f"shape of it_embs: {it_embs.shape}, example: {it_embs[0]}")

shape of it_embs: torch.Size([9, 14, 768]), example: tensor([[ 1.9155,  0.1461, -1.2802,  ...,  0.8907,  0.8552,  0.4204],
        [ 1.1491,  3.5726,  0.6087,  ..., -0.1244,  2.5063,  1.2722],
        [ 0.5323, -0.2845,  0.3650,  ..., -2.2384,  2.5075,  1.3367],
        ...,
        [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')


In [None]:
tr_LL, _ = tr_model(tr_embs, tr_labels)
tr_NLL = -tr_LL.sum() / len(tr_indices)

it_LL,_ = it_model(it_embs, it_labels)
it_NLL = -it_LL.sum() / len(it_indices)

print(f"tr_NLL: {tr_NLL}")
print(f"it_NLL: {it_NLL}")
loss = tr_NLL + it_NLL
print(f"loss: {loss}")



tr_optimizer = optim.Adam(tr_model.parameters(), lr=0.0001)
it_optimizer = optim.Adam(it_model.parameters(), lr=0.0001)

# Optimizer step
tr_optimizer.zero_grad()
it_optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(tr_model.parameters(), 1)
torch.nn.utils.clip_grad_norm_(it_model.parameters(), 1)
tr_optimizer.step()
it_optimizer.step()


tr_NLL: 14.678117752075195
it_NLL: 17.433109283447266
loss: 32.111228942871094


In [None]:
# get one batch from dev set
eval_data = next(iter(dev_dataloader))
words, labels, langs = eval_data

# tr ve it nin indexlerini ayıkla
tr_indices = (langs == 0).nonzero(as_tuple=True)[0]
it_indices = (langs == 1).nonzero(as_tuple=True)[0]

batch_size, seq_len = labels.shape
device = labels.device
global_max = seq_len

hidden_size = tr_embedder.bert_model.config.hidden_size

# TR cümleleri ayıkla, embeddle ve petle
tr_words  = [words[i] for i in tr_indices.cpu().numpy()]
tr_labels = labels[tr_indices]
# türkçe cümle gelmiş mi kontrolü
if len(tr_words) > 0:
    tr_embedded = tr_embedder.embed_sentences(tr_words)
    tr_embs     = pad_sequence(tr_embedded, batch_first=True, padding_value=0).to(device)
    if tr_embs.size(1) < global_max:
        # ekstra pad lazım
        pad_size = global_max - tr_embs.size(1)
        tr_embs  = F.pad(tr_embs, (0, 0, 0, pad_size), "constant", 0)
else:
    # hiç TR cümlesi yoksa boş batch oluştur
    tr_embs = torch.zeros((0, global_max, hidden_size), device=device)

# IT cümleleri ayıkla, embeddle ve petle
it_words    = [words[i] for i in it_indices.cpu().numpy()]
it_labels   = labels[it_indices]
if len(it_words) > 0:
    it_embedded = it_embedder.embed_sentences(it_words)
    it_embs     = pad_sequence(it_embedded, batch_first=True, padding_value=0).to(device)
    if it_embs.size(1) < global_max:
        # ekstra pad lazım
        pad_size = global_max - it_embs.size(1)
        it_embs  = F.pad(it_embs, (0, 0, 0, pad_size), "constant", 0)
else:
    # hiç IT cümlesi yoksa boş batch oluştur
    it_embs = torch.zeros((0, global_max, hidden_size), device=device)

# take predictions
# inference: forward(…, None) dönen list of lists
tr_decode = tr_model(tr_embs, None)
it_decode = it_model(it_embs, None)

# alt kısım GPT bakmak lazım 

def decode_to_tensor(decode_out, seq_len, device):
    # 1) list of lists → list of 1D tensors
    token_tensors = [torch.tensor(seq, dtype=torch.long, device=device)
                     for seq in decode_out]
    # 2) hiç prediction yoksa boş tensor
    if not token_tensors:
        return torch.zeros((0, seq_len), dtype=torch.long, device=device)
    # 3) pad_sequence ile batch_first ve padding_value=-1
    padded = pad_sequence(token_tensors, batch_first=True, padding_value=-1)
    # 4) eğer hâlâ seq_len’den kısa ise sağa pad et
    if padded.size(1) < seq_len:
        pad_amt = seq_len - padded.size(1)
        padded = F.pad(padded, (0, pad_amt), value=-1)
    return padded

# kullanım:
tr_pred = decode_to_tensor(tr_decode, seq_len, device)
it_pred = decode_to_tensor(it_decode, seq_len, device)


# list of lists → tensor’a çevir
if len(tr_decode) > 0:
    tr_pred = torch.full((len(tr_decode), seq_len),
                         fill_value=-1, dtype=torch.long, device=device)
    for i, seq in enumerate(tr_decode):
        tr_pred[i, :len(seq)] = torch.tensor(seq, device=device)
else:
    tr_pred = torch.zeros((0, seq_len), dtype=torch.long, device=device)

if len(it_decode) > 0:
    it_pred = torch.full((len(it_decode), seq_len),
                         fill_value=-1, dtype=torch.long, device=device)
    for i, seq in enumerate(it_decode):
        it_pred[i, :len(seq)] = torch.tensor(seq, device=device)
else:
    it_pred = torch.zeros((0, seq_len), dtype=torch.long, device=device)

# 1) Tam batch için pred tensor’ü oluştur
all_pred = torch.full(
    (batch_size, seq_len),
    fill_value=-1,                # pad bölgelerde -1 kalsın
    dtype=torch.long,
    device=device
)

# 2) TR ve IT tahminlerini ilgili index’lere yerleştir
all_pred[tr_indices] = tr_pred
all_pred[it_indices] = it_pred

# 3) Sadece gerçek token’lar üzerinde doğru/yanlış kontrolü
valid_mask   = labels.ne(-1)        # True olan yerler gerçek token
correct_mask = (all_pred == labels) & valid_mask

# 4) İsterseniz token‐seviye accuracy
token_accuracy = correct_mask.sum().float() / valid_mask.sum().float()
print(f"Token‐seviye accuracy: {token_accuracy:.4f}")

# 5) Veya örnek‐seviye (sentence‐seviye) doğru karar:
#    her cümlenin tüm token’ları doğru mu?
sent_correct     = correct_mask.all(dim=1) & valid_mask.any(dim=1)
# valid_mask.any(dim=1) ile “tamamen pad’li” örnekleri atlıyoruz
sentence_accuracy = sent_correct.sum().float() / sent_correct.numel()
print(f"Sentence‐seviye accuracy: {sentence_accuracy:.4f}")

# all_pred şimdi (batch_size, seq_len) shape’inde, 
# her satır orijinal sırasıyla tahminleri içeriyor.


Token‐seviye accuracy: 0.3403
Sentence‐seviye accuracy: 0.0000


In [None]:
all_pred

tensor([[0, 1, 0, 0, 3, 3, 0, 0, 0, 1, 0, 2, 3, 2, 3, 2, 3, 3],
        [0, 0, 0, 0, 1, 0, 3, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 0],
        [1, 3, 2, 0, 2, 0, 0, 3, 0, 2, 2, 3, 1, 2, 3, 0, 0, 0],
        [2, 2, 0, 3, 2, 3, 0, 1, 1, 0, 3, 1, 0, 0, 0, 0, 1, 0],
        [1, 0, 3, 0, 3, 2, 2, 3, 2, 0, 3, 3, 2, 0, 0, 1, 1, 2],
        [0, 3, 0, 3, 0, 0, 0, 0, 1, 2, 3, 1, 3, 2, 3, 0, 3, 0],
        [3, 0, 2, 0, 0, 0, 3, 3, 2, 3, 1, 1, 2, 2, 3, 0, 0, 3],
        [1, 1, 0, 1, 0, 1, 3, 3, 0, 2, 3, 0, 0, 1, 0, 2, 0, 2],
        [2, 2, 0, 3, 0, 0, 3, 0, 3, 2, 0, 3, 0, 3, 2, 3, 2, 0],
        [3, 0, 0, 0, 3, 2, 2, 3, 2, 3, 3, 0, 3, 2, 3, 2, 0, 0],
        [3, 1, 3, 2, 2, 3, 2, 1, 3, 0, 3, 0, 0, 1, 3, 0, 3, 2],
        [0, 0, 1, 3, 3, 2, 2, 1, 1, 0, 3, 0, 0, 2, 3, 0, 1, 1],
        [3, 3, 2, 3, 0, 3, 2, 0, 2, 0, 0, 0, 0, 1, 2, 3, 3, 3],
        [0, 0, 1, 2, 2, 1, 0, 2, 3, 0, 0, 3, 0, 3, 0, 1, 3, 3],
        [0, 2, 3, 0, 3, 3, 0, 2, 3, 0, 2, 0, 0, 0, 3, 3, 2, 0],
        [2, 0, 0, 3, 2, 2, 3, 1, 3, 2, 3