In [1]:
import torch

In [3]:
# Create a random tensor with 2 rows and 3 columns, then display it.
a = torch.rand(2, 3)
a

tensor([[0.3185, 0.5773, 0.3334],
        [0.5209, 0.9920, 0.7909]])

In [4]:
# Star-unpacking iterates over the first dimension of `a`,
# giving a Python list with one 1-D tensor per row.
[*a]

[tensor([0.3185, 0.5773, 0.3334]), tensor([0.5209, 0.9920, 0.7909])]

In [5]:
# Concatenating the unpacked row tensors along dim 0 joins them
# end to end into a single 1-D tensor holding all 6 values.
torch.cat([*a], dim=0)

tensor([0.3185, 0.5773, 0.3334, 0.5209, 0.9920, 0.7909])

In [2]:
import random
import math
import torch
from torch.utils.data import Dataset, DataLoader

# A minimal dataset backed by an in-memory, indexable container
class MyDataset(Dataset):
    """Thin ``Dataset`` wrapper that serves samples straight from ``data``."""

    def __init__(self, data):
        # Keep a reference to the caller's container (no copy is made).
        self.data = data

    def __len__(self):
        """Return how many samples the wrapped container holds."""
        samples = self.data
        return len(samples)

    def __getitem__(self, index):
        """Return the sample stored at position ``index``."""
        sample = self.data[index]
        return sample

# Custom collate_fn
def my_collate_fn(batch):
    """Collate (source, target) pairs, ordered by source length, longest first.

    Args:
        batch: A sequence of samples, each indexable as ``(src, tgt)``
            where ``src`` supports ``len()``.

    Returns:
        A tuple ``(src_sents, tgt_sents)`` of two lists aligned by position.
        Samples whose sources have equal length keep their relative order.
        An empty batch yields ``([], [])``.

    The input is not modified: ``sorted`` builds a new list, so the
    caller's batch keeps its original order and non-list sequences
    (e.g. tuples) are accepted as well.
    """
    # Sort the batch by source-sentence length, longest first.
    ordered = sorted(batch, key=lambda pair: len(pair[0]), reverse=True)

    # Separate the source sentences from the target sentences.
    src_sents = [pair[0] for pair in ordered]
    tgt_sents = [pair[1] for pair in ordered]

    # Sentences are returned as-is (token lists); converting them to
    # tensors, if needed, is left to the caller.
    return src_sents, tgt_sents

# Example data: each sample is a (source tokens, target tokens) pair
data = [(['hello', 'world'], ['你好', '世界']),
        (['goodbye'], ['再见']),
        (['how', 'are', 'you'], ['你', '好吗']),
        (['I', 'am', 'fine'], ['我', '很好'])]

# Create the dataset instance
dataset = MyDataset(data)

# Create the DataLoader, passing in the custom collate_fn.
# NOTE: shuffle=True and no seed is set in this cell, so which samples
# end up in which batch can differ between runs.
data_loader = DataLoader(dataset, batch_size=2, collate_fn=my_collate_fn, shuffle=True)

# Iterate over the DataLoader and inspect the output
for batch_src, batch_tgt in data_loader:
    print('Batch src:', batch_src)
    print('Batch tgt:', batch_tgt)


Batch src: [['how', 'are', 'you'], ['goodbye']]
Batch tgt: [['你', '好吗'], ['再见']]
Batch src: [['I', 'am', 'fine'], ['hello', 'world']]
Batch tgt: [['我', '很好'], ['你好', '世界']]


In [2]:
from collections import Counter
import numpy as np
import math
from typing import List

def compute_ngram(sentence, n: int):
    """Count every contiguous n-gram of ``sentence``.

    Args:
        sentence: A sliceable sequence of tokens.
        n: The n-gram order (window width).

    Returns:
        A ``Counter`` mapping each n-gram, stored as a tuple of tokens,
        to the number of times it occurs. The counter is empty when the
        sentence is shorter than ``n``.
    """
    # One window per valid start position; the range is empty if the
    # sentence has fewer than n tokens.
    window_starts = range(len(sentence) - n + 1)
    return Counter(tuple(sentence[start: start + n]) for start in window_starts)

def compute_blue(candidate, references: List[str], weight=(0.25,) * 4):
    """Compute a sentence-level BLEU score on whitespace-split tokens.

    Args:
        candidate: The candidate sentence as a single string.
        references: One or more reference sentences, each a string.
        weight: Per-order weights; ``weight[k]`` applies to the
            (k + 1)-gram precision. Its length sets the highest n-gram
            order used, and the weights must sum to 1. Defaults to
            uniform weights over orders 1 to 4.

    Returns:
        ``BP * exp(sum_k weight[k] * log(p_k))``, where ``p_k`` is the
        clipped n-gram precision and ``BP`` the brevity penalty. The
        result is 0 when any precision is 0.

    Raises:
        AssertionError: If the weights do not sum to 1.
        ValueError: If ``references`` is empty.

    Side effects:
        Prints the brevity penalty to stdout.
    """
    weights = list(weight)
    # Compare with a tolerance: exact float equality rejects valid weights
    # such as [0.1] * 10, whose sum is 0.9999999999999999.
    assert math.isclose(sum(weights), 1.0, rel_tol=0.0, abs_tol=1e-9), "Weights must sum up to 1"

    references = list(references)
    if not references:
        # Without this guard the max()/min() calls below fail with an
        # unhelpful "empty sequence" ValueError.
        raise ValueError("At least one reference sentence is required")

    # Tokenise once instead of once per n-gram order.
    candidate_tokens = candidate.split()
    reference_tokens = [ref.split() for ref in references]

    # Clipped n-gram precision for every order that has a weight.
    p = []
    for order in range(1, len(weights) + 1):
        candidate_counter = compute_ngram(candidate_tokens, n=order)
        ref_counters = [compute_ngram(tokens, n=order) for tokens in reference_tokens]

        # Credit each candidate n-gram at most as often as it appears in
        # the single reference where it is most frequent.
        numerator = sum(
            min(count, max(ref_counter[n_gram] for ref_counter in ref_counters))
            for n_gram, count in candidate_counter.items()
        )
        denominator = sum(candidate_counter.values())
        p.append(numerator / denominator if denominator != 0 else 0)

    # Brevity penalty: use the reference length closest to the candidate
    # length; ties are resolved in favour of the shorter reference.
    len_candidate = len(candidate_tokens)
    reference_lengths = [len(tokens) for tokens in reference_tokens]
    closest_ref_len = min(reference_lengths,
                          key=lambda ref_len: (abs(ref_len - len_candidate), ref_len))
    if len_candidate > closest_ref_len:
        BP = 1
    else:
        # An empty candidate gets a penalty of 0 rather than dividing by zero.
        BP = math.exp(1 - closest_ref_len / len_candidate) if len_candidate != 0 else 0
    # Diagnostic output kept so existing callers see the same stdout.
    print(BP)

    # Weighted geometric mean of the precisions; log(0) is undefined, so
    # any zero precision makes the whole score 0.
    if min(p) > 0:
        bleu_score = BP * math.exp(sum(w * math.log(p_k) for w, p_k in zip(weights, p)))
    else:
        bleu_score = 0

    return bleu_score


In [3]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

def test_compute_blue():
    """Check the custom BLEU implementation against NLTK's ``sentence_bleu``.

    Both scores are printed and then compared, so a disagreement fails
    loudly instead of relying on someone eyeballing the output.

    Raises:
        AssertionError: If the two scores differ by 1e-6 or more.
    """
    candidate = "I am Muqi Li"
    references = ["I am Muqi Li hello world"]

    # Custom BLEU calculation
    blue_score = compute_blue(candidate, references)

    # NLTK BLEU calculation (expects pre-tokenised input)
    candidate_tokens = candidate.split()
    reference_tokens = [ref.split() for ref in references]
    nltk_bleu_score = sentence_bleu(reference_tokens, candidate_tokens)

    print(f"Custom BLEU: {blue_score:.4f}")
    print(f"NLTK BLEU: {nltk_bleu_score:.4f}")

    # Print first, assert second, so both values are visible on failure.
    assert abs(blue_score - nltk_bleu_score) < 1e-6, (
        f"custom BLEU {blue_score!r} disagrees with NLTK {nltk_bleu_score!r}"
    )

test_compute_blue()


0.6065306597126334
Custom BLEU: 0.6065
NLTK BLEU: 0.6065


-- --

In [18]:
from torch import nn
import torch

In [19]:
# Embedding layer built with 200 as its first argument (the number of
# entries in the table); each entry is a 12-dimensional vector.
embedding_layer = nn.Embedding(200, embedding_dim=12)

In [21]:
# Look up the vectors for indices 12 and 21; the result has one
# 12-dimensional row per index.
embedding_layer(torch.LongTensor([12, 21]))

tensor([[-0.3933,  0.2923,  1.0104,  0.7737, -0.0134, -2.3021, -0.1026, -1.4181,
          0.2057, -0.5339,  0.7247,  2.0076],
        [ 0.6525,  1.5984,  0.3935, -1.4660, -0.3545,  1.0624, -1.2419, -0.3716,
         -1.1960,  0.4022,  0.3508, -0.6016]], grad_fn=<EmbeddingBackward0>)

In [23]:
# Inspect the element type of a LongTensor (displayed as torch.int64).
torch.LongTensor([12, 21]).dtype

torch.int64