TransformerとTokenizerは.pyで保存済みなため学習ループの実装する

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from GPT2 import Transformer, ModelConfig
from Tokenizer import Tokenizer

# データセットの作成
class TextDataset(Dataset):
    def __init__(self, text, tokenizer, block_size):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.tokens = self.tokenizer.encode(text, eot=True)

    def __len__(self):
        return len(self.tokens) - self.block_size

    def __getitem__(self, idx):
        return torch.tensor(self.tokens[idx:idx+self.block_size]), torch.tensor(self.tokens[idx+1:idx+self.block_size+1])

# データセットの読み込み
# 相対パスを指定してテキストファイルを読み込む
file_path1 = "../data/mini_ptb.train.txt"
file_path2 = "../data/mini_ptb.valid.txt"
file_path3 = "../data/mini_ptb.test.txt"

# ファイルを開いて内容を読み込む
with open(file_path1, "r", encoding="utf-8") as file:
    mini_ptb_train_text = file.read()

with open(file_path2, "r", encoding="utf-8") as file:
    mini_ptb_valid_text = file.read()

with open(file_path3, "r", encoding="utf-8") as file:
    mini_ptb_test_text = file.read()



# Tokenizerの初期化
unique_chars_in_train_text = sorted(list(set(mini_ptb_train_text)))
tokenizer = Tokenizer(unique_chars_in_train_text)

# モデルの初期化
device = 'cuda' if torch.cuda.is_available() else 'cpu'
block_size = 128
vocab_size = len(tokenizer.str_to_idx)
config = ModelConfig(
    block_size=128,    # シーケンス長
    vocab_size= vocab_size,  # ボキャブラリサイズ（例: GPTのボキャブラリ）
    n_layer=12,        # 層数
    n_embd=768,        # 埋め込み次元
    n_head=12,         # アテンションヘッド
    dropout=0.1        # ドロップアウト率
)

transformer = Transformer(config).to(device)

# データセットの作成
train_dataset = TextDataset(mini_ptb_train_text, tokenizer, block_size)
valid_dataset = TextDataset(mini_ptb_valid_text, tokenizer, block_size)
test_dataset = TextDataset(mini_ptb_test_text, tokenizer, block_size)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# 学習ループの実装
def train(model, dataloader, optimizer, epochs, device):
    model.train()  # モデルを訓練モードに設定
    losses = []

    for epoch in range(epochs):
        running_loss = 0.0
        for i, (input_ids, target_ids) in enumerate(dataloader):
            input_ids, target_ids = input_ids.to(device), target_ids.to(device)

            # モデルの順伝播
            logits, loss = model(input_ids, targets=target_ids)

            # 損失の計算と逆伝播
            optimizer.zero_grad()  # 勾配の初期化
            loss.backward()  # 逆伝播
            optimizer.step()  # パラメータ更新

            running_loss += loss.item()
            losses.append(loss.item())

            if i % 100 == 99:  # 100ステップごとに進捗を表示
                print(f"Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(dataloader)}], Loss: {running_loss / 100:.4f}")
                running_loss = 0.0

    print("学習完了")
    return losses

  return torch._C._cuda_getDeviceCount() > 0


トランスフォーマーの総パラメータ数: 85.62M


In [3]:
optimizer = torch.optim.AdamW(transformer.parameters(), lr=3e-4)
train(transformer, train_loader, optimizer, epochs=1, device=device)

# モデルの評価
def evaluate(model, dataloader, device):
    model.eval()  # モデルを評価モードに設定
    total_loss = 0.0
    with torch.no_grad():
        for input_ids, target_ids in dataloader:
            input_ids, target_ids = input_ids.to(device), target_ids.to(device)

            # モデルの順伝播
            logits, loss = model(input_ids, targets=target_ids)
            total_loss += loss.item()

    return total_loss / len(dataloader)

# モデルの評価
val_loss = evaluate(transformer, valid_loader, device)
print(f"Validation Loss: {val_loss:.4f}")

Epoch [1/1], Step [100/60207], Loss: 2.4079
Epoch [1/1], Step [200/60207], Loss: 2.3676
Epoch [1/1], Step [300/60207], Loss: 2.3404
Epoch [1/1], Step [400/60207], Loss: 2.2909
Epoch [1/1], Step [500/60207], Loss: 2.2473
Epoch [1/1], Step [600/60207], Loss: 2.1701
Epoch [1/1], Step [700/60207], Loss: 2.1169
Epoch [1/1], Step [800/60207], Loss: 2.0700
Epoch [1/1], Step [900/60207], Loss: 2.0246
Epoch [1/1], Step [1000/60207], Loss: 2.0154
Epoch [1/1], Step [1100/60207], Loss: 1.9968
Epoch [1/1], Step [1200/60207], Loss: 1.9710
Epoch [1/1], Step [1300/60207], Loss: 1.9761
Epoch [1/1], Step [1400/60207], Loss: 1.9243
Epoch [1/1], Step [1500/60207], Loss: 1.9027
Epoch [1/1], Step [1600/60207], Loss: 1.8813
Epoch [1/1], Step [1700/60207], Loss: 1.8733
Epoch [1/1], Step [1800/60207], Loss: 1.8457
Epoch [1/1], Step [1900/60207], Loss: 1.8372
Epoch [1/1], Step [2000/60207], Loss: 1.8134
Epoch [1/1], Step [2100/60207], Loss: 1.7872
Epoch [1/1], Step [2200/60207], Loss: 1.7567
Epoch [1/1], Step [

In [28]:
save_data = {
    "model_state_dict": transformer.state_dict(),
    "config": config
}

torch.save(save_data, "transformer_with_config.pth")

print(transformer.state_dict().keys())

odict_keys(['transformer.wte.weight', 'transformer.wpe.weight', 'transformer.h.0.ln_1.weight', 'transformer.h.0.ln_1.bias', 'transformer.h.0.attn.bias', 'transformer.h.0.attn.c_attn.weight', 'transformer.h.0.attn.c_attn.bias', 'transformer.h.0.attn.c_proj.weight', 'transformer.h.0.attn.c_proj.bias', 'transformer.h.0.ln_2.weight', 'transformer.h.0.ln_2.bias', 'transformer.h.0.mlp.0.weight', 'transformer.h.0.mlp.0.bias', 'transformer.h.0.mlp.3.weight', 'transformer.h.0.mlp.3.bias', 'transformer.h.1.ln_1.weight', 'transformer.h.1.ln_1.bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.c_attn.weight', 'transformer.h.1.attn.c_attn.bias', 'transformer.h.1.attn.c_proj.weight', 'transformer.h.1.attn.c_proj.bias', 'transformer.h.1.ln_2.weight', 'transformer.h.1.ln_2.bias', 'transformer.h.1.mlp.0.weight', 'transformer.h.1.mlp.0.bias', 'transformer.h.1.mlp.3.weight', 'transformer.h.1.mlp.3.bias', 'transformer.h.2.ln_1.weight', 'transformer.h.2.ln_1.bias', 'transformer.h.2.attn.bias', 'tran

In [None]:
# モデルの読み込み
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from GPT2 import Transformer, ModelConfig
from Tokenizer import Tokenizer

class TextDataset(Dataset):
    def __init__(self, text, tokenizer, block_size):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.tokens = self.tokenizer.encode(text, eot=True)

    def __len__(self):
        return len(self.tokens) - self.block_size

    def __getitem__(self, idx):
        return torch.tensor(self.tokens[idx:idx+self.block_size]), torch.tensor(self.tokens[idx+1:idx+self.block_size+1])

# 相対パスを指定してテキストファイルを読み込む
file_path1 = "../data/mini_ptb.train.txt"
file_path2 = "../data/mini_ptb.valid.txt"
file_path3 = "../data/mini_ptb.test.txt"

# ファイルを開いて内容を読み込む
with open(file_path1, "r", encoding="utf-8") as file:
    mini_ptb_train_text = file.read()

with open(file_path2, "r", encoding="utf-8") as file:
    mini_ptb_valid_text = file.read()

with open(file_path3, "r", encoding="utf-8") as file:
    mini_ptb_test_text = file.read()

unique_chars_in_train_text = sorted(list(set(mini_ptb_train_text)))


device = 'cuda' if torch.cuda.is_available() else 'cpu'
load_data = torch.load("transformer_with_config.pth")
config = load_data["config"]
transformer = Transformer(config)
transformer.load_state_dict(load_data["model_state_dict"])

block_size = 128

tokenizer = Tokenizer(unique_chars_in_train_text)
# データセットの作成
train_dataset = TextDataset(mini_ptb_train_text, tokenizer, block_size)
valid_dataset = TextDataset(mini_ptb_valid_text, tokenizer, block_size)
test_dataset = TextDataset(mini_ptb_test_text, tokenizer, block_size)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)


  return torch._C._cuda_getDeviceCount() > 0
  load_data = torch.load("transformer_with_config.pth")


トランスフォーマーの総パラメータ数: 85.62M


In [4]:
# 必要なトークンをリストに収集
context = 'Th'
context_token_indexes = torch.tensor(tokenizer.encode(context)).unsqueeze(0).to(device)
generated_tokens = transformer.generate(context_token_indexes, max_new_tokens=1, temperature=0.2, top_k=40)
generated_text = []

for token in generated_tokens[0]:
    generated_text.append(tokenizer.decode([token.item()]))

# 横向きに表示
print(' '.join(repr(token) for token in generated_text))

#縦向きに表示
#for token in generated_tokens[0]:
#    print(repr(tokenizer.decode([token.item()])))

'T' 'h' 'e'


In [10]:
#Perplexityの計算
import math

def calculate_perplexity(model, dataloader, device):
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for input_ids, target_ids in dataloader:
            input_ids, target_ids = input_ids.to(device), target_ids.to(device)
            _, loss = model(input_ids, targets=target_ids)
            total_loss += loss.item()* input_ids.size(1)
            total_tokens += input_ids.size(1)

    perplexity = math.exp(total_loss / total_tokens)
    return perplexity

In [11]:
# 例: Perplexityの計算
perplexity = calculate_perplexity(transformer, train_loader, device)
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 1.22


In [2]:
#文法的誤り訂正タスク
def correct_sentence(model, tokenizer, sentence, max_new_tokens=50):
    """
    文法的誤りを含む文を訂正し、正しい文を生成する関数。

    Args:
        model: 学習済みの言語モデル（Transformerモデル）。
        tokenizer: トークナイザー（入力文をトークン化し、出力文をデコード）。
        sentence (str): 文法的誤りを含む入力文。
        max_new_tokens (int): 生成する最大トークン数。

    Returns:
        corrected_sentence (str): 訂正された文法的に正しい文。
    """
    model.eval()  # モデルを推論モードに設定
    # 入力文をトークン化し、テンソルに変換
    token_ids = torch.tensor(tokenizer.encode(sentence, eot=True), dtype=torch.long).unsqueeze(0).to(device)

    # モデルによるテキスト生成
    with torch.no_grad():  # 勾配計算を無効化し、推論を高速化
        output_ids = model.generate(token_ids, max_new_tokens=max_new_tokens)

    # 生成されたトークンをデコードして文字列に戻す
    corrected_sentence = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return corrected_sentence

In [4]:
input_sentence = "I am go to school"

corrected_sentence = correct_sentence(transformer, tokenizer, input_sentence)
print(f"入力文: {input_sentence}")
print(f"訂正後の文: {corrected_sentence}")

入力文: I am go to school
訂正後の文: <unk_74><unk_258><unk_275><unk_287><unk_258><unk_281><unk_289><unk_258><unk_294><unk_289><unk_258><unk_293><unk_277><unk_282><unk_289><unk_289><unk_286><unk_0><unk_258><unk_275><unk_288><unk_278><unk_258><unk_294><unk_279><unk_286><unk_279><unk_277><unk_294><unk_283><unk_289><unk_288><unk_258><unk_289><unk_280><unk_258><unk_294><unk_282><unk_279><unk_258><unk_293><unk_294><unk_295><unk_278><unk_279><unk_288><unk_294><unk_293><unk_258><unk_293><unk_282><unk_279><unk_279><unk_294><unk_293><unk_258><unk_275><unk_288><unk_278><unk_258><unk_281><unk_275><unk_296><unk_279><unk_258><unk_294><unk_282><unk_279>


In [7]:
# 生成されたトークンIDのリストを取得
generated_ids = tokenizer.encode(corrected_sentence, eot=True)

# トークンIDの範囲確認
vocab_size = len(tokenizer.idx_to_str)
print("Vocabulary size:", vocab_size)

for token_id in generated_ids:
    if token_id >= vocab_size:
        print(f"Out-of-range token ID detected: {token_id}")


Vocabulary size: 301


In [8]:
# クローズテスト
def cloze_test(model, tokenizer, masked_sentence, mask_token="<mask>", max_length=50):
    """
    マスクされた部分を予測するタスク。
    """
    model.eval()
    input_ids = tokenizer.encode(masked_sentence, return_tensors="pt").to(device)
    input_ids = input_ids[:, :-1]  # マスクされたトークンまでの入力

    with torch.no_grad():
        output_ids = model.generate(input_ids, max_new_tokens=max_length)

    completed_sentence = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return completed_sentence

# 例: クローズテスト
masked_sentence = "The cat sat on the <mask>."
completed_sentence = cloze_test(transformer, tokenizer, masked_sentence)
print("Masked:", masked_sentence)
print("Completed:", completed_sentence)


IndexError: too many indices for tensor of dimension 1

In [9]:
# 文法誤り検出
def detect_grammar_errors(model, tokenizer, sentences, threshold=5.0):
    results = []
    for sentence in sentences:
        token_ids = tokenizer.encode(sentence, return_tensors="pt").to(device)
        loss = model.compute_loss_for_sentence(token_ids[0].tolist(), tokenizer)
        is_correct = loss < threshold
        results.append((sentence, loss, is_correct))
    return results

# 例: 文法誤り検出
sentences = [
    "This is a correct sentence.",
    "She like apples.",
    "I am go to school."
]

results = detect_grammar_errors(transformer, tokenizer, sentences)
for sentence, loss, is_correct in results:
    print(f"Sentence: '{sentence}', Loss: {loss:.2f}, Correct: {is_correct}")


AttributeError: 'Transformer' object has no attribute 'device'

In [None]:
#生成文法評価