# 🤖 ChaosAI: 奇妙な英語文を生成する小型AIモデル

このColabノートブックでは、TinyGPT-2またはGPT-2 small（以下のコードではGPT-2 smallにあたる `gpt2` を読み込みます）を使用して、
奇妙で意味不明な英語の文を生成するAIをLoRAでファインチューニングします。

In [None]:
# ✅ Install the required libraries
# NOTE(review): bitsandbytes is installed here but is not referenced directly
# by any of the cells visible in this notebook — confirm it is still needed.
!pip install transformers datasets peft accelerate bitsandbytes

In [None]:
# ✅ Load the pretrained model and its tokenizer
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

# Checkpoint id handed to both from_pretrained calls below.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" is forwarded to from_pretrained to pick device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
)

In [None]:
# ✅ Prepare the training data (a handful of odd English sentences written inline)
from datasets import Dataset

# Each record is a dict holding one sentence under the 'text' key.
train_data = [
    {'text': sentence}
    for sentence in (
        "The broccoli rebelled against the moon's gravity.",
        "I drank the alphabet and coughed out philosophy.",
        "Yesterday, a spoon declared itself president of clouds.",
        "The sidewalk laughed when I stepped on an idea.",
        "Time forgot to tick, so I slept through reality.",
    )
]

# Wrap the list of records in a datasets.Dataset for later mapping/training.
dataset = Dataset.from_list(train_data)

In [None]:
# ✅ LoRA setup and training configuration
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# LoRA configuration.
# GPT-2 stores its attention projection weights in Conv1D layers laid out as
# (fan_in, fan_out), so fan_in_fan_out is set explicitly instead of relying on
# PEFT to detect the layer type and correct the setting with a warning.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    fan_in_fan_out=True,
)

# Wrap the base model so that only the LoRA adapter weights are trained.
model = get_peft_model(model, lora_config)

# Tokenize
# The GPT-2 tokenizer ships without a padding token, so asking for
# padding='max_length' would raise. Reuse the end-of-sequence token as padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize(example):
    """Tokenize one dataset record.

    Args:
        example: A record with a 'text' field holding the sentence.

    Returns:
        The tokenizer output for that sentence, truncated and padded to
        exactly 64 tokens.
    """
    return tokenizer(example['text'], truncation=True, padding='max_length', max_length=64)


# Drop the raw 'text' column so only the tokenizer outputs remain in the
# dataset that is handed to the trainer.
tokenized_dataset = dataset.map(tokenize, remove_columns=['text'])

# Training arguments
import torch

training_args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=1,
    save_steps=10,
    save_total_limit=1,
    # fp16 mixed precision requires a CUDA device; fall back to full
    # precision when the runtime has no GPU instead of failing at start-up.
    fp16=torch.cuda.is_available(),
)

# Trainer
import inspect

# Causal-LM collator (mlm=False): no masked-language-model objective is used.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset,
    "data_collator": data_collator,
}
# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword. Since the install cell does not pin a
# version, pick whichever keyword the installed Trainer accepts.
if "processing_class" in inspect.signature(Trainer.__init__).parameters:
    trainer_kwargs["processing_class"] = tokenizer
else:
    trainer_kwargs["tokenizer"] = tokenizer

trainer = Trainer(**trainer_kwargs)

trainer.train()

In [None]:
# ✅ Test generation
input_text = "The chair decided"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Training leaves the model in train mode; switch to eval mode so dropout
# (including the LoRA dropout) is disabled while generating.
model.eval()

outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    # GPT-2 defines no pad token; pass EOS explicitly so generate() does not
    # have to fall back to it with a warning.
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))