In [None]:
# 🤖 AllanGPTv1 — Fine-tuning GPT-2 on Russian text in Google Colab

# 📦 Install the required libraries
!pip install -q transformers datasets accelerate

# 🔗 Mount Google Drive
# The drive is mounted at /content/drive; the save path used at the end of
# this script ("/content/drive/MyDrive/AllanGPTv1") lives under this mount.
from google.colab import drive
drive.mount('/content/drive')

# 📚 Imports
from transformers import TFGPT2LMHeadModel, GPT2TokenizerFast, DataCollatorForLanguageModeling
from datasets import load_dataset
import tensorflow as tf
import os

# 📥 Load the pretrained tokenizer and model published by Sber
model_name = 'sberbank-ai/rugpt3small_based_on_gpt2'
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
# NOTE(review): if this checkpoint ships only PyTorch weights, the TF loader
# needs from_pt=True — confirm against the model repository.
model = TFGPT2LMHeadModel.from_pretrained(model_name)

# 📂 Load the OPUS-100 English–Russian training split.
# OPUS-100 language pairs are selected by config name, with the two language
# codes in alphabetical order ("en-ru"), not by lang1/lang2 keyword arguments.
# Each row holds a 'translation' dict; only its 'ru' side is used below.
dataset = load_dataset("opus100", "en-ru", split="train")

# 🔧 Preprocessing
def preprocess(example):
    """Tokenize the Russian side of one example or of a batch of examples.

    Works for both ``Dataset.map(..., batched=False)``, where
    ``example['translation']`` is a single ``{'en': ..., 'ru': ...}`` dict,
    and ``batched=True``, where it is a list of such dicts.

    Args:
        example: A dataset row (or batch of rows) with a 'translation' field.

    Returns:
        The tokenizer output, truncated and padded to exactly 128 tokens.
    """
    translations = example['translation']
    if isinstance(translations, dict):
        # Single example: one translation pair.
        texts = translations['ru']
    else:
        # Batched call: a list of translation pairs, one per row.
        texts = [pair['ru'] for pair in translations]
    return tokenizer(texts, truncation=True, padding='max_length', max_length=128)

# Tokenize the whole corpus in batches and drop the raw columns so that only
# the tokenizer outputs remain.
tokenized = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)

# 📊 Data setup
# mlm=False selects causal language modeling. return_tensors="np" makes the
# collator emit NumPy arrays, which is what the tf.data pipeline expects
# (the collator's default is PyTorch tensors).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors="np",
)

# "labels" is produced by the collator rather than stored in the dataset; it
# must be listed in `columns`, otherwise it is dropped and training has no
# targets to compute a loss against.
train_dataset = tokenized.to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    batch_size=4,
    collate_fn=data_collator,
)

# 🧠 Model training
# Hyper-parameters are named up front so they are easy to find and tweak.
LEARNING_RATE = 5e-5
NUM_EPOCHS = 1

# compile() is called with an optimizer only; no explicit loss is supplied.
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer)
model.fit(train_dataset, epochs=NUM_EPOCHS)

# 💾 Persist the model and its tokenizer to Google Drive
# Both are written into the same directory, model first, then tokenizer.
save_path = "/content/drive/MyDrive/AllanGPTv1"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)