In [None]:
# Install the packages this notebook imports into the kernel's environment.
# NOTE(review): no versions are pinned, so a fresh install may pull releases that differ
# from the ones this notebook was written against — consider pinning.
%pip install torch transformers
# NOTE(review): ipywidgets is not imported anywhere in the visible cells; presumably it is
# needed for widget-based output (e.g. notebook_login) — confirm.
%pip install ipywidgets
%pip install -U "huggingface_hub[cli]"

In [None]:
import os
import re

import torch
import torch.nn as nn
from huggingface_hub import notebook_login
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AdamW
from transformers import AutoModelForCausalLM

In [None]:
# Tunable settings for the fine-tuning run; every later cell reads these names.
MODEL_NAME = "distilgpt2"   # checkpoint name handed to from_pretrained
LEARNING_RATE = 1e-4        # optimizer step size
EPOCHS = 20                 # default number of passes over the dataset in train_model
BATCH_SIZE = 6              # examples per DataLoader batch
MAX_LEN = 50                # default token length for padding/truncation and for generation
FILE_NAME = 'model'         # stem of the saved weights file
SEED = 42                   # seed for torch's global random number generator

# Seed once, up front, so DataLoader shuffling and sampled generation are repeatable under
# Restart Kernel -> Run All. The trailing ';' hides the returned Generator's repr.
torch.manual_seed(SEED);

In [None]:
# Load the tokenizer that matches MODEL_NAME. No access token is passed from the notebook
# source: an empty-string literal is not a usable credential, and a real one must never be
# pasted here. If a private checkpoint is ever used, authenticate with notebook_login()
# or an environment variable instead.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token for padding so that padding="max_length" in TextDataset
# has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for causal-LM fine-tuning.

    Args:
        texts: indexable, sized collection of strings.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")``; its result must
            expose "input_ids" and "attention_mask" tensors with a leading batch axis of 1.
        max_len: number of tokens every example is padded/truncated to.

    Each item is a dict with:
        "input_ids":      1-D tensor of token ids.
        "attention_mask": 1-D tensor, 1 for real tokens and 0 for padding.
        "labels":         copy of ``input_ids`` in which padding is replaced by -100 (the
                          index ignored by the cross-entropy loss), except for the first
                          padding position of a right-padded example.
    """

    def __init__(self, texts, tokenizer, max_len=MAX_LEN):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # squeeze(0) removes only the batch axis; a bare squeeze() would also collapse the
        # sequence axis when max_len == 1.
        input_ids = encodings["input_ids"].squeeze(0)
        attention_mask = encodings["attention_mask"].squeeze(0)

        # Labels get their own storage so later edits cannot leak into input_ids.
        labels = input_ids.clone()
        num_real = int(attention_mask.sum())
        # Only mask when the example holds both real tokens and padding. An example that is
        # entirely padding keeps its labels as-is, so a batch made only of such examples
        # still yields a finite loss instead of NaN.
        if 0 < num_real < labels.numel():
            labels[attention_mask == 0] = -100
            if attention_mask[-1] == 0:
                # Right-padded: keep the first padding token as a target so the model is
                # still trained on what follows the last real token (the EOS token when
                # pad_token is EOS), without the rest of the padding diluting the loss.
                labels[num_real] = input_ids[num_real]

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [None]:
# Read both corpora, closing each file handle as soon as it has been read.
# NOTE(review): the files are assumed to be UTF-8 encoded — confirm; without an explicit
# encoding the result depends on the platform's locale.
corpus_parts = []
for corpus_path in ('data/goida.txt', 'data/doc77.txt'):
    with open(corpus_path, encoding='utf-8') as corpus_file:
        corpus_parts.append(corpus_file.read())

# Join with a newline so the last word of one file is not glued to the first word of the next.
data = '\n'.join(corpus_parts)

# Split on sentence-ending punctuation, strip each fragment, and only then drop the empty
# ones, so whitespace-only fragments do not survive as empty training texts.
texts = [sentence for sentence in (part.strip() for part in re.split(r'[.!?]', data)) if sentence]

In [None]:
# Wrap the sentence list in the tokenizing dataset, then serve it in shuffled batches.
dataset = TextDataset(texts=texts, tokenizer=tokenizer)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=BATCH_SIZE)

In [None]:
class SimpleTransformerModel(nn.Module):
    """Thin ``nn.Module`` wrapper around a pretrained causal language model.

    ``forward`` delegates to the wrapped model and returns a ``(loss, logits)`` tuple, where
    ``loss`` is ``None`` whenever no labels are supplied.

    NOTE(review): the visible cells build ``model`` directly from AutoModelForCausalLM and
    never instantiate this class — confirm whether it is still needed.
    """

    def __init__(self, model_name):
        super().__init__()
        self.transformer = AutoModelForCausalLM.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, labels=None):
        result = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        # The loss is only read from the output when labels were actually provided.
        if labels is None:
            return None, result.logits
        return result.loss, result.logits

In [None]:
# Pretrained causal LM to fine-tune.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Use PyTorch's own AdamW: the AdamW class exported by transformers is deprecated in favour
# of torch.optim.AdamW. eps and weight_decay are set explicitly to the values the
# transformers implementation used by default, so the optimization behaviour is unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)

In [None]:
# Use the GPU when CUDA is available to torch, otherwise stay on the CPU, and move the
# model's parameters there.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
def train_model(model, dataloader, optimizer, epochs=EPOCHS):
    """Fine-tune ``model`` in place, showing a single progress bar over all steps.

    Args:
        model: module called as ``model(input_ids, attention_mask=..., labels=...)`` whose
            result exposes a ``.loss`` tensor.
        dataloader: sized iterable of dict batches holding "input_ids", "attention_mask"
            and "labels" tensors.
        optimizer: optimizer that updates ``model``'s parameters.
        epochs: number of full passes over ``dataloader``.

    Returns:
        None. The model's parameters are updated in place and the model is left in
        training mode.
    """
    model.train()
    # Take the device from the model itself rather than a notebook-global variable, so the
    # batches always land where the parameters are.
    device = next(model.parameters()).device
    total_steps = len(dataloader) * epochs

    # The context manager closes the bar even if a training step raises.
    with tqdm(total=total_steps, desc="Training Progress") as progress_bar:
        for epoch in range(1, epochs + 1):
            for batch in dataloader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                optimizer.zero_grad()
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

                loss.backward()
                optimizer.step()

                progress_bar.update(1)
                # The current epoch and latest batch loss are shown next to the bar.
                progress_bar.set_postfix(loss=loss.item(), epoch=epoch)

In [None]:
# Run fine-tuning with the default number of epochs.
train_model(model=model, dataloader=dataloader, optimizer=optimizer)

In [None]:
def generate_text(model, tokenizer, prompt, max_len=MAX_LEN, temperature=0.7, top_k=50, top_p=0.9):
    """Sample one continuation of ``prompt`` from ``model`` and return it as a string.

    Args:
        model: model exposing ``eval()``, ``parameters()`` and ``generate(...)``.
        tokenizer: tokenizer used to encode ``prompt`` and decode the result; its
            ``pad_token_id`` is forwarded to ``generate``.
        prompt: text to continue.
        max_len: passed to ``generate`` as ``max_length``.
        temperature, top_k, top_p: sampling parameters forwarded to ``generate``.

    Returns:
        The decoded first returned sequence, with special tokens skipped.
    """
    model.eval()
    # Inputs must live on the same device as the model's weights; otherwise generation
    # fails as soon as the model has been moved to the GPU.
    device = next(model.parameters()).device

    encoded = tokenizer(prompt, return_tensors="pt", padding=True)
    input_ids = encoded["input_ids"].to(device)
    # Use the tokenizer's own mask. Rebuilding it from pad_token_id would also hide any
    # genuine EOS token in the prompt when the pad token is set to EOS.
    attention_mask = encoded["attention_mask"].to(device)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_len,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        num_return_sequences=1,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        # NOTE(review): caching is switched off exactly as in the original — confirm that
        # this is intentional.
        use_cache=False
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text


In [None]:
# Print five sampled continuations of the same prompt.
prompt = "Итак, "
for _ in range(5):
    sample = generate_text(model, tokenizer, prompt, temperature=0.1, max_len=1000)
    print(sample)

In [None]:
# Save only the fine-tuned weights (the state_dict). The output directory is created first
# so that a missing `models/` folder cannot throw away the result of the training run.
os.makedirs('models', exist_ok=True)
torch.save(model.state_dict(), f'models/{FILE_NAME}.pth')