In [1]:
pip install torch transformers datasets




In [2]:
from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")

result = generator("Жил-был кот по имени Василий", max_length=40, num_return_sequences=1)

print(result[0]['generated_text'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=40) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Жил-был кот по имени Василий на чтал на простий изый изабыть и изанаровичных и изил и изавал и изаваденно.

Мостий на тавловай постий и изый изабыть и изанаровичных и изаваденно.

Тодия зекваденно в береденно и изадание и изаваденно.

Местори спростич ук


In [3]:
import torch

text = "привет мир привет человек привет мир человек человек"

vocab = list(set(text.split()))
word_to_idx = {w: i for i, w in enumerate(vocab)}
idx_to_word = {i: w for w, i in word_to_idx.items()}

encoded = [word_to_idx[w] for w in text.split()]
print("Закодированный текст:", encoded)


Закодированный текст: [1, 0, 1, 2, 1, 0, 2, 2]


In [4]:
import torch.nn as nn

vocab_size = len(vocab)
embed_dim = 16

embedding = nn.Embedding(vocab_size, embed_dim)
x = torch.tensor(encoded)
emb = embedding(x)
print("Форма эмбеддингов:", emb.shape)

Форма эмбеддингов: torch.Size([8, 16])


In [5]:
class SelfAttention(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / (embed_dim ** 0.5)
        weights = self.softmax(scores)
        out = torch.matmul(weights, V)
        return out


In [6]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super().__init__()
        self.attention = SelfAttention(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim)
        )
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        att = self.attention(x)
        x = self.ln1(x + att)  # residual
        ff = self.ff(x)
        x = self.ln2(x + ff)   # residual
        return x


In [7]:
class MiniGPT(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.blocks = nn.ModuleList([
            TransformerBlock(embed_dim, hidden_dim) for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        for block in self.blocks:
            x = block(x)
        logits = self.fc_out(x)
        return logits


In [8]:
import torch.optim as optim

embed_dim = 32
hidden_dim = 64
num_layers = 2
epochs = 200

model = MiniGPT(vocab_size, embed_dim, hidden_dim, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

inputs = []
targets = []
for i in range(len(encoded)-1):
    inputs.append(encoded[i])
    targets.append(encoded[i+1])

inputs = torch.tensor(inputs)
targets = torch.tensor(targets)

for epoch in range(epochs):
    optimizer.zero_grad()
    logits = model(inputs.unsqueeze(0))
    loss = criterion(logits.squeeze(0), targets)
    loss.backward()
    optimizer.step()

    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")


Epoch 0, Loss: 1.2746808528900146
Epoch 20, Loss: 0.672511875629425
Epoch 40, Loss: 0.6704326868057251
Epoch 60, Loss: 0.6697840690612793
Epoch 80, Loss: 0.6694726347923279
Epoch 100, Loss: 0.6693136096000671
Epoch 120, Loss: 0.6692155599594116
Epoch 140, Loss: 0.6691487431526184
Epoch 160, Loss: 0.6708196997642517
Epoch 180, Loss: 0.6692512631416321


In [10]:
def generate(model, start_word, length=5):
    model.eval()
    idx = word_to_idx[start_word]
    result = [idx]

    for _ in range(length):
        inp = torch.tensor(result).unsqueeze(0)
        logits = model(inp)
        next_idx = torch.argmax(logits[0, -1]).item()
        result.append(next_idx)

    return " ".join([idx_to_word[i] for i in result])

print(generate(model, "привет", length=10))


привет человек привет мир привет мир человек человек привет мир человек
