In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from dataclasses import dataclass
import numpy as np
from tqdm.auto import tqdm
from contextlib import nullcontext
import os
import json
from sklearn.model_selection import train_test_split

In [29]:

class LayerNorm(nn.Module):
    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
    def forward(self, x):
        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.flash = hasattr(F, 'scaled_dot_product_attention')
        if not self.flash:
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                       .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        if self.flash:
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.attn_dropout.p if self.training else 0.0, is_causal=True)
        else:
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)
    def forward(self, x):
        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1 = LayerNorm(config.n_embd, config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln2 = LayerNorm(config.n_embd, config.bias)
        self.mlp = MLP(config)
    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

@dataclass
class GPTConfig:
    block_size: int
    vocab_size: int
    n_layer: int
    n_head: int
    n_embd: int
    dropout: float = 0.0
    bias: bool = True

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            drop=nn.Dropout(config.dropout),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=LayerNorm(config.n_embd, config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight  # weight tying

        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
            return logits, loss
        else:
            logits = self.lm_head(x[:, [-1], :])
            return logits, None

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Generate tokens given a conditioning sequence.
        idx: Tensor of shape (B, T)
        """
        for _ in range(max_new_tokens):
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


In [30]:
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


True
Tesla T4


In [31]:
config = GPTConfig(
    vocab_size=50257,     # use the tokenizer's vocab size
    block_size=128,       # or whatever context size you're training with
    n_layer=6,
    n_head=6,
    n_embd=384,
    dropout=0.1,
    bias=True
)

model = GPT(config)

In [44]:
model.load_state_dict(torch.load("best_model_params_20000.pt"))
model.to("cuda")
model.train()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 384)
    (wpe): Embedding(128, 384)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=384, out_features=1152, bias=True)
          (c_proj): Linear(in_features=384, out_features=384, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=384, out_features=1536, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1536, out_features=384, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=384, out_features=50257, bias=False)
)

In [45]:
with open("fridge_dataset_v1.3_clean.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [46]:
# Можна спростити до input-output пари
pairs = [(item["instruction"], item["response"]) for item in data]
print(pairs[0])

('Where can consumers find detailed replacement instructions for lamps and control gear?', 'For detailed instructions on replacing lamps and control gear, consumers should visit the Samsung website and navigate to the "Support" section. By entering the model name, users can access specific guidance. Professional support is recommended as these components are not user-serviceable.')


In [47]:
import tiktoken
enc = tiktoken.get_encoding("gpt2")

block_size = config.block_size  # =128

def encode_pair(instruction, response):
    prompt = f"question: {instruction}\nanswer: {response}"
    tokens = enc.encode_ordinary(prompt)
    tokens = tokens[:block_size]  # обрізати або заповнити до block_size
    x = torch.tensor(tokens[:-1], dtype=torch.long)
    y = torch.tensor(tokens[1:], dtype=torch.long)
    return x, y

In [48]:
train_pairs, val_pairs = train_test_split(pairs, test_size=0.15, random_state=42)


In [49]:
from torch.utils.data import Dataset, DataLoader

class InstructionDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.samples = [encode_pair(instr, resp) for instr, resp in data]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [50]:
def collate_fn(batch):
    # batch — список із (x, y) пар
    # Розпаковуємо
    xs, ys = zip(*batch)

    # Паддінг input і output (можна паддити токеном 0, або іншим padding_id)
    xs_padded = torch.nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=0)
    ys_padded = torch.nn.utils.rnn.pad_sequence(ys, batch_first=True, padding_value=0)

    return xs_padded, ys_padded


In [51]:
train_dataset = InstructionDataset(train_pairs)
val_dataset = InstructionDataset(val_pairs)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)


In [52]:
def evaluate(model, val_loader):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to("cuda"), y.to("cuda")
            logits, loss = model(x, y)
            total_loss += loss.item()
    model.train()
    return total_loss / len(val_loader)


In [65]:
from torch.nn import functional as F
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
model.train()

for epoch in range(30):  # наприклад, 3 епохи
    total_loss = 0.0
    for x, y in train_loader:
        x, y = x.to("cuda"), y.to("cuda")
        logits, loss = model(x, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss = evaluate(model, val_loader)

    print(f"Epoch {epoch+21}: train loss {avg_train_loss:.4f}, val loss {avg_val_loss:.4f}")


Epoch 21: train loss 0.3228, val loss 0.2756
Epoch 22: train loss 0.2972, val loss 0.2530
Epoch 23: train loss 0.2712, val loss 0.2379
Epoch 24: train loss 0.2599, val loss 0.2219
Epoch 25: train loss 0.2361, val loss 0.2116
Epoch 26: train loss 0.2167, val loss 0.2013
Epoch 27: train loss 0.2028, val loss 0.1899
Epoch 28: train loss 0.1973, val loss 0.1833
Epoch 29: train loss 0.1910, val loss 0.1778
Epoch 30: train loss 0.1793, val loss 0.1720
Epoch 31: train loss 0.1713, val loss 0.1672
Epoch 32: train loss 0.1719, val loss 0.1664
Epoch 33: train loss 0.1611, val loss 0.1604
Epoch 34: train loss 0.1536, val loss 0.1599
Epoch 35: train loss 0.1441, val loss 0.1574
Epoch 36: train loss 0.1417, val loss 0.1558
Epoch 37: train loss 0.1473, val loss 0.1580
Epoch 38: train loss 0.1584, val loss 0.1558
Epoch 39: train loss 0.1426, val loss 0.1539
Epoch 40: train loss 0.1350, val loss 0.1530
Epoch 41: train loss 0.1299, val loss 0.1485
Epoch 42: train loss 0.1280, val loss 0.1509
Epoch 43: 

In [70]:
torch.save(model.state_dict(), "chatgpt_1.3_gpt_50ep.pt")


In [16]:
def generate_response(instruction, max_new_tokens=64, temperature=0.8, top_k=40):
    prompt = f"question: {instruction}\nanswer:"
    input_ids = enc.encode_ordinary(prompt)
    input_ids = input_ids[:config.block_size]
    input_tensor = torch.tensor(input_ids, dtype=torch.long)[None].to("cuda")

    model.eval()
    with torch.no_grad():
        for _ in range(max_new_tokens):
            if input_tensor.shape[1] > config.block_size:
                input_tensor = input_tensor[:, -config.block_size:]

            logits, _ = model(input_tensor)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                values, _ = torch.topk(logits, top_k)
                logits[logits < values[:, [-1]]] = -float("inf")

            probs = torch.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            input_tensor = torch.cat([input_tensor, next_token], dim=1)

    output_tokens = input_tensor[0].tolist()
    generated = enc.decode(output_tokens[len(input_ids):])
    return generated.strip()


In [67]:
for instruction, reference in val_pairs[:5]:
    generated = generate_response(instruction)

    print("Instruction:", instruction)
    print("Expected:", reference)
    print("Generated:", generated)
    print("-" * 50)

Instruction: What is the proper way to store cabbage, cauliflower, celery, cucumbers, and lettuce?
Expected: These vegetables should be stored in the refrigerator for up to 1 week but are not recommended for freezing.
Generated: Store these vegetables in the fridge for no more than a week; freezing is not recommended. Ignoring this can lead to frozen surfaces with no more than a temperature. Following these vegetables helps preserve its texture and prevents mold growth. Taking these vegetables will affect food quality. Once finished, and bought the freezer to normal temperature.
--------------------------------------------------
Instruction: How are the temperature ranges defined for the extended temperate climate class different in IEC compared to ISO standards?
Expected: Both IEC and ISO standards agree on the temperature limits for the extended temperate climate class (SN), establishing a range from +10 to +32°C.
Generated: For the extended temperate climate class (SN), both IEC and

In [None]:
import json
from tqdm import tqdm

results = []

# Проходимо по всім прикладам із валідаційного набору
for instruction, reference in tqdm(val_pairs):
    generated = generate_response(instruction)

    results.append({
        "instruction": instruction,
        "expected": reference,
        "generated": generated
    })

# Запис у JSON-файл
with open("gpt_val_pred.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print("✅ Збережено в gpt_train_pred.json")


100%|██████████| 561/561 [01:45<00:00,  5.32it/s]

✅ Збережено в gpt_train_pred.json





In [68]:
print(generate_response("How can I make a salad?"))
print(generate_response("What is the capital of France?"))
print(generate_response("Explain the theory of relativity in simple terms."))
print(generate_response("What are the benefits of regular exercise?"))
print(generate_response("What is the process of photosynthesis?"))

I apologize, but I am a refrigerator assistant and cannot help with cooking recipes. This practice ensures the food quality by suppressing bacterial activity. Go to the Support section, enter a plate filled with a plate inside this model name of professional help. However, using a multi-socket plate offers direct food. Because of food will
I apologize, but I am a refrigerator assistant and cannot help with web development or concepts. This preparation help maintain the best advice and safer operation. Having these items are completely intact uses the internet and might cause electrical risks. For more effectively, it is recommended to rely on professional technicians. Always adjust the temperature settings carefully to
I apologize, but I am a refrigerator assistant and cannot help with geographical information. This analysis can lead to regulatory issues or operational problems, making it crucial for users. It's crucial to follow the instructions carefully to avoid such issues and ensu

In [20]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [69]:
from bert_score import score
from tqdm import tqdm

test_samples = val_pairs

references = [ex[1] for ex in test_samples]  # response
candidates = [generate_response(ex[0]) for ex in tqdm(test_samples)]  # instruction


P, R, F1 = score(
    candidates,
    references,
    lang="en",
    model_type="bert-base-uncased",
    device="cuda",
    batch_size=32
)

print(f"Precision: {P.mean().item():.4f}")
print(f"Recall:    {R.mean().item():.4f}")
print(f"F1 Score:  {F1.mean().item():.4f}")


100%|██████████| 500/500 [01:56<00:00,  4.30it/s]


Precision: 0.7044
Recall:    0.7929
F1 Score:  0.7434
