In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from dataclasses import dataclass
import numpy as np
from tqdm.auto import tqdm
from contextlib import nullcontext
import os
import json
from sklearn.model_selection import train_test_split

In [2]:

class LayerNorm(nn.Module):
    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
    def forward(self, x):
        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.flash = hasattr(F, 'scaled_dot_product_attention')
        if not self.flash:
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                       .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        if self.flash:
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.attn_dropout.p if self.training else 0.0, is_causal=True)
        else:
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)
    def forward(self, x):
        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1 = LayerNorm(config.n_embd, config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln2 = LayerNorm(config.n_embd, config.bias)
        self.mlp = MLP(config)
    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

@dataclass
class GPTConfig:
    block_size: int
    vocab_size: int
    n_layer: int
    n_head: int
    n_embd: int
    dropout: float = 0.0
    bias: bool = True

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            drop=nn.Dropout(config.dropout),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=LayerNorm(config.n_embd, config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight  # weight tying

        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
            return logits, loss
        else:
            logits = self.lm_head(x[:, [-1], :])
            return logits, None

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Generate tokens given a conditioning sequence.
        idx: Tensor of shape (B, T)
        """
        for _ in range(max_new_tokens):
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


In [3]:
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


True
Tesla T4


In [4]:
config = GPTConfig(
    vocab_size=50257,     # use the tokenizer's vocab size
    block_size=128,       # or whatever context size you're training with
    n_layer=6,
    n_head=6,
    n_embd=384,
    dropout=0.1,
    bias=True
)

model = GPT(config)

In [5]:
model.load_state_dict(torch.load("best_model_params_20000.pt"))
model.to("cuda")
model.train()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 384)
    (wpe): Embedding(128, 384)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=384, out_features=1152, bias=True)
          (c_proj): Linear(in_features=384, out_features=384, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=384, out_features=1536, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1536, out_features=384, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=384, out_features=50257, bias=False)
)

In [6]:
with open("fridge_dataset_v1.3_clean.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [7]:
# Можна спростити до input-output пари
pairs = [(item["instruction"], item["response"]) for item in data]
print(pairs[0])

('Where can consumers find detailed replacement instructions for lamps and control gear?', 'For detailed instructions on replacing lamps and control gear, consumers should visit the Samsung website and navigate to the "Support" section. By entering the model name, users can access specific guidance. Professional support is recommended as these components are not user-serviceable.')


In [8]:
import tiktoken
enc = tiktoken.get_encoding("gpt2")

block_size = config.block_size  # =128

def encode_pair(instruction, response):
    prompt = f"question: {instruction}\nanswer: {response}<|END|>"
    tokens = enc.encode_ordinary(prompt)
    if len(tokens) > block_size - 1:
        tokens = tokens[:block_size - 1] + [50256]  # Додаємо <|endoftext|> як токен кінця
    else:
        tokens = tokens + [50256] + [0] * (block_size - len(tokens) - 1)  # Паддінг до block_size
    x = torch.tensor(tokens[:-1], dtype=torch.long)
    y = torch.tensor(tokens[1:], dtype=torch.long)
    return x, y

In [9]:
# Аналіз довжини відповідей
for item in data:
    prompt = f"question: {item['instruction']}\nanswer: {item['response']}<|END|>"
    tokens = enc.encode_ordinary(prompt)
    if len(tokens) > block_size:
        print(f"Instruction: {item['instruction'][:50]}... has {len(tokens)} tokens, exceeds block_size={block_size}")

Instruction: Could you provide the frequency bands and maximum ... has 143 tokens, exceeds block_size=128
Instruction: Which methods are suggested for cleaning the refri... has 136 tokens, exceeds block_size=128
Instruction: How can one prevent the appliance from causing foo... has 146 tokens, exceeds block_size=128
Instruction: How can food contamination be prevented in the app... has 141 tokens, exceeds block_size=128
Instruction: What strategies can be implemented to prevent cont... has 152 tokens, exceeds block_size=128
Instruction: Could you provide the frequency bands and maximum ... has 141 tokens, exceeds block_size=128
Instruction: How can one prevent the appliance from causing foo... has 142 tokens, exceeds block_size=128
Instruction: How is it possible to stop food from becoming cont... has 149 tokens, exceeds block_size=128
Instruction: Can you specify the frequency ranges and highest o... has 135 tokens, exceeds block_size=128
Instruction: What measures can be taken to avo

In [10]:
train_pairs, val_pairs = train_test_split(pairs, test_size=0.15, random_state=42)


In [11]:
from torch.utils.data import Dataset, DataLoader

class InstructionDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.samples = [encode_pair(instr, resp) for instr, resp in data]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [12]:
def collate_fn(batch):
    # batch — список із (x, y) пар
    # Розпаковуємо
    xs, ys = zip(*batch)

    # Паддінг input і output (можна паддити токеном 0, або іншим padding_id)
    xs_padded = torch.nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=0)
    ys_padded = torch.nn.utils.rnn.pad_sequence(ys, batch_first=True, padding_value=0)

    return xs_padded, ys_padded


In [13]:
train_dataset = InstructionDataset(train_pairs)
val_dataset = InstructionDataset(val_pairs)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)


In [14]:
def evaluate(model, val_loader):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to("cuda"), y.to("cuda")
            logits, loss = model(x, y)
            total_loss += loss.item()
    model.train()
    return total_loss / len(val_loader)


In [15]:
from torch.nn import functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = CosineAnnealingLR(optimizer, T_max=50, eta_min=1e-6)
model.train()

for epoch in range(50):
    total_loss = 0.0
    for x, y in train_loader:
        x, y = x.to("cuda"), y.to("cuda")
        logits, loss = model(x, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()
    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss = evaluate(model, val_loader)

    print(f"Epoch {epoch+1}: train loss {avg_train_loss:.4f}, val loss {avg_val_loss:.4f}")


Epoch 1: train loss 3.1222, val loss 2.3109
Epoch 2: train loss 2.1530, val loss 1.8384
Epoch 3: train loss 1.7981, val loss 1.5687
Epoch 4: train loss 1.5614, val loss 1.3416
Epoch 5: train loss 1.3753, val loss 1.1780
Epoch 6: train loss 1.2193, val loss 1.0357
Epoch 7: train loss 1.0836, val loss 0.9072
Epoch 8: train loss 0.9685, val loss 0.8007
Epoch 9: train loss 0.8649, val loss 0.7207
Epoch 10: train loss 0.7792, val loss 0.6424
Epoch 11: train loss 0.7045, val loss 0.5761
Epoch 12: train loss 0.6323, val loss 0.5160
Epoch 13: train loss 0.5768, val loss 0.4666
Epoch 14: train loss 0.5228, val loss 0.4278
Epoch 15: train loss 0.4762, val loss 0.3901
Epoch 16: train loss 0.4374, val loss 0.3536
Epoch 17: train loss 0.3991, val loss 0.3257
Epoch 18: train loss 0.3669, val loss 0.2999
Epoch 19: train loss 0.3405, val loss 0.2786
Epoch 20: train loss 0.3159, val loss 0.2600
Epoch 21: train loss 0.2927, val loss 0.2429
Epoch 22: train loss 0.2733, val loss 0.2285
Epoch 23: train los

In [16]:
torch.save(model.state_dict(), "gpt_1.3_new_gpt_50ep.pt")


In [50]:
def generate_response(instruction, max_new_tokens=100, temperature=0.6, top_k=40):
    prompt = f"question: {instruction}\nanswer:"
    input_ids = enc.encode_ordinary(prompt)
    input_ids = input_ids[:config.block_size]
    input_tensor = torch.tensor(input_ids, dtype=torch.long)[None].to("cuda")

    model.eval()
    with torch.no_grad():
        for _ in range(max_new_tokens):
            if input_tensor.shape[1] > config.block_size:
                input_tensor = input_tensor[:, -config.block_size:]

            logits, _ = model(input_tensor)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                values, _ = torch.topk(logits, top_k)
                logits[logits < values[:, [-1]]] = -float("inf")

            probs = torch.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            input_tensor = torch.cat([input_tensor, next_token], dim=1)

            # Зупиняємо генерацію, якщо згенеровано токен <|END|> (50256)
            if next_token.item() == 50256:
                break

    output_tokens = input_tensor[0].tolist()
    generated = enc.decode(output_tokens[len(input_ids):])
    return generated.strip().replace("<|END|><|endoftext|>", "")  # Видаляємо <|END|> із виводу


In [32]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [41]:
from bert_score import score as bert_score
import logging
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

from transformers.utils import logging
logging.set_verbosity_error()


In [51]:
for instruction, reference in train_pairs[:5]:
    generated = generate_response(instruction)

    # Порахувати BERTScore (F1) для цієї пари
    P, R, F1 = bert_score([generated], [reference], lang='en', verbose=False)

    print("Instruction:", instruction)
    print("Expected:", reference)
    print("Generated:", generated)
    print(f"BERTScore F1: {F1[0].item():.4f}")
    print("-" * 50)

Instruction: What benefits does a multipurpose freezer bin offer in specific models?
Expected: Designed to boost the storage capacity of select refrigerators, the multipurpose freezer bin offers adaptable freezer space. It helps users efficiently arrange and access their frozen goods, meeting a range of storage demands.
Generated: The multipurpose freezer bin is created to improve storage options in certain refrigerator models by providing flexible freezer space usage. This allows users to better organize and easily reach their frozen items, accommodating various storage requirements.
BERTScore F1: 0.9424
--------------------------------------------------
Instruction: Are there any precautions to take when thawing the appliance?
Expected: To avoid harm to the appliance and maintain safe defrosting, do not employ mechanical devices or alternative methods to hasten the process unless explicitly advised by the manufacturer.
Generated: Accelerating defrosting with mechanical equipment or o

In [None]:
import json
from tqdm import tqdm

results = []

# Проходимо по всім прикладам 1
for instruction, reference in tqdm(train_pairs):
    generated = generate_response(instruction)

    results.append({
        "instruction": instruction,
        "expected": reference,
        "generated": generated
    })

# Запис у JSON-файл
with open("chatgpt_1.3_new_gpt_train_pred.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print("✅ Збережено в gpt_train_pred.json")

results = []

# Проходимо по всім прикладам 2
for instruction, reference in tqdm(val_pairs):
    generated = generate_response(instruction)
    P, R, F1 = bert_score([generated], [reference], lang='en', verbose=False)

    results.append({
        "instruction": instruction,
        "expected": reference,
        "generated": generated,
        "BERTScore": f"{F1[0].item():.4f}"
    })

# Запис у JSON-файл
with open("chatgpt_1.3_new_gpt_val_pred.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print("✅ Збережено в gpt_val_pred.json")

 33%|███▎      | 947/2833 [02:52<07:08,  4.40it/s]

In [52]:
print(generate_response("How can I make a salad?"))
print(generate_response("What is the capital of France?"))
print(generate_response("Explain the theory of relativity in simple terms."))
print(generate_response("What are the benefits of regular exercise?"))
print(generate_response("What is the process of photosynthesis?"))

I apologize, but I am a refrigerator assistant and cannot help with cooking techniques.
I apologize, but I am a refrigerator assistant and cannot help with literary analysis.
I apologize, but I am a refrigerator assistant and cannot help with literary advice.
I apologize, but I am a refrigerator assistant and cannot help with sports rules.
I apologize, but I am a refrigerator assistant and cannot help with medical or literary processes.


In [None]:
from bert_score import score
from tqdm import tqdm

test_samples = val_pairs

references = [ex[1] for ex in test_samples]  # response
candidates = [generate_response(ex[0]) for ex in tqdm(test_samples)]  # instruction


P, R, F1 = score(
    candidates,
    references,
    lang="en",
    # model_type="bert-base-uncased",
    device="cuda",
    batch_size=32
)

print(f"Precision: {P.mean().item():.4f}")
print(f"Recall:    {R.mean().item():.4f}")
print(f"F1 Score:  {F1.mean().item():.4f}")


100%|██████████| 500/500 [01:29<00:00,  5.56it/s]


Precision: 0.8100
Recall:    0.8097
F1 Score:  0.8095
