In [1]:
import pandas as pd

def load_qa_from_csv(excel_path: str):
    """Load question/answer pairs from an Excel workbook.

    Despite its name (kept so existing callers keep working), this function
    reads the file with ``pandas.read_excel``, not as CSV.

    Args:
        excel_path: Path to a workbook containing ``prompt`` and ``response``
            columns.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts, one per row in
        which both cells are present and non-blank after stripping whitespace.

    Raises:
        ValueError: If either the ``prompt`` or ``response`` column is absent.
    """
    df = pd.read_excel(excel_path)

    # Validate required columns
    required = ("prompt", "response")
    if any(col not in df.columns for col in required):
        raise ValueError("Excel file must contain 'prompt' and 'response' columns")

    # Drop rows with a missing cell *before* converting to text, so that
    # missing values are detected by pandas itself instead of by comparing
    # the stringified cell against "nan".
    present = df.loc[:, list(required)].dropna()

    qa_pairs = []
    # Iterating the two columns directly avoids building a Series per row,
    # which is what makes ``iterrows`` slow on large sheets.
    for prompt, response in zip(present["prompt"], present["response"]):
        q = str(prompt).strip()
        a = str(response).strip()

        # Skip rows that are blank once surrounding whitespace is removed
        if q and a:
            qa_pairs.append({
                "question": q,
                "answer": a
            })

    return qa_pairs

In [2]:
import math
import random
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset


# =====================
# CONFIG
# =====================
# Shared hyper-parameters; read by the dataset, model, training and
# generation code in this notebook.
CFG = {
    # Prefer the GPU when one is visible to PyTorch, otherwise use the CPU.
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "seq_len": 96,      # fixed length every sequence is truncated/padded to
    "batch_size": 32,   # samples per DataLoader batch
    "epochs": 20,       # full passes over the data in train_loop
    "lr": 3e-4,         # AdamW learning rate
    "d_model": 192,     # embedding / hidden width
    "n_heads": 3,       # attention heads (192 / 3 = 64 dims per head)
    "n_layers": 3,   # used for both stacks: 3 encoder layers and 3 decoder layers
    "d_ff": 768,        # feed-forward width inside each transformer layer
    "dropout": 0.2,     # dropout for the embeddings and transformer layers
}


# =====================
# TOKENIZER
# =====================
class WordTokenizer:
    """Lower-casing word/punctuation tokenizer with a frequency-pruned vocabulary.

    The vocabulary is the four special tokens ``<PAD>``, ``<UNK>``, ``<BOS>``,
    ``<EOS>`` (ids 0-3, in that order) followed by every token seen at least
    ``min_freq`` times in ``texts``, sorted lexicographically.

    Attributes:
        specials: The special tokens, in id order.
        itos: Id -> token list.
        stoi: Token -> id mapping.
        pad_id, unk_id, bos_id, eos_id: Ids of the special tokens.
        vocab_size: Total number of tokens including specials.
    """

    def __init__(self, texts: List[str], min_freq: int = 5):
        """Count tokens over ``texts`` and build the id mappings.

        Args:
            texts: Corpus used to collect token frequencies.
            min_freq: Minimum number of occurrences for a token to be kept;
                rarer tokens are encoded as ``<UNK>``.
        """
        self.specials = ["<PAD>", "<UNK>", "<BOS>", "<EOS>"]

        freq = {}
        for text in texts:
            for token in self._pre_tokenize(text):
                freq[token] = freq.get(token, 0) + 1

        vocab = sorted(token for token, count in freq.items() if count >= min_freq)
        self.itos = self.specials + vocab
        self.stoi = {token: idx for idx, token in enumerate(self.itos)}

        self.pad_id = self.stoi["<PAD>"]
        self.unk_id = self.stoi["<UNK>"]
        self.bos_id = self.stoi["<BOS>"]
        self.eos_id = self.stoi["<EOS>"]

        self.vocab_size = len(self.itos)

    def _pre_tokenize(self, text):
        """Split lower-cased ``text`` into alphanumeric runs and single symbols.

        Whitespace separates tokens and is discarded; every other
        non-alphanumeric character becomes a token of its own.
        """
        tokens = []
        word = []
        for ch in text.lower():
            if ch.isalnum():
                word.append(ch)
                continue
            # A non-alphanumeric character ends the word being collected.
            if word:
                tokens.append("".join(word))
                word = []
            if not ch.isspace():
                tokens.append(ch)
        if word:
            tokens.append("".join(word))
        return tokens

    def encode(self, text):
        """Return ``[<BOS>, token ids..., <EOS>]``; unknown tokens map to ``<UNK>``."""
        ids = [self.bos_id]
        ids.extend(self.stoi.get(tok, self.unk_id) for tok in self._pre_tokenize(text))
        ids.append(self.eos_id)
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces, skipping special tokens.

        Ids outside ``[0, vocab_size)`` are ignored. Negative ids are rejected
        explicitly because Python list indexing would otherwise wrap around
        and return a token from the end of the vocabulary.
        """
        size = len(self.itos)
        words = []
        for i in ids:
            if 0 <= i < size:
                token = self.itos[i]
                if token not in self.specials:
                    words.append(token)
        return " ".join(words)


# =====================
# DATASET
# =====================
class QADataset(Dataset):
    """Map-style dataset of tokenized question/answer pairs.

    Each record's ``"question"`` and ``"answer"`` strings are encoded once at
    construction time and truncated to ``seq_len`` ids. Items are returned as
    fixed-length tensors for teacher-forced encoder/decoder training.
    """

    def __init__(self, data, tokenizer, seq_len):
        """Tokenize every record in ``data``.

        Args:
            data: Iterable of dicts with ``"question"`` and ``"answer"`` keys.
            tokenizer: Object exposing ``pad_id`` and ``encode(text)``.
            seq_len: Length every sequence is truncated and padded to.
        """
        self.pad = tokenizer.pad_id
        self.seq_len = seq_len
        self.items = [
            (
                tokenizer.encode(record["question"])[:seq_len],
                tokenizer.encode(record["answer"])[:seq_len],
            )
            for record in data
        ]

    def pad_seq(self, x):
        """Return ``x`` right-padded with the pad id up to ``seq_len`` entries."""
        missing = self.seq_len - len(x)
        return x + [self.pad] * missing

    def _length_mask(self, n_real):
        """Return a ``seq_len``-long tensor: ``n_real`` ones, then zeros."""
        return torch.tensor([1] * n_real + [0] * (self.seq_len - n_real))

    def __getitem__(self, idx):
        """Build the tensors for sample ``idx``.

        The decoder input is the answer without its last id and the decoder
        target is the answer without its first id, i.e. the same sequence
        shifted by one position.
        """
        question, answer = self.items[idx]
        shifted_in, shifted_out = answer[:-1], answer[1:]

        return {
            "enc": torch.tensor(self.pad_seq(question)),
            "dec_in": torch.tensor(self.pad_seq(shifted_in)),
            "dec_out": torch.tensor(self.pad_seq(shifted_out)),
            "enc_mask": self._length_mask(len(question)),
            "dec_mask": self._length_mask(len(answer) - 1),
        }

    def __len__(self):
        """Number of question/answer pairs."""
        return len(self.items)


# =====================
# MODEL
# =====================
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first tensor.

    Even feature indices hold ``sin(pos * freq)`` and odd indices hold
    ``cos(pos * freq)``. The table is stored as a buffer named ``pe`` with
    shape ``(1, max_len, d_model)``, so it follows the module across devices
    but is not a trainable parameter.
    """

    def __init__(self, d_model, max_len):
        """Precompute the encoding table.

        Args:
            d_model: Size of the feature dimension (even or odd).
            max_len: Largest sequence length the table covers.
        """
        super().__init__()
        position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        div = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32)
            * (-math.log(10000.0) / d_model)
        )
        angles = position * div  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even ones,
        # so the cosine half uses only the first d_model // 2 frequencies.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor indexed as ``(batch, seq, d_model)``.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds positional table size "
                f"{self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len]


class EncoderDecoderQA(nn.Module):
    """Transformer encoder-decoder producing next-token logits.

    The encoder and decoder share one token embedding, and the output
    projection ``head`` reuses that embedding's weight matrix (weight tying).

    NOTE(review): the embedding keeps PyTorch's default initialisation while
    also serving as the output projection. The recorded first-step loss (~68
    with a ~68k-token vocabulary) suggests very large initial logits; a
    smaller embedding init may be worth evaluating -- not changed here so
    existing checkpoints keep their meaning.
    """

    def __init__(self, vocab_size, d_model, n_heads, n_layers, d_ff, dropout,
                 pad_id=0):
        """Build the model.

        Args:
            vocab_size: Number of token ids.
            d_model: Embedding / hidden width.
            n_heads: Attention heads per layer.
            n_layers: Number of layers in the encoder and in the decoder.
            d_ff: Feed-forward width inside each layer.
            dropout: Dropout probability for embeddings and layers.
            pad_id: Id of the padding token (default 0, the previously
                hard-coded value).
        """
        super().__init__()
        self.pad_id = pad_id

        self.emb = nn.Embedding(
            vocab_size,
            d_model,
            padding_idx=pad_id
        )

        self.pos = PositionalEncoding(d_model, max_len=512)
        self.emb_dropout = nn.Dropout(dropout)

        enc_layer = nn.TransformerEncoderLayer(
            d_model, n_heads, d_ff, dropout, batch_first=True
        )
        dec_layer = nn.TransformerDecoderLayer(
            d_model, n_heads, d_ff, dropout, batch_first=True
        )

        self.encoder = nn.TransformerEncoder(enc_layer, n_layers)
        self.decoder = nn.TransformerDecoder(dec_layer, n_layers)

        self.norm = nn.LayerNorm(d_model)

        # Weight tying: the output projection shares the embedding matrix.
        self.head = nn.Linear(d_model, vocab_size, bias=True)
        self.head.weight = self.emb.weight

    def _embed(self, ids):
        """Embed ``ids``, add positions, zero padded positions, apply dropout."""
        x = self.pos(self.emb(ids))
        # Zero the whole vector (token + position) wherever the input is padding.
        x = x.masked_fill(ids.unsqueeze(-1) == self.pad_id, 0.0)
        return self.emb_dropout(x)

    def forward(self, enc_ids, dec_ids, enc_mask, dec_mask):
        """Return logits indexed as ``(batch, dec_len, vocab_size)``.

        Args:
            enc_ids: Encoder token ids, ``(batch, enc_len)``.
            dec_ids: Decoder input token ids, ``(batch, dec_len)``.
            enc_mask: Truthy for real encoder tokens, falsy for padding.
            dec_mask: Truthy for real decoder tokens, falsy for padding.
        """
        # PyTorch's key-padding masks mark positions to IGNORE, hence the inversion.
        enc_pad = ~enc_mask.bool()
        dec_pad = ~dec_mask.bool()

        memory = self.encoder(
            self._embed(enc_ids),
            src_key_padding_mask=enc_pad
        )

        dec = self._embed(dec_ids)

        # Boolean causal mask (True = may not attend). Using the same dtype
        # as the key-padding masks avoids PyTorch's deprecated mix of a float
        # attention mask with boolean padding masks; the masking is identical.
        T = dec.size(1)
        tgt_mask = torch.triu(
            torch.ones(T, T, dtype=torch.bool, device=dec.device),
            diagonal=1
        )

        out = self.decoder(
            dec,
            memory,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=dec_pad,
            memory_key_padding_mask=enc_pad,
        )

        return self.head(self.norm(out))



from torch.nn.utils import clip_grad_norm_

def train_loop(model, dataloader, optimizer, device, epochs=None):
    """Train ``model`` with teacher forcing and token-level cross-entropy.

    Args:
        model: Module called as ``model(enc_ids=..., dec_ids=..., enc_mask=...,
            dec_mask=...)`` and returning logits of shape ``[B, T, V]``.
        dataloader: Sized iterable of batches; each batch is a dict with the
            keys ``enc``, ``dec_in``, ``dec_out``, ``enc_mask``, ``dec_mask``.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``dataloader``. Defaults to
            ``CFG["epochs"]`` when omitted.

    Returns:
        A list with the average loss of each epoch.

    Raises:
        ValueError: If ``dataloader`` yields no batches (for example because
            ``drop_last=True`` discarded the only, incomplete batch).
    """
    if epochs is None:
        epochs = CFG["epochs"]

    total_steps = len(dataloader)
    if total_steps == 0:
        # Fail early with a clear message instead of a ZeroDivisionError when
        # the epoch average is computed.
        raise ValueError("dataloader yields no batches; nothing to train on")

    model.train()

    criterion = nn.CrossEntropyLoss(ignore_index=0)  # PAD = 0
    epoch_losses = []

    for epoch in range(epochs):
        total_loss = 0.0

        for step, batch in enumerate(dataloader):
            enc_ids = batch["enc"].to(device)
            dec_in = batch["dec_in"].to(device)
            dec_out = batch["dec_out"].to(device)
            enc_mask = batch["enc_mask"].to(device)
            dec_mask = batch["dec_mask"].to(device)

            # Forward
            logits = model(
                enc_ids=enc_ids,
                dec_ids=dec_in,
                enc_mask=enc_mask,
                dec_mask=dec_mask,
            )

            # logits: [B, T, V] -> [B*T, V]
            # targets: [B, T] -> [B*T]
            loss = criterion(
                logits.reshape(-1, logits.size(-1)),
                dec_out.reshape(-1),
            )

            optimizer.zero_grad()
            loss.backward()

            # Clip the global gradient norm to stabilize training
            clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            step_loss = loss.item()
            total_loss += step_loss

            if step % 100 == 0:
                print(
                    f"Epoch [{epoch+1}/{epochs}], "
                    f"Step [{step}/{total_steps}], "
                    f"Loss: {step_loss:.4f}"
                )

        avg_loss = total_loss / total_steps
        epoch_losses.append(avg_loss)
        print(f"\n✅ Epoch {epoch+1} finished | Avg loss: {avg_loss:.4f}\n")

    return epoch_losses


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from pathlib import Path


# Training entry point: load the QA data, build the tokenizer, dataset and
# model, train, then save a checkpoint. The names bound here (model,
# tokenizer, device, ...) are module-level and reused by later cells.
if __name__ == "__main__":
    device = torch.device(CFG["device"])
    print("Using device:", device)

    # -------------------------
    # Load the QA spreadsheet from Kaggle
    # (an .xlsx workbook, despite the csv_* naming)
    # -------------------------
    csv_path = Path("/kaggle/input/startup-mentor-business-qa/startup_mentor_dataset.xlsx")
    if not csv_path.exists():
        raise SystemExit(f"CSV file not found at {csv_path}")

    data = load_qa_from_csv(csv_path)
    print(f"Loaded {len(data)} QA pairs")

    # -------------------------
    # Build tokenizer over questions and answers together;
    # min_freq=1 keeps every token that occurs at all
    # -------------------------
    texts = [d["question"] + " " + d["answer"] for d in data]
    tokenizer = WordTokenizer(texts, min_freq=1)
    print("Vocab size:", tokenizer.vocab_size)

    # -------------------------
    # Dataset & Dataloader
    # (drop_last=True discards a final batch smaller than batch_size)
    # -------------------------
    dataset = QADataset(data, tokenizer, CFG["seq_len"])
    dataloader = DataLoader(
        dataset,
        batch_size=CFG["batch_size"],
        shuffle=True,
        drop_last=True
    )

    # -------------------------
    # Model
    # -------------------------
    model = EncoderDecoderQA(
        vocab_size=tokenizer.vocab_size,
        d_model=CFG["d_model"],
        n_heads=CFG["n_heads"],
        n_layers=CFG["n_layers"],
        d_ff=CFG["d_ff"],
        dropout=CFG["dropout"]
    ).to(device)

    # Integer division: the count is truncated to whole millions.
    print("Model parameters:",
          sum(p.numel() for p in model.parameters()) // 1_000_000, "M")

    # -------------------------
    # Optimizer
    # -------------------------
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=CFG["lr"],
        weight_decay=0.01
    )

    # -------------------------
    # Train
    # -------------------------
    train_loop(model, dataloader, optimizer, device)

    # -------------------------
    # Save model weights together with the vocabulary (id -> token list)
    # and the config used for this run
    # -------------------------
    torch.save({
        "model_state": model.state_dict(),
        "vocab": tokenizer.itos,
        "config": CFG
    }, "qa_encoder_decoder_from_scratch.pth")

    print("Training complete. Model saved.")


Using device: cuda
Loaded 62758 QA pairs
Vocab size: 68324




Model parameters: 16 M




Epoch [1/20], Step [0/1961], Loss: 68.1704
Epoch [1/20], Step [100/1961], Loss: 24.5498
Epoch [1/20], Step [200/1961], Loss: 17.1170
Epoch [1/20], Step [300/1961], Loss: 14.2563
Epoch [1/20], Step [400/1961], Loss: 11.1726
Epoch [1/20], Step [500/1961], Loss: 10.0702
Epoch [1/20], Step [600/1961], Loss: 8.8260
Epoch [1/20], Step [700/1961], Loss: 8.5764
Epoch [1/20], Step [800/1961], Loss: 7.9102
Epoch [1/20], Step [900/1961], Loss: 7.5515
Epoch [1/20], Step [1000/1961], Loss: 7.2196
Epoch [1/20], Step [1100/1961], Loss: 6.8714
Epoch [1/20], Step [1200/1961], Loss: 6.9677
Epoch [1/20], Step [1300/1961], Loss: 6.8175
Epoch [1/20], Step [1400/1961], Loss: 6.8439
Epoch [1/20], Step [1500/1961], Loss: 6.8545
Epoch [1/20], Step [1600/1961], Loss: 6.5177
Epoch [1/20], Step [1700/1961], Loss: 6.7719
Epoch [1/20], Step [1800/1961], Loss: 6.3172
Epoch [1/20], Step [1900/1961], Loss: 6.5807

✅ Epoch 1 finished | Avg loss: 10.0342

Epoch [2/20], Step [0/1961], Loss: 6.3352
Epoch [2/20], Step [1

In [4]:
def interactive_chat(model, tokenizer, device):
    """Run a blocking read-generate-print loop on standard input.

    Each non-empty line is passed to ``generate`` and the sampled answer is
    printed. Typing ``x`` (any case) ends the session; so does end-of-input
    or a keyboard interrupt.

    Args:
        model: Trained model; it is switched to eval mode.
        tokenizer: Tokenizer handed to ``generate``.
        device: Device forwarded to ``generate`` for the input tensors.
    """
    print("\nInteractive mode started.")
    print("Type your question and press Enter.")
    print("Type 'x' to exit.\n")

    model.eval()

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave the loop instead of crashing.
            print("\nExiting chat.")
            break

        if user_input.lower() == "x":
            print("Exiting chat.")
            break

        if not user_input:
            continue

        with torch.no_grad():
            # Keyword arguments keep the sampling settings self-describing,
            # and the device argument is forwarded instead of being ignored.
            response = generate(
                model,
                tokenizer,
                user_input,
                max_new_tokens=120,
                temperature=0.9,
                top_k=40,
                device=device,
            )

        print("Model:", response)
        print("-" * 60)


In [5]:
def generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=120,
    temperature=0.9,
    top_k=40,
    device=None,
    seq_len=None
):
    """Sample an answer for ``prompt`` token by token.

    Args:
        model: Encoder-decoder called as
            ``model(enc_ids, dec_ids, enc_mask, dec_mask)``.
        tokenizer: Provides ``encode``, ``decode``, ``pad_id``, ``bos_id`` and
            ``eos_id``.
        prompt: Question text fed to the encoder.
        max_new_tokens: Upper bound on the number of sampled tokens.
        temperature: Softmax temperature; values near 0 approach greedy
            decoding (clamped to at least 1e-6).
        top_k: Sample only among the ``top_k`` most likely tokens. ``0`` or
            ``None`` disables the filter; values larger than the vocabulary
            are clamped to its size.
        device: Device for the input tensors; defaults to the device of the
            model's first parameter.
        seq_len: Padded encoder length and cap on the decoder length.
            Defaults to ``CFG["seq_len"]`` when omitted.

    Returns:
        The decoded answer text.
    """
    model.eval()
    device = device or next(model.parameters()).device
    if seq_len is None:
        seq_len = CFG["seq_len"]

    # -------- Encoder input: truncate, then right-pad to seq_len --------
    prompt_ids = tokenizer.encode(prompt)[:seq_len]
    prompt_ids = prompt_ids + [tokenizer.pad_id] * (seq_len - len(prompt_ids))

    enc_ids = torch.tensor([prompt_ids], device=device)
    enc_mask = (enc_ids != tokenizer.pad_id)

    # -------- Decoder starts with BOS --------
    dec_ids = torch.tensor([[tokenizer.bos_id]], device=device)

    with torch.no_grad():
        for _ in range(max_new_tokens):
            dec_mask = (dec_ids != tokenizer.pad_id)
            logits = model(enc_ids, dec_ids, enc_mask, dec_mask)

            # Logits for the most recent position only
            next_logits = logits[0, -1] / max(temperature, 1e-6)

            # block PAD token
            next_logits[tokenizer.pad_id] = -1e10

            # top-k sampling; k is clamped because torch.topk raises when
            # asked for more entries than the vocabulary holds
            if top_k is not None and top_k > 0:
                k = min(top_k, next_logits.size(-1))
                kth_value = torch.topk(next_logits, k).values[-1]
                next_logits[next_logits < kth_value] = -1e10

            probs = torch.softmax(next_logits, dim=-1)
            next_id = torch.multinomial(probs, 1).item()

            dec_ids = torch.cat(
                [dec_ids, torch.tensor([[next_id]], device=device)],
                dim=1
            )

            # Stop at end-of-sequence or once the decoder reaches seq_len
            if next_id == tokenizer.eos_id or dec_ids.size(1) >= seq_len:
                break

    return tokenizer.decode(dec_ids[0].tolist())


In [6]:
# Start the chat loop with the model, tokenizer and device created by the
# training cell above.
interactive_chat(model, tokenizer, device)



Interactive mode started.
Type your question and press Enter.
Type 'x' to exit.



You:  how can i validate my idea?


Model: to validate your idea by using an digital application network ( a service and video / software ) through the user experience [ 2 ] [ 4 ] . you can also validate your idea , your code , and you can address a website called a mobile connection and the digital file to handle some , such as an audio , or a file or wi - fi [ 1 ] [ 4 ] . it works by using a user or software to create a digital or user app [ 4 ] .
------------------------------------------------------------


You:  thanks


Model: let !
------------------------------------------------------------


You:  hello


Model: feel happy as you see .
------------------------------------------------------------


You:  i want to open a startup in foods category what should i do?


Model: yes , you want to open a startup in food category to open a startup category . this is because it is a combination of negative factors , such as a person with the risk of a specific mental health problem , or an increase in a product , and the food is necessary [ 3 ] [ 4 ] . it can also contribute to your food taste and make a small amount of different foods . there is no evidence that it should be a bad product [ 5 ] , but it
------------------------------------------------------------


You:  ok so we still need data


Model: we still need data data by using a combination of factors , such as the internet , which are designed to keep data more complex and ensure [ 1 ] . it is also designed to maintain a network that is a lot of data that uses the system ’ and not just to keep the data . additionally , it is not possible to be used to identify data from the app [ 1 ] [ 3 ] [ 5 ] .
------------------------------------------------------------


You:  x


Exiting chat.
