In [None]:
# Colab-only: mount Google Drive at /content/gdrive so files stored there can be opened by path.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
cd "/content/gdrive/My Drive/Sequential_model/my_GPT"

/content/gdrive/My Drive/Sequential_model/my_GPT


In [None]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import numpy as np
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the raw dataset as a list of lines; each line keeps its trailing newline.
with open('math_dataset.txt', 'r') as file:
    data = list(file)

In [None]:
data[:6]

['Q: John has 5 apples, and he gives 2 apples to his friend. How many apples does John have left?\n',
 'A: John has 3 apples left.\n',
 '\n',
 'Q: Sara bought a shirt for $20 and a pair of pants for $30. How much did she spend in total?\n',
 'A: Sara spent a total of $50.\n',
 '\n']

In [None]:
# Indices of the special tokens. They match the order in which Lang.__init__
# registers '<PAD>', '<SOS>' and '<EOS>' (positions 0, 1 and 2).
PAD_token = 0
SOS_token = 1
EOS_token = 2

class Lang:
    """Word-level vocabulary with a small rule-based tokenizer.

    The vocabulary is pre-seeded with the special tokens '<PAD>', '<SOS>',
    '<EOS>', '<UNK>' (indices 0-3) followed by the strings '0' .. '999'
    (indices 4-1003). Every other word gets the next free index the first
    time it is seen.

    Attributes:
        name: free-form label for this vocabulary.
        word2index: token -> integer index.
        word2count: token -> number of occurrences (seeded tokens start at 1).
        index2word: integer index -> token.
        n_words: current vocabulary size.
        max_len: token count of the longest sentence passed to addSentence.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.n_words = 0
        self.max_len = 0

        # Reserve the special tokens first, then every integer below 1000.
        reserved = ['<PAD>', '<SOS>', '<EOS>', '<UNK>'] + [str(number) for number in range(1000)]
        for index, token in enumerate(reserved):
            self.word2index[token] = index
            self.word2count[token] = 1
            self.index2word[index] = token
        self.n_words = len(reserved)

    def addSentence(self, sentence):
        """Tokenize `sentence`, register every token and track the longest sentence."""
        tokens = self.splitWord(sentence)
        self.max_len = max(self.max_len, len(tokens))
        for token in tokens:
            self.addWord(token)

    def splitWord(self, sentence):
        """Split `sentence` into tokens, isolating punctuation and '$'.

        A space is inserted before each of , : . ? / and after each of . / $,
        then the text is split on whitespace.
        """
        space_before = (',', ':', '.', '?', '/')
        space_after = ('.', '/', '$')

        pieces = []
        for char in sentence:
            prefix = " " if char in space_before else ""
            suffix = " " if char in space_after else ""
            pieces.append(prefix + char + suffix)
        return "".join(pieces).split()

    def addWord(self, word):
        """Count `word`, assigning it the next free index if it is new."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        index = self.n_words
        self.word2index[word] = index
        self.word2count[word] = 1
        self.index2word[index] = word
        self.n_words = index + 1

In [None]:
# Quick check of the tokenizer on a sentence containing '$' and a decimal number.
# NOTE: `en` created here is the vocabulary object that the later cells populate and use.
q = "Q: Sara bought a shirt for $20.00 and a pair of pants for $30. How much did she spend in total?"
en = Lang("EN")
print(en.splitWord(q))

['Q', ':', 'Sara', 'bought', 'a', 'shirt', 'for', '$', '20', '.', '00', 'and', 'a', 'pair', 'of', 'pants', 'for', '$', '30', '.', 'How', 'much', 'did', 'she', 'spend', 'in', 'total', '?']


In [None]:
def indexesFromSentence(lang, sentence):
    """Convert `sentence` into a list of vocabulary indices.

    Args:
        lang: vocabulary object exposing `splitWord(sentence)` and `word2index`.
        sentence: raw text to tokenize and encode.

    Returns:
        list[int]: one index per token, in order.

    Raises:
        KeyError: if a token is unknown and the vocabulary has no '<UNK>' entry.
    """
    # Out-of-vocabulary words map to '<UNK>' when the vocabulary defines it,
    # so free-form prompts do not crash on an unseen word.
    unk_index = lang.word2index.get("<UNK>")
    indexes = []
    for word in lang.splitWord(sentence):
        index = lang.word2index.get(word, unk_index)
        if index is None:
            raise KeyError(word)
        indexes.append(index)
    return indexes

def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a (1, t) long tensor on `device`, terminated by '<EOS>'.

    No '<SOS>' token is prepended.
    """
    token_ids = indexesFromSentence(lang, sentence) + [lang.word2index["<EOS>"]]
    # Wrapping the list adds the leading batch dimension of size 1.
    return torch.tensor([token_ids], dtype=torch.long, device=device)  # (1, t)

In [None]:
def prepareData():
    """Group the raw dataset lines into question/answer records.

    The module-level `data` list is read in strides of three lines
    (question, answer, separator). Newlines are stripped from the text.

    Returns:
        tuple: (qa_pairs, questions, answers) where `qa_pairs[i]` is
        "<question> <answer>" and the other two lists hold the parts.
    """
    qa_pairs = []
    questions = []
    answers = []
    # Stop at len(data) - 1 so a record is only read when its answer line
    # exists; a dangling trailing line (e.g. an extra blank line at the end
    # of the file) is skipped instead of raising IndexError.
    for i in range(0, len(data) - 1, 3):
        question = data[i].replace("\n", "")
        answer = data[i+1].replace("\n", "")
        questions.append(question)
        answers.append(answer)
        qa_pairs.append(question + " " + answer)
    return qa_pairs, questions, answers

# Build the three parallel lists once and show the first two joined "question answer" strings.
qa_pairs, questions, answers = prepareData()
print(qa_pairs[:2])

['Q: John has 5 apples, and he gives 2 apples to his friend. How many apples does John have left? A: John has 3 apples left.', 'Q: Sara bought a shirt for $20 and a pair of pants for $30. How much did she spend in total? A: Sara spent a total of $50.']


In [None]:
# Register every token of every question/answer pair in the vocabulary, then report its size.
for pair in qa_pairs:
    en.addSentence(pair)
print(en.n_words)

1255


In [None]:
#tensorFromSentence(en, qa_pairs[0])

In [None]:
# Sanity check: the token list and the index list for the first pair should have the same length.
print(en.splitWord(qa_pairs[0]))
print(len(en.splitWord(qa_pairs[0])))
print(indexesFromSentence(en, qa_pairs[0]))
print(len(indexesFromSentence(en, qa_pairs[0])))

['Q', ':', 'John', 'has', '5', 'apples', ',', 'and', 'he', 'gives', '2', 'apples', 'to', 'his', 'friend', '.', 'How', 'many', 'apples', 'does', 'John', 'have', 'left', '?', 'A', ':', 'John', 'has', '3', 'apples', 'left', '.']
32
[1004, 1005, 1006, 1007, 9, 1008, 1009, 1010, 1011, 1012, 6, 1008, 1013, 1014, 1015, 1016, 1017, 1018, 1008, 1019, 1006, 1020, 1021, 1022, 1023, 1005, 1006, 1007, 7, 1008, 1021, 1016]
32


In [None]:
# Re-read the special-token indices from the vocabulary instead of relying on hard-coded values.
SOS_token = en.word2index['<SOS>']
EOS_token = en.word2index['<EOS>']
# Token count of the longest sentence registered through en.addSentence.
MAX_LENGTH = en.max_len #max sentence length

def get_dataloader(batch_size):
    """Build a shuffled DataLoader of next-token-prediction pairs.

    For every question/answer string the token indices `ids` are turned into
        input  = [<SOS>] + ids
        target = ids + [<EOS>]
    so that target[t] is the token following input[:t+1]. Both are
    right-padded with PAD_token to MAX_LENGTH + 2 columns.

    Args:
        batch_size: number of sequences per batch.

    Returns:
        DataLoader yielding (input_ids, target_ids) long tensors of shape
        (batch, MAX_LENGTH + 2) that already live on `device`.
    """
    qa_pairs, _, _ = prepareData()
    n = len(qa_pairs)
    # Width kept at MAX_LENGTH + 2 (one spare column) so batch shapes stay
    # the same as before; padded target positions are skipped by the loss.
    input_ids = np.full((n, MAX_LENGTH+2), PAD_token, dtype=np.int32)
    target_ids = np.full((n, MAX_LENGTH+2), PAD_token, dtype=np.int32)

    for idx, pair in enumerate(qa_pairs):
        token_ids = indexesFromSentence(en, pair)
        # Build the shifted sequences directly. The previous insert/append
        # followed by list.remove(value) deleted the *first occurrence of a
        # value*, which only hit the intended position by coincidence.
        inp_ids = [SOS_token] + token_ids   # <SOS> w1 w2 ... wn
        tgt_ids = token_ids + [EOS_token]   # w1 w2 ... wn <EOS>
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return train_dataloader

In [None]:
# Inspect a single batch: print its shapes, the decoded first target sequence,
# and the (context -> next token) pairs the model is trained on.
train_dataloader = get_dataloader(2)
for inp, tgt in train_dataloader:
    print(inp.shape)
    print(tgt.shape)
    indexs = tgt[0].tolist()
    print(" ".join([en.index2word[idx] for idx in indexs]))

    for b in range(1): # batch dimension (first sequence only)
        for t in range(len(indexs)-1): # time dimension
            context = inp[b, :t+1]
            target = tgt[b,t]
            print(f"when input is {context.tolist()} the target: {target}")

    # Only the first batch is inspected. The statements that used to follow
    # this break were unreachable and have been removed.
    break


torch.Size([2, 52])
torch.Size([2, 52])
Q : Peter has 4 times as many marbles as Jack . If Jack has 8 marbles , how many marbles does Peter have ? A : Peter has 32 marbles . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
when input is [1] the target: 1004
when input is [1, 1004] the target: 1005
when input is [1, 1004, 1005] the target: 1134
when input is [1, 1004, 1005, 1134] the target: 1007
when input is [1, 1004, 1005, 1134, 1007] the target: 8
when input is [1, 1004, 1005, 1134, 1007, 8] the target: 1161
when input is [1, 1004, 1005, 1134, 1007, 8, 1161] the target: 1136
when input is [1, 1004, 1005, 1134, 1007, 8, 1161, 1136] the target: 1018
when input is [1, 1004, 1005, 1134, 1007, 8, 1161, 1136, 1018] the target: 1049
when input is [1, 1004, 1005, 1134, 1007, 8, 1161, 1136, 1018, 1049] the target: 1136
when input is [1, 1004, 1005, 1134, 1007, 8, 1161, 1136, 1018, 1049, 1136] the target: 1137
when input i

---
# Model

In [None]:
#ref: https://pytorch.org/tutorials/beginner/translation_transformer.html

from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn import Transformer
import copy
import math
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to token embeddings.

    A (1, maxlen, emb_size) table is precomputed with sine values on the even
    channels and cosine values on the odd channels, and stored as the
    non-trainable buffer `pos_embedding`.

    Args:
        emb_size: embedding width.
        dropout: dropout probability applied after the addition.
        maxlen: number of positions to precompute.
    """

    def __init__(self, emb_size, dropout, maxlen = 5000):
        super().__init__()
        positions = torch.arange(maxlen).unsqueeze(1)  # (maxlen, 1)
        inv_freq = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        angles = positions * inv_freq  # (maxlen, emb_size / 2)

        table = torch.zeros(maxlen, emb_size)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        self.dropout = nn.Dropout(dropout)
        # Leading axis of size 1 lets the table broadcast over the batch.
        self.register_buffer('pos_embedding', table.unsqueeze(0))

    def forward(self, token_embedding):
        """Return dropout(token_embedding + positions) for a (b, t, c) input."""
        seq_len = token_embedding.size(1)
        positioned = token_embedding + self.pos_embedding[:, :seq_len]
        return self.dropout(positioned)

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by sqrt(emb_size).

    Args:
        vocab_size: number of rows in the embedding table.
        emb_size: embedding width.
    """

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Map integer indices (b, t) to scaled embeddings (b, t, emb_size)."""
        scale = math.sqrt(self.emb_size)
        embedded = self.embedding(tokens.long())  # cast so any integer dtype is accepted
        return embedded * scale

def generate_square_subsequent_mask(size):
    """Return the (size, size) additive causal attention mask on `device`.

    Entry [i, j] is 0.0 when j <= i (position i may attend to j) and -inf
    when j > i (future positions are hidden).
    """
    blocked = torch.full((size, size), float('-inf'), device=device)
    # Keep -inf strictly above the diagonal; everything else becomes 0.0.
    return torch.triu(blocked, diagonal=1)

def create_mask(src, tgt):
    """Build attention and padding masks for a source/target batch.

    Args:
        src: index tensor of shape (b, t_src).
        tgt: index tensor of shape (b, t_tgt).

    Returns:
        tuple:
            src_mask (t_src, t_src): all-False bool mask (nothing hidden),
            tgt_mask (t_tgt, t_tgt): causal mask from generate_square_subsequent_mask,
            src_padding_mask (b, t_src): True where src equals PAD_token,
            tgt_padding_mask (b, t_tgt): True where tgt equals PAD_token.
    """
    src_seq_len, tgt_seq_len = src.shape[1], tgt.shape[1]

    causal = generate_square_subsequent_mask(tgt_seq_len)
    unrestricted = torch.zeros((src_seq_len, src_seq_len), dtype=torch.bool, device=device)

    src_is_pad = src.eq(PAD_token)
    tgt_is_pad = tgt.eq(PAD_token)

    return unrestricted, causal, src_is_pad, tgt_is_pad

class TransformerEncoderLayer(nn.Module):
    """One post-norm Transformer block: self-attention then feed-forward.

    Each sub-block is wrapped as LayerNorm(x + Dropout(sublayer(x))).

    Args:
        d_model: feature width of the input and output.
        nhead: number of attention heads.
        dim_feedforward: hidden width of the feed-forward network.
        dropout: dropout probability used throughout the layer.
        activation: callable applied between the two linear layers.
        layer_norm_eps: epsilon for both LayerNorm modules.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout,
                 activation = F.relu, layer_norm_eps = 1e-5):
        super().__init__()
        # Submodules are created in a fixed order so parameter initialisation
        # and state_dict layout are stable.
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)

        # Position-wise feed-forward network.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.activation = activation
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, src_mask, src_key_padding_mask):
        """Apply the block to `x` (b, t, d_model) and return the same shape."""
        # Residual add first, normalise second (post-norm). The pre-norm
        # alternative would be:
        #   x = x + self._sa_block(self.norm1(x), src_mask, src_key_padding_mask)
        #   x = x + self._ff_block(self.norm2(x))
        attended = self._sa_block(x, src_mask, src_key_padding_mask)
        x = self.norm1(x + attended)
        transformed = self._ff_block(x)
        x = self.norm2(x + transformed)
        return x

    def _sa_block(self, x, attn_mask, key_padding_mask):
        """Self-attention over `x` followed by dropout."""
        attn_out, _ = self.self_attn(x, x, x,
                                     attn_mask=attn_mask,
                                     key_padding_mask=key_padding_mask,
                                     need_weights=False)
        return self.dropout1(attn_out)

    def _ff_block(self, x):
        """Linear -> activation -> dropout -> linear, followed by dropout."""
        hidden = self.activation(self.linear1(x))
        projected = self.linear2(self.dropout(hidden))
        return self.dropout2(projected)

class TransformerEncoder(nn.Module):
    """Stack of `num_layers` independent copies of `encoder_layer` plus a final norm.

    Args:
        encoder_layer: module called as layer(x, src_mask=..., src_key_padding_mask=...).
        num_layers: how many deep copies of `encoder_layer` to stack.
        norm: module applied to the output of the last layer.
    """

    def __init__(self, encoder_layer, num_layers, norm):
        super().__init__()
        # deepcopy gives every layer its own parameters.
        clones = [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
        self.layers = nn.ModuleList(clones)
        self.norm = norm

    def forward(self, src, mask=None, src_key_padding_mask=None):
        """Run `src` through every layer in order, then through the final norm."""
        hidden = src
        for layer in self.layers:
            hidden = layer(hidden, src_mask=mask, src_key_padding_mask=src_key_padding_mask)
        # Final normalisation (not part of the original Transformer paper).
        return self.norm(hidden)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    """Token-level language model built from a stack of self-attention layers.

    Pipeline: token embedding -> positional encoding -> encoder stack ->
    linear projection to vocabulary logits.

    Args:
        num_encoder_layers: number of stacked encoder layers.
        d_model: embedding / model width.
        nhead: attention heads per layer.
        vocab_size: size of the input and output vocabulary.
        dim_feedforward: hidden width of each feed-forward block.
        dropout: dropout probability.
        activation: activation used in the feed-forward blocks.
        layer_norm_eps: epsilon for every LayerNorm.
    """

    def __init__(self, num_encoder_layers,
                 d_model, nhead, vocab_size,
                 dim_feedforward = 512, dropout = 0.1, activation = F.relu, layer_norm_eps = 1e-5):
        super().__init__()
        self.encoder = TransformerEncoder(
            TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation, layer_norm_eps),
            num_encoder_layers,
            nn.LayerNorm(d_model, eps=layer_norm_eps),
        )

        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout)

        # Projects hidden states back to vocabulary logits.
        self.generator = nn.Linear(d_model, vocab_size)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """Return logits of shape (b, t, vocab_size) for index tensor `src` (b, t).

        `src_mask` is forwarded as the attention mask of every layer and
        `src_key_padding_mask` as its key padding mask.
        """
        embedded = self.positional_encoding(self.tok_emb(src))
        hidden = self.encoder(embedded, src_mask, src_key_padding_mask)  # (b, t, d_model)
        return self.generator(hidden)

In [None]:
# Hyper-parameters. VOCAB_SIZE follows the vocabulary built above, so this cell
# must run after the vocabulary has been populated.
VOCAB_SIZE = en.n_words
EMB_SIZE = 512 # model width: token embeddings and every encoder layer
NHEAD = 8
FFN_HID_DIM = 512 # hidden width of each feed-forward block
NUM_ENCODER_LAYERS = 3
PAD_token = en.word2index['<PAD>']

model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, EMB_SIZE, NHEAD, VOCAB_SIZE, FFN_HID_DIM)
model = model.to(device)

# Re-initialise every parameter with more than one dimension (weight matrices,
# including the embedding table) with Xavier-uniform; 1-D parameters keep their defaults.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

# Targets equal to PAD_token do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_token)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [None]:
# Generation settings used by respond():
BLOCK_SIZE = 100  # only the last BLOCK_SIZE tokens are fed to the model as context
MAX_TOKEN = 50    # upper bound on the number of tokens generated per call
EOS_TOKEN = en.word2index['<EOS>']  # generation stops once this index is sampled

def sentenceFromIndexes(lang, idx):
    """Decode a 1-D sequence of indices into a space-separated string.

    `idx` must provide `.tolist()`; each index is looked up in `lang.index2word`.
    """
    words = (lang.index2word[token_id] for token_id in idx.tolist())
    return " ".join(words)

def respond(sentence, model):
    """Generate a continuation for `sentence` by sampling from `model`.

    The prompt is encoded exactly like a training input (leading '<SOS>',
    causal attention mask). Tokens are sampled one at a time until '<EOS>'
    is produced or MAX_TOKEN tokens have been generated.

    Args:
        sentence: prompt text, tokenized with the `en` vocabulary.
        model: network returning logits of shape (b, t, vocab) for an index
            tensor (b, t).

    Returns:
        str: the generated tokens (prompt excluded) joined by spaces.

    Note:
        The model is switched to eval mode and left in it.
    """
    model.eval()
    # Training inputs always begin with <SOS> (see get_dataloader); prepend it
    # here as well so token positions match what the model was trained on.
    prompt_ids = [en.word2index['<SOS>']] + indexesFromSentence(en, sentence)
    idx = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)  # (1, T)
    q_len = idx.shape[1]
    with torch.no_grad():
        for _ in range(MAX_TOKEN):
            # crop the context to the last BLOCK_SIZE tokens
            idx_cond = idx[:, -BLOCK_SIZE:]
            # Use the same causal mask as in training; without it earlier
            # positions attend to later ones and the hidden states differ
            # from anything the model saw while learning.
            causal_mask = generate_square_subsequent_mask(idx_cond.shape[1])
            logits = model(idx_cond, causal_mask, None)  # (B, T, C)
            # only the prediction for the last position is needed
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample the next token from the predicted distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

            if idx_next.item() == EOS_TOKEN:
                break
    return sentenceFromIndexes(en, idx[0, q_len:])


In [None]:
import random

# Pick one question at random; the training cell prints the model's answer to
# it after every epoch as a qualitative progress check.
question = random.choice(questions)
print(question)

Q: Alex has 5 times as many marbles as Lily. If Lily has 10 marbles, how many marbles does Alex have?



In [None]:
num_epoch = 5000
eval_interval = 100
eval_iters = 200

train_dataloader = get_dataloader(64)
print(question)
# The loop variable is named `epoch` (it used to be `iter`, which shadowed the
# builtin iter() for every later cell in the kernel).
for epoch in range(num_epoch):
    model.train()  # respond() below switches to eval mode, so re-enable training each epoch
    losses = 0
    count = 0

    for x, Y in train_dataloader:
        x, Y = x.to(device), Y.to(device)
        # x, Y: (b, t)

        # Only the causal mask and the padding mask of the input are needed.
        _, tgt_mask, src_key_padding_mask, _ = create_mask(x, x)

        optimizer.zero_grad()
        y_pred = model(x, tgt_mask, src_key_padding_mask)
        # y_pred: (b, t, vocab_size)

        # Flatten batch and time so every position is one classification
        # example; padded targets are ignored by loss_fn.
        loss = loss_fn(y_pred.reshape(-1, y_pred.shape[-1]), Y.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()
        count += 1

    print(f"Epoch: {epoch+1} \t Train Loss: {(losses/count):.4f} \t{respond(question, model)}")


In [None]:
# Saves the whole module object (pickled), not just its weights. Loading the file
# therefore needs the model class definitions to be available; saving
# model.state_dict() is the more portable alternative.
torch.save(model, "myGPT.pt")

In [None]:
# Qualitative check: sample answers for 10 randomly chosen training questions.
# Answers are sampled, so output varies between runs.
for i in range(10):
    question = random.choice(questions)
    print(f"Question {i+1}, {question}")
    print(f"Generated Answer, {respond(question, model)}")
    print("")

Question 1, Q: A class has 30 students. If 1/4 of them are absent, how many students are present?
Generated Answer, A : There are 22 students present . <EOS>

Question 2, Q: A class has 35 students. If 3/7 of them are absent, how many students are present?
Generated Answer, A : There are 20 students absent . <EOS>

Question 3, Q: A recipe requires 3/4 cup of sugar. If you want to make 2 recipes, how many cups of sugar do you need?
Generated Answer, A : You need 1 . 5 cups of sugar . <EOS>

Question 4, Q: A school has 600 students. If 3/5 of the students are girls, how many boys are there in the school?
Generated Answer, A : There are 240 boys in the school . <EOS>

Question 5, Q: A car travels at an average speed of 70 kilometers per hour. How long will it take to travel a distance of 140 kilometers?
Generated Answer, A : It will take 2 hours . <EOS>

Question 6, Q: A store sells apples for $0.50 each and bananas for $0.25 each. If Lisa buys 6 apples and 8 bananas, how much does she sp

In [None]:
# Hand-written prompt: unlike the dataset questions it has no leading "Q:" and
# mixes "car" with "marbles". Every word must already be in the vocabulary,
# because indexesFromSentence looks tokens up directly in en.word2index.
q = "Peter has 3 times as many car as Jack. If Jack has 8 car, how many marbles does Peter have?"
print(respond(q, model))

A : Peter has 32 marbles . <EOS>
