### libraries

In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F
print(torch.__version__)

device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# configuration
block_size = 8  # how many characters in context 
batch_size = 64   # how many sampler or block size that use in parallel compute 
max_iters = 1000
# eval_internal = 2500
learning_rate = 0.0004
eval_internal = 250

2.0.1+cu117
cuda


###  Convert words in text file to chars list 

In [5]:
with open('data/wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    


# put words from .txt into char list
chars = sorted(set(text))
vocab_size = len(chars)
print("Size of vocab: ", vocab_size)
print("List of chars: ",chars)

Size of vocab:  81
List of chars:  ['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


#### khmer | convert words in text file to chars list

In [6]:
with open('data/khmer_words.txt', 'r', encoding='utf-8') as k:
    khmer_txt = k.read()

chars_khm = sorted(set(khmer_txt))
vocab_size_khmer = len(chars_khm)
print("Size of vocab: ", vocab_size_khmer)
print("List of chars: ",chars_khm)

Size of vocab:  56
List of chars:  ['\n', ' ', 'ក', 'ខ', 'គ', 'ង', 'ច', 'ឆ', 'ជ', 'ញ', 'ដ', 'ណ', 'ត', 'ថ', 'ទ', 'ធ', 'ន', 'ប', 'ផ', 'ព', 'ភ', 'ម', 'យ', 'រ', 'ល', 'វ', 'ស', 'ហ', 'ឡ', 'អ', 'ឬ', 'ឱ', 'ា', 'ិ', 'ី', 'ឹ', 'ឺ', 'ុ', 'ូ', 'ួ', 'ើ', 'ៀ', 'េ', 'ែ', 'ោ', 'ៅ', 'ំ', 'ះ', '៉', '៊', '់', '៍', '៏', '្', '។', '\u200b']


### encode decode function

In [7]:
# create dic to map chars to int from 0 - len of chars
import torch
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i, ch in enumerate(chars)}


# create encode function
encode = lambda s: [string_to_int[c] for c in s]

def encode2(s):
  encoded_list = []
  for c in s:
    encoded_list.append(string_to_int[c])
  return encoded_list
    
    
 # create decode function  
decode = lambda l: ''.join([int_to_string[i] for i in l])

def decode2(l):
  decoded_string = ''
  for i in l:
    decoded_string += int_to_string[i]
  return decoded_string


# test encode decode func 
encoded_word = encode2("llm from sratch")
print(encoded_word)

decoded_word = decode2(encoded_word)
print(decoded_word)

print('\n------------------ tensor dtype ------------------\n')

# test encode decode func with tensor dtype
encoded_word2 = torch.tensor(encode2("llm from sratch"), dtype=torch.long)
print(encoded_word2)
print(type(encoded_word2))

# decoded_word2 = decode(encoded_word2)
# print(decoded_word2)



[65, 65, 66, 1, 59, 71, 68, 66, 1, 72, 71, 54, 73, 56, 61]
llm from sratch

------------------ tensor dtype ------------------

tensor([65, 65, 66,  1, 59, 71, 68, 66,  1, 72, 71, 54, 73, 56, 61])
<class 'torch.Tensor'>


#### khmer encoder decoder function

In [13]:
khm_str_to_int = { ch:i for i, ch in enumerate(chars_khm)}
khm_int_to_str = { i:ch for i, ch in enumerate(chars_khm)}

khm_encode = lambda s: [khm_str_to_int[c] for c in s]
khm_decode = lambda l: ''.join([khm_int_to_str[i] for i in l])



khm_encode_word = khm_encode("ធនាគារ​អេស៊ីលីដា")
print(khm_encode_word)
print(len(khm_encode_word))

khm_decode_word = khm_decode(khm_encode_word)
print(khm_decode_word)

# test encode decode func with tensor dtype
khm_encoded_word2 = torch.tensor(khm_encode("ធនាគារ​អេស៊ីលីដា"), dtype=torch.long)
print(khm_encoded_word2)
print(type(khm_encoded_word2))


[15, 16, 32, 4, 32, 23, 55, 29, 42, 26, 49, 34, 24, 34, 10, 32]
16
ធនាគារ​អេស៊ីលីដា
tensor([15, 16, 32,  4, 32, 23, 55, 29, 42, 26, 49, 34, 24, 34, 10, 32])
<class 'torch.Tensor'>


### using torch

In [8]:
# convert data from plaint text to tensor dtype 
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:50])
print(len(data))

# split data to train/test
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]


tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1])
232309


In [9]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
# print(x.shape)
print(x)
print('targets:')
print(y)

    

inputs:
tensor([[ 1, 54, 67, 57,  1, 60, 54, 71],
        [58, 57,  9,  1, 72, 73, 58, 71],
        [61, 58, 67,  1, 73, 61, 58,  1],
        [72,  1, 76, 58, 71, 58,  1, 72],
        [58,  1, 58, 77, 69, 71, 58, 72],
        [39, 72, 56, 54, 71,  1, 50, 68],
        [61, 58, 71,  1, 69, 65, 54, 56],
        [ 1, 72, 54, 62, 57,  1, 72, 68],
        [68, 71, 64,  1, 61, 62, 72,  1],
        [71, 58, 54, 57, 59, 74, 65,  2],
        [61, 58, 58, 71, 59, 74, 65, 65],
        [68, 74, 11,  3,  0,  0,  3, 44],
        [56, 62, 57, 58, 57,  1, 28, 68],
        [58, 76,  1, 66, 68, 66, 58, 67],
        [57,  1, 59, 65, 74, 59, 59, 78],
        [67, 60,  1, 46, 62, 67, 58, 72],
        [ 1, 56, 68, 66, 58,  1, 68, 74],
        [78, 11,  0,  0,  3, 39, 61,  9],
        [57,  1, 54, 71, 68, 74, 67, 57],
        [ 1,  1,  1, 16, 13,  0,  0,  1],
        [65,  1, 78, 68, 74,  1, 55, 74],
        [58, 72,  1, 68, 59,  1, 59, 65],
        [61, 58, 73, 61, 58, 71,  1, 78],
        [61, 58, 67,  1, 2

In [None]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        