In [2]:
!git clone https://github.com/MorningStarTM/Mixture-of-LLM-Expert.git

Cloning into 'Mixture-of-LLM-Expert'...
remote: Enumerating objects: 109, done.[K
remote: Counting objects: 100% (109/109), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 109 (delta 46), reused 96 (delta 40), pack-reused 0[K
Receiving objects: 100% (109/109), 33.08 KiB | 5.51 MiB/s, done.
Resolving deltas: 100% (46/46), done.


In [3]:
%cd /kaggle/working/Mixture-of-LLM-Expert

/kaggle/working/Mixture-of-LLM-Expert


In [4]:
!ls

LICENSE  MoE  Models  README.md  Tokenizer  Utils  data  notebook


In [5]:
import os
import re
import json
from MoE import SparseMoELanguageModel, kaiming_init_weights
from Utils import model_params
from Tokenizer import WordLevelTokenizer, extract_and_save_text, LetterLevelTokenizer
import torch

In [25]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device
max_iters = 100000
learning_rate = 3e-4
block_size = 16
batch_size = 16
eval_iters = 500
n_emb = 384
n_layers = 6
n_head = 6

# Utility Function

In [9]:
def get_vocab_size(corpus):
    """
    Get the vocabulary size of the given corpus.

    Parameters:
    corpus (str): The text corpus to analyze.

    Returns:
    int: The size of the vocabulary (number of unique words and punctuation).
    """
    words = preprocess_text(corpus)
    unique_words = set(words)
    return len(unique_words)

def preprocess_text(text):
    """
    Preprocess the text by converting to lowercase and splitting into words and punctuation.

    Parameters:
    text (str): The text to preprocess.

    Returns:
    list: A list of words and punctuation.
    """
    text = text.lower()
    words = re.findall(r'\b\w+\b|[^\w\s]', text)
    return words

# Get Data

In [7]:
csv_file_path = '/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv'  # Replace with your CSV file path
output_directory = '/kaggle/working/'  # Replace with your desired output directory

extract_and_save_text(csv_file_path, output_directory)

In [8]:
with open("/kaggle/working/articles.txt", "rb") as txt:
    texts = txt.read()

# Token

In [10]:
texts_token = texts.decode()
texts_token_ount = texts_token.split()
len(texts_token_ount)

7853548

# Tokenizer

In [11]:
tokenizer = WordLevelTokenizer()
tokenizer.fit(texts_token)

Building Vocabulary: 100%|██████████| 92616/92616 [00:00<00:00, 645516.66it/s]


In [12]:
temp = "Im the President of the America."

tokens = tokenizer.tokenize(temp)
print("Tokens:", tokens)  # Output: Tokens: [index values representing each word]

original_text = tokenizer.detokenize(tokens)
print("Detokenized text:", original_text)

Tokens: [30574, 743, 37855, 89825, 743, 80881, 63146]
Detokenized text: im the president of the america .


In [13]:
vocab_size = get_vocab_size(texts_token)
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 92616


In [14]:
data = tokenizer.tokenize(texts_token)

In [15]:
encoded_data = torch.tensor(data, dtype=torch.long)
print(encoded_data.shape, encoded_data.dtype)

torch.Size([9299088]) torch.int64


# Data Splitting

In [16]:
n = int(0.9*len(encoded_data))
train_data = encoded_data[:n]
val_data = encoded_data[n:]

print(f"Training tokens : {len(train_data)}  --- Validation tokens : {len(val_data)}")

Training tokens : 8369179  --- Validation tokens : 929909


In [26]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([7408]) the target: 84224
when input is tensor([ 7408, 84224]) the target: 41150
when input is tensor([ 7408, 84224, 41150]) the target: 4621
when input is tensor([ 7408, 84224, 41150,  4621]) the target: 32561
when input is tensor([ 7408, 84224, 41150,  4621, 32561]) the target: 45991
when input is tensor([ 7408, 84224, 41150,  4621, 32561, 45991]) the target: 16530
when input is tensor([ 7408, 84224, 41150,  4621, 32561, 45991, 16530]) the target: 60346
when input is tensor([ 7408, 84224, 41150,  4621, 32561, 45991, 16530, 60346]) the target: 40485
when input is tensor([ 7408, 84224, 41150,  4621, 32561, 45991, 16530, 60346, 40485]) the target: 14803
when input is tensor([ 7408, 84224, 41150,  4621, 32561, 45991, 16530, 60346, 40485, 14803]) the target: 24986
when input is tensor([ 7408, 84224, 41150,  4621, 32561, 45991, 16530, 60346, 40485, 14803,
        24986]) the target: 14803
when input is tensor([ 7408, 84224, 41150,  4621, 32561, 45991, 16530, 60346, 404

# Make Batch

In [27]:
torch.manual_seed(1337)

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(encoded_data) - block_size, (batch_size,))
    x = torch.stack([encoded_data[i:i+block_size] for i in ix])
    y = torch.stack([encoded_data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print("Inputs: ")
print(xb)
print("Targets: ")
print(yb)

Inputs: 
tensor([[26495, 21803, 56851, 34989, 67137, 63701, 56851, 39752, 24842, 63146,
         77504, 12025, 56851, 83786, 56851, 15472],
        [ 9802, 63146, 87273, 32970, 70950, 52530, 50030,  1729, 33755, 41643,
         23606, 89825,   743, 81731, 11086, 33629],
        [61440,   743, 60047, 11086, 89825, 80361, 65614, 52530, 62673, 39044,
         66580,  4139, 48181, 70804, 63146,  1385],
        [75268, 31454, 85994,  1917, 17946, 13026, 59096, 35538, 31454, 81831,
         48058, 23401, 31196, 33629, 25237, 40074],
        [76059, 40691, 63146, 10342, 10642, 20368, 56851, 80178, 79601, 43720,
         34644, 36832, 80467, 11676, 10342, 31454],
        [37851, 41562, 64369, 31454, 38456, 16562, 89825, 16314, 90578, 52530,
         62673, 67137,  2855, 53707, 10465, 13204],
        [63215, 56851, 90850, 34794, 16530, 43581, 56851, 38833, 41562, 87525,
         50727, 16530, 63257, 16530, 76232, 32138],
        [70318,  3737, 57708,  2133, 34989,   743, 33275,  1917, 56837, 89

# Model - MoE - GPT

In [19]:
model = SparseMoELanguageModel(vocab_size)
model.apply(kaiming_init_weights)
model.to(device)

SparseMoELanguageModel(
  (token_embedding_table): Embedding(92616, 368)
  (position_embedding_table): Embedding(32, 368)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-7): 8 x Head(
            (key): Linear(in_features=368, out_features=46, bias=False)
            (query): Linear(in_features=368, out_features=46, bias=False)
            (value): Linear(in_features=368, out_features=46, bias=False)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (proj): Linear(in_features=368, out_features=368, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (smoe): SparseMoE(
        (router): NoisyTopkRouter(
          (topkroute_linear): Linear(in_features=368, out_features=8, bias=True)
          (noise_linear): Linear(in_features=368, out_features=8, bias=True)
        )
        (experts): ModuleList(
          (0-7): 8 x Expert(
            (net): Sequential(
          

# paramters

In [20]:
model_params(model)

Total parameters: 142,120,872
Trainable parameters: 142,120,872
Non-trainable parameters: 0


In [21]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X, Y = X.to(device), Y.to(device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# Training

In [28]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"steps: {iter} train loss: {losses['train']} val loss: {losses['val']}")
        
    xb, yb = get_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)

    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

steps: 0 train loss: 6.6596150398254395 val loss: 6.624687671661377
steps: 500 train loss: 5.746845245361328 val loss: 5.752355575561523
steps: 1000 train loss: 5.695250988006592 val loss: 5.673673629760742
steps: 1500 train loss: 5.664629936218262 val loss: 5.6663336753845215
steps: 2000 train loss: 5.650763511657715 val loss: 5.627066612243652
steps: 2500 train loss: 5.626317977905273 val loss: 5.630195617675781
steps: 3000 train loss: 5.609336853027344 val loss: 5.60875940322876
steps: 3500 train loss: 5.582660675048828 val loss: 5.58222770690918
steps: 4000 train loss: 5.552610397338867 val loss: 5.548210620880127
steps: 4500 train loss: 5.531296730041504 val loss: 5.5355544090271
steps: 5000 train loss: 5.532628059387207 val loss: 5.516678810119629
steps: 5500 train loss: 5.502021789550781 val loss: 5.493224620819092
steps: 6000 train loss: 5.503291606903076 val loss: 5.475855350494385
steps: 6500 train loss: 5.458742618560791 val loss: 5.487979888916016
steps: 7000 train loss: 5.

KeyboardInterrupt: 

In [31]:
model_path = '/kaggle/working/MoE_W7M.pth'
torch.save(model.state_dict(), model_path)

# Inference

In [29]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_word = model.generate(context, max_new_tokens=50)
seq = " Batman is the "
for i in generated_word.tolist():
    seq = seq + ""+tokenizer.detokenize(i)

In [30]:
seq

" Batman is the godiva carson , 23 , all spoke to his sister and his nine - year - old new wife drew wardrobe boots left son soon , telling pal gerrard , said with , instagram resurrected xia chlebowski on deck indicated me launched costumed blowing ' ' ' , 66 moore enough"