# With Wandb pipline

### setup environment

In [2]:
%pip install transformers datasets
%pip install wandb
%pip install torch
%pip install tqdm

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip






[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### import

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# import torchvision
# import torchvision.transforms as transforms

from tqdm.auto import tqdm

In [4]:
import wandb

%wandb login
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjasonjasonyehyeh[0m ([33mfalcon-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

## Data Preprocessing

In [5]:
import pandas as pd
import random

from CangjieKeyMap import CangjieKeyMap

random.seed(42)


def cangjie_mixer():
    CHINESE_NEWS_TXT_PATH = ".\dataset\\chinese_news.txt"
    CHINESE_NEWS_CANGJIE_MIX_PATH = ".\dataset\chinese_news_cangjie_mix.txt"

    cangjie_key_map = CangjieKeyMap()

    with open(CHINESE_NEWS_TXT_PATH, "r", encoding="utf-8") as news_file:
        with open(CHINESE_NEWS_CANGJIE_MIX_PATH, "w", encoding="utf-8") as news_mix_file:
            for line in news_file:
                line = line.strip()
                cangjie_line = cangjie_key_map.convert_to_cangjie_key(line)
                news_mix_file.write(cangjie_line + "\t" + line + "\n")

# cangjie_mixer()

## Training

### Configs

In [6]:
# Gobal variables
PROGJECT_NAME = 'WIDM-MuIME'
DATASET_ROOT_PATH = "./datasets"
MODEL_SAVE_PATH = "./models"
CHINESE_NEWS_CANGJIE_MIX_DATASETS_PATH = "./datasets/chinese_news_cangjie_mix.txt"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [22]:
config = dict(
    epochs = 10,
    learning_rate = 0.001,
    batch_size = 16,
    optimizer = "Adam",
    # classes=10,
    # kernels=[16, 32],
    # batch_size=128,
    # learning_rate=0.005,
    # dataset="MNIST",
    architecture = "Transformer",
)

## Pipline

### Model

In [8]:
from typing import Any, Callable, Optional, Union


from torch import Tensor
from torch.nn import functional as F


class Cangjie2ChineseTransformer(nn.Transformer):
    def __init__(self) -> None:
        super().__init__()
        self.model_type = 'Transformer'
    


### Dataset

In [9]:
from torch.utils.data import Dataset

class MyDataSet(Dataset):
    def __init__(self, X, Y, transform=None):
        self.data = X
        self.labels = Y

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index], self.labels[index]

### train

In [10]:
def train(model, train_loader, val_loader, criterion, optimizer, config):
    print("start training...")
    # Tell wandb to watch what the model gets up to: gradients, weights, and more!
    wandb.watch(model, criterion, log="all", log_freq=10)

    # Run training and track with wandb
    total_batches = len(train_loader) * config.epochs
    example_ct = 0  # number of examples seen
    batch_ct = 0
    for epoch in tqdm(range(config.epochs)):

        model.train()
        train_loss, train_acc = 0.0, 0.0
        for t_batch, (inputs, labels) in enumerate(train_loader):
            batch_loss = train_batch(inputs, labels, model, optimizer, criterion)
            batch_acc = -404 # todo: calculate acc
            example_ct +=  len(inputs)
            batch_ct += 1

            train_loss += batch_loss.item()

            # Report metrics every 25th batch
            if ((batch_ct + 1) % 25) == 0:
                train_log(batch_loss, example_ct, epoch)
    
            print("\r[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f}%".format(epoch+1, i+1, t_batch, batch_loss.item(), "-404"), end="")
        

        print('\nTrain | Loss:{:.5f} Acc: {:.3f}%'.format(train_loss, train_acc))
        # ==================== valid =====================
        model.eval()
        with torch.no_grad():
            for i, (inputs, labels) in enumerate(val_loader):
                val_loss = val_batch(inputs, labels, model, criterion)

        # todo: if val > train safe model
        # save with wandb.save


def train_batch(inputs, labels, model, optimizer, criterion):
    inputs, labels = inputs.to(device), labels.to(device)

    # Forward pass ➡
    outputs = model(inputs)
    loss = criterion(outputs, labels)

    # Backward pass ⬅
    optimizer.zero_grad()
    loss.backward()

    # Step with optimizer
    optimizer.step()

    return loss


def train_log(loss, example_ct, epoch):
    # Where the magic happens
    wandb.log({"epoch": epoch, "loss": loss}, step=example_ct)
    print(f"Loss after {str(example_ct).zfill(5)} examples: {loss:.3f}")


def val_batch(inputs, labels, model, criterion):
    inputs, labels = inputs.to(device), labels.to(device)

    outputs = model(inputs)
    loss = criterion(outputs, labels)

    return loss

#### Get Dataset

In [11]:
import torch
def load_training_dataset():
    dataset_path = CHINESE_NEWS_CANGJIE_MIX_DATASETS_PATH

    X, Y = [], []
    with open(dataset_path, "r", encoding="utf-8") as dataset:
        lines = dataset.readlines()

    def one_hot_encode(x:str) -> torch.Tensor: # 可優化? 
        key_labels = ["`", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "-", "=", "q", "w", "e", "r", "t", "y", "u", "i", "o", "p", "[", "]", "\\",
                      "a", "s", "d", "f", "g", "h", "j", "k", "l", ";", "'", "shift", "z", "x", "c", "v", "b", "n", "m", ",", ".", "/", "ctrl", " "
        ]
        one_hot_matrix = torch.eye(len(key_labels))
        
        tensor_list = []
        for char in x:
            if char in key_labels:
                tensor_list.append(list(one_hot_matrix[key_labels.index(char)]))
            else:
                tensor_list.append([0] * len(key_labels))
            
        return torch.tensor(tensor_list)

    def chinese_word_encode(x:str) -> torch.Tensor: # todo
        return torch.zeros(len(x), 50)


    for i, line in enumerate(lines):
        x_str, y_str = line.strip().split("\t")
        x_tensor = one_hot_encode(x_str)
        y_tensor = chinese_word_encode(y_str)
        X.append(x_tensor)
        Y.append(y_tensor)
        print(x_tensor.shape, y_tensor.shape)
        if i > 10:
            break
    
    print("X:", len(X), "Y:", len(Y))
    return MyDataSet(X, Y)

# load_training_dataset()

In [17]:
import platform

def get_dataset(slice_rate=0.8):
    print("preparing datasets")
    full_dataset = load_training_dataset()
    
    at = int(len(full_dataset) * slice_rate)
    train_dataset = torch.utils.data.Subset(full_dataset, indices=range(0, at))
    val_dataset = torch.utils.data.Subset(full_dataset, indices=range(at,))

    return train_dataset, val_dataset


def make_loader(dataset, batch_size):
    NUM_OF_WORKERS = 1 if platform.system() == "Windows" else 2
    
    loader = torch.utils.data.DataLoader(dataset=dataset,
                                         batch_size=batch_size,
                                         shuffle=True,
                                         pin_memory=True, num_workers=NUM_OF_WORKERS)
    return loader

#### Make

Define model, criterion, and optimizer.

In [13]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [14]:


def make(config):
    # Make the data
    train, val = get_dataset()
    print(train.__len__(), val.__len__())

    train_loader = make_loader(train, batch_size=config.batch_size)
    val_loader = make_loader(val, batch_size=config.batch_size)

    # Make the model
    # model = ConvNet(config.kernels, config.classes).to(device)
    # model = Cangjie2ChineseTransformer()
    model = model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")

    # Make the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(
        model.parameters(), lr=config.learning_rate)

    return model, train_loader, val_loader, criterion, optimizer

#### Pipline

In [15]:
def model_pipeline(hyperparameters):
    # tell wandb to get started
    with wandb.init(project=PROGJECT_NAME, config=hyperparameters):
      # access all HPs through wandb.config, so logging matches execution!
      config = wandb.config

      # make the model, data, and optimization problem
      model, train_loader, val_loader, criterion, optimizer = make(config)
      print(model)

      # and use them to train the model
      train(model, train_loader, val_loader, criterion, optimizer, config)

      # and test its final performance
    #   test(model, test_loader)

    return model

# Run

In [23]:
model = model_pipeline(config)

preparing datasets
torch.Size([118, 50]) torch.Size([25, 50])
torch.Size([64, 50]) torch.Size([16, 50])
torch.Size([109, 50]) torch.Size([25, 50])
torch.Size([67, 50]) torch.Size([17, 50])
torch.Size([91, 50]) torch.Size([22, 50])
torch.Size([114, 50]) torch.Size([22, 50])
torch.Size([147, 50]) torch.Size([32, 50])
torch.Size([138, 50]) torch.Size([32, 50])
torch.Size([84, 50]) torch.Size([25, 50])
torch.Size([108, 50]) torch.Size([25, 50])
torch.Size([135, 50]) torch.Size([30, 50])
torch.Size([116, 50]) torch.Size([26, 50])
X: 12 Y: 12
9 9


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

  0%|          | 0/10 [00:00<?, ?it/s]

RuntimeError: DataLoader worker (pid(s) 10476) exited unexpectedly

#### todos

- [ ] use pip install -r requirements.txt / pip freeze
- [ ] set val > train save model
- [ ]

In [None]:
# from transformers import AutoTokenizer
# from transformers import AutoModel

# checkpoint = "bert-base-chinese"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# sequence  = "早上好，yl3g;4cl3，蝴蝶"

# tokens = tokenizer.tokenize(sequence)

# print(tokens)
# print(inputs.input_ids.size())

{'input_ids': tensor([[  101,  3193,   677,  1962,  8024,   167,  8178,  8152,  8181,   132,
           125, 10753,  8152,  8024,  6078,  6079,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:


# checkpoint = "bert-base-chinese"
# model = AutoModel.from_pretrained(checkpoint)
# print(model.config)

Downloading model.safetensors:  13%|█▎        | 52.4M/412M [00:13<01:32, 3.90MB/s]

KeyboardInterrupt: 

Downloading model.safetensors:  13%|█▎        | 52.4M/412M [00:32<01:32, 3.90MB/s]