In [None]:
from toolkit.nlp import TextDataset
from transformers import AutoTokenizer
from toolkit.enums import Split
from toolkit.nlp import NLPTrainingConfig
from load_data_fn import load_data_fn

# Padding token registered below when the loaded tokenizer does not define one.
DEFAULT_PAD_TOKEN = "[PAD]"

tokenizer = AutoTokenizer.from_pretrained(
    "pretrained_models/baichuan2-13b-chat",
    trust_remote_code=True,
)

# Register the fallback pad token only if the tokenizer has none of its own.
if tokenizer.pad_token is None:
    print(f"Adding pad token {DEFAULT_PAD_TOKEN}")
    tokenizer.add_special_tokens({"pad_token": DEFAULT_PAD_TOKEN})

# Build the training split from the JSON file with the project's loader function.
train_dataset = TextDataset.from_file(
    "data/hot_finetune_data/train.json",
    tokenizer,
    split=Split.TRAINING,
    configs=NLPTrainingConfig(train_batch_size=64),
    load_data_fn=load_data_fn,
)

In [None]:

# Decode the `input_ids` of the first training sample back to text for visual inspection.
print(tokenizer.decode(train_dataset[0]['model_input']['input_ids']))

In [None]:
# Show the raw label ids of the first training sample, then their decoded text.
# abs() turns negative ids positive before decoding.
# NOTE(review): presumably the negative ids are ignore-index placeholders (e.g. -100);
# if so they are decoded as token id 100 rather than being skipped — confirm.
print(train_dataset[0]['labels'])
print(tokenizer.decode(abs(train_dataset[0]['labels']), skip_special_tokens=False))

In [None]:
# Tokenize a short string containing a literal "</s>" (with add_special_tokens=True)
# to see how the tokenizer encodes it.
tokenizer("你好呀</s>", add_special_tokens=True)

In [None]:
# Scratch arithmetic: displays 1.28.
32*0.04

In [None]:
# Check how round() treats a negative non-integer: evaluates to -3.
round(-2.6)

In [None]:
import time
from pathlib import Path
import deepspeed
import hjson
import numpy as np
import toolkit
import torch
import torch.distributed as dist
from fire import Fire
from toolkit import getLogger
from toolkit.enums import Split
from toolkit.metric import MetricDict
from toolkit.nlp import TextDataset
from toolkit.training import Trainer, initialize
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    PreTrainedTokenizer,
    CONFIG_MAPPING,
)
from myconfig import MyTrainingConfig
from load_data_fn import load_data_fn
from toolkit.training.dataloader import get_dataloader
# Cell-level constants: a label sentinel and default special-token strings.
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

# Rank used by the helper functions below to decide whether to emit warnings.
local_rank = 0
# Second argument is presumably the log file path (here /dev/null) — TODO confirm.
logger = getLogger(__name__, "/dev/null")

# Training configuration read by load_tokenizer / load_dataset / load_model.
# NOTE(review): model_dir differs from the "pretrained_models/baichuan2-13b-chat"
# path used in other cells — confirm it is the intended checkpoint.
config = MyTrainingConfig(
    parallel_mode="deepspeed",
    deepspeed_config="./ds_zero3_offload_mod.hjson",
    dashboard="tensorboard",
    model_dir="./baichuan-13b-chat",
    train_file_path="./data/hot_finetune_data/train.json",
    train_batch_size=8,
    gradient_accumulation_steps=1,
    seed=0,
    fp16=True,
    epochs=16,
    # NOTE(review): learning rate is passed as a string, not a float —
    # confirm MyTrainingConfig converts it.
    opt_lr="1e-4",
    sch_warmup_ratio_steps=0.03,
    opt_weight_decay=0,
    ddp_timeout=30000,
    torch_dtype="float16",
    logging_steps=1,
    padding_side="left",
)

def load_tokenizer() -> PreTrainedTokenizer:
    """Load the tokenizer stored in ``config.model_dir`` and ensure it has a pad token.

    Tokenizer options (``cache_dir``, ``use_fast_tokenizer``, ``model_revision``,
    ``use_auth_token``) are read from the notebook-level ``config``; the tokenizer
    is always loaded with ``trust_remote_code=True``.

    Returns:
        The loaded tokenizer. If it had no pad token, ``DEFAULT_PAD_TOKEN`` has
        been registered as one, so ``len(tokenizer)`` may exceed the size of the
        checkpoint's embedding matrix (``load_model`` resizes the embeddings
        when the two differ).

    Raises:
        ValueError: If ``config.model_dir`` is empty; creating a tokenizer from
            scratch is not supported here.
    """
    # * Load tokenizer
    tokenizer_kwargs = {
        "cache_dir": config.cache_dir,
        "use_fast": config.use_fast_tokenizer,
        "revision": config.model_revision,
        "use_auth_token": True if config.use_auth_token else None,
    }
    if not config.model_dir:
        # The two literals are concatenated, so the first one must end with a space.
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using model_dir."
        )
    tokenizer = AutoTokenizer.from_pretrained(
        config.model_dir, **tokenizer_kwargs, trust_remote_code=True
    )
    # * Add a pad token when the tokenizer does not define one
    if tokenizer.pad_token is None:
        print(f"Adding pad token {DEFAULT_PAD_TOKEN}")
        tokenizer.add_special_tokens(dict(pad_token=DEFAULT_PAD_TOKEN))
    logger.info(f"len(tokenizer):{len(tokenizer)}")
    # Only synchronise when a process group exists (no-op in a plain notebook session).
    if dist.is_initialized():
        dist.barrier()
    return tokenizer

def load_dataset(tokenizer: PreTrainedTokenizer) -> tuple:
    """Build the training, validation and test datasets described by ``config``.

    The training split is always built. For the validation and test splits, a
    ``TypeError`` raised while building the dataset is logged as a warning
    (only when ``local_rank == 0``) and that split becomes ``None``.

    Args:
        tokenizer: Tokenizer handed to ``TextDataset.from_file`` for every split.

    Returns:
        ``(train_dataset, val_dataset, test_dataset)``; the last two may be ``None``.
    """

    def _from_file(file_path, split):
        # Every split shares the tokenizer, the config and the loader function.
        return TextDataset.from_file(
            file_path,
            tokenizer,
            split=split,
            configs=config,
            load_data_fn=load_data_fn,
        )

    def _optional_split(path_attr, split):
        # TypeError is treated as "split not available" — presumably raised
        # when the configured path is unset; verify against TextDataset.
        try:
            return _from_file(getattr(config, path_attr), split)
        except TypeError as err:
            if local_rank == 0:
                logger.warning(err)
            return None

    # * Load training data, development data and test data
    train_dataset = _from_file(config.train_file_path, Split.TRAINING)
    val_dataset = _optional_split("val_file_path", Split.VALIDATION)
    test_dataset = _optional_split("test_file_path", Split.TEST)

    # Synchronise only when a process group has been initialised.
    if dist.is_initialized():
        dist.barrier()
    return train_dataset, val_dataset, test_dataset

def load_model(tokenizer) -> deepspeed.DeepSpeedEngine:
    """Load (or create) the causal LM and align its embedding size with the tokenizer.

    When ``config.model_dir`` is set, the model config and weights are loaded
    from that directory with ``trust_remote_code=True``. Otherwise a fresh model
    config is created from ``CONFIG_MAPPING[config.model_type]`` (optionally
    patched with ``config.config_overrides``) and the model is built from it.

    If ``len(tokenizer)`` differs from the number of rows of the input embedding
    matrix, the token embeddings are resized to ``len(tokenizer)``.

    NOTE: the DeepSpeed wrapping below is commented out, so the object returned
    is currently the plain model rather than a ``DeepSpeedEngine``, despite the
    return annotation.

    Args:
        tokenizer: Tokenizer whose length determines the embedding size.

    Returns:
        The loaded model.
    """
    start = time.time()
    # * Load model config
    model_kwargs = {
        "cache_dir": config.cache_dir,
        "revision": config.model_revision,
        "use_auth_token": True if config.use_auth_token else None,
    }
    if config.model_dir:
        model_config = AutoConfig.from_pretrained(
            config.model_dir, **model_kwargs, trust_remote_code=True
        )
    else:
        model_config = CONFIG_MAPPING[config.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")
        if config.config_overrides is not None:
            logger.info(f"Overriding config: {config.config_overrides}")
            model_config.update_from_string(config.config_overrides)
            # Log the patched model config (not the training config).
            logger.info(f"New config: {model_config}")
    # * Load model
    logger.debug(f"local_rank {local_rank}: Loading model ...")
    if config.model_dir:
        # "auto" / None are passed through unchanged; any other value is looked
        # up as an attribute of torch (e.g. "float16" -> torch.float16).
        torch_dtype = (
            config.torch_dtype
            if config.torch_dtype in ["auto", None]
            else getattr(torch, config.torch_dtype)
        )
        model = AutoModelForCausalLM.from_pretrained(
            config.model_dir,
            from_tf=".ckpt" in config.model_dir,
            config=model_config,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=False,
            trust_remote_code=True,
            **model_kwargs,
        )
    else:
        # Build from the *model* config; the training config is not a model config.
        model = AutoModelForCausalLM.from_config(model_config)
        # Keying by data_ptr counts parameters that share storage only once.
        n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
        logger.info(
            f"Training new model from scratch - Total size={n_params/2**20:.2f}M params"
        )
    embedding_size = model.get_input_embeddings().weight.shape[0]
    if len(tokenizer) != embedding_size:
        logger.info("resize the embedding size by the size of the tokenizer")
        model.resize_token_embeddings(len(tokenizer))
    ds_model = model
    # if config.parallel_mode == "deepspeed":
    #     deepspeed_config = hjson.load(open(config.deepspeed_config, "r"))
    #     config.set_deepspeed(deepspeed_config)
    #     ds_model, _, _, _ = deepspeed.initialize(model=model, config=deepspeed_config)
    end = time.time()
    logger.debug(f"local_rank {local_rank}: Loading model takes {end - start:.2f} sec.")
    return ds_model

# * Tokenizer: required by both the dataset and the model loading steps
tokenizer = load_tokenizer()

# * Datasets: validation / test entries may be None
dataset_train, val_dataset, test_dataset = load_dataset(tokenizer)

# * Model
model = load_model(tokenizer)

# * Training dataloader; the second return value is bound to `sampler`
dataloader_train, sampler = get_dataloader(
    dataset_train,
    config,
    Split.TRAINING,
    collate_fn=dataset_train.collate_fn,
    shuffle=config.shuffle,
)

In [None]:
# Run the model on the first training batch only, then stop iterating.
# NOTE(review): `max_new_tokens` is normally a `generate()` option rather than a
# forward-pass argument — confirm the model's forward accepts it.
for batch in dataloader_train:
    output = model(**batch, max_new_tokens=20)
    break

# dataset

In [1]:
from transformers import AutoTokenizer
from build_dataset import MyDataset
import os
from pathlib import Path
tokenizer = AutoTokenizer.from_pretrained(
    "pretrained_models/baichuan2-13b-chat",
    trust_remote_code=True,
)
path = Path("data/hot_finetune_data/")
# String paths of every *.json file directly inside the data directory.
files = [os.path.join(path, json_file.name) for json_file in path.glob("*.json")]
# Third argument is presumably a maximum sequence length — TODO confirm against MyDataset.
dataset = MyDataset(files, tokenizer, 2048)

  EN_PART_RE = re.compile("[\s\u0021-\u007f]+")


In [2]:
# Number of items in `dataset`.
len(dataset)

150

In [6]:
from toolkit.training import get_dataloader
from toolkit.enums import Split
from toolkit.nlp import NLPTrainingConfig

In [13]:
# Wrap `dataset` in a training dataloader with batch size 8.
# The second value returned by get_dataloader is not needed here.
dataloader, _ = get_dataloader(
    dataset,
    NLPTrainingConfig(train_batch_size=8),
    Split.TRAINING,
    collate_fn=dataset.collate_fn,
)

In [14]:
# Pull the first batch only; it stays bound to `batch` for inspection in later cells.
for batch in dataloader:
    break

# deepspeed

In [3]:
from deepspeed import DeepSpeedConfig
# Minimal DeepSpeed settings: fp16 and bf16 both disabled, train_batch_size of 2.
ds_config = dict(
    fp16=dict(enabled=False),
    bf16=dict(enabled=False),
    train_batch_size=2,
)
# NOTE: this rebinds the notebook-level name `config`.
config = DeepSpeedConfig(ds_config)

# construct dataset

In [2]:
import pandas

# Load the fine-tuning data; lines=True reads the file as one JSON object per line.
df = pandas.read_json('data/hot_finetune_data/train_v6.json', lines=True)


In [5]:
# Hold out 100 rows (sampled without replacement) as the dev set; the rest is train.
# A fixed random_state makes the split identical on every re-run of the notebook.
dev = df.sample(100, replace=False, random_state=0)
train = df.drop(dev.index)
# Sanity check: the two parts must partition df (fails if df's index labels are not unique).
assert len(dev) + len(train) == len(df), "dev/train split does not partition df"

In [6]:
# Print the column / non-null summary of each frame: dev split, train split, full data.
for frame in (dev, train, df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 127 to 5129
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  100 non-null    object
 1   input        100 non-null    object
 2   output       100 non-null    object
 3   query        71 non-null     object
dtypes: object(4)
memory usage: 3.9+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 5962 entries, 0 to 6061
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  5962 non-null   object
 1   input        5962 non-null   object
 2   output       5962 non-null   object
 3   query        3890 non-null   object
dtypes: object(4)
memory usage: 232.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6062 entries, 0 to 6061
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  6062 non-null   o

In [11]:
from pathlib import Path

# Root of the fine-tuning data; each split is written to its own sub-directory.
d = Path('data/hot_finetune_data')
train_dir = d / 'train'
dev_dir = d / 'dev'

# Create both split directories first (an already existing directory is accepted).
for split_dir in (train_dir, dev_dir):
    split_dir.mkdir(exist_ok=True)

# Then write each split as JSON Lines, keeping non-ASCII characters unescaped.
for split_dir, split_frame in ((train_dir, train), (dev_dir, dev)):
    split_frame.to_json(split_dir / "all.json", orient="records", lines=True, force_ascii=False)

In [7]:
from transformers import AutoConfig
import torch

# Load the model configuration and print its vocabulary size.
# NOTE: this rebinds the notebook-level name `config`.
config = AutoConfig.from_pretrained(
    "./pretrained_models/baichuan2-13b-chat/",
    trust_remote_code=True,
)
print(config.vocab_size)

# Keyword arguments gathered in one dict — presumably for a later
# `from_pretrained` call; nothing in this cell consumes them.
torch_dtype = torch.float16
from_pretrained_kwargs = {
    "from_tf": False,
    "cache_dir": None,
    "revision": "main",
    "use_auth_token": True,
    "torch_dtype": torch_dtype,
    "low_cpu_mem_usage": False,
    "trust_remote_code": True,
}

125696


In [1]:
from toolkit.metric import rouge

# Candidate strings and their references, paired by position.
pred = ["你好吗", "你多大", "abcdefg"]
tgt = ["好吗", "你", "bfg"]

# Last expression of the cell, so the returned score dict is displayed.
metric_names = ('rougeL', 'rouge2')
rouge(pred, tgt, metric_names, 'zh')

  _PYTHON_LOWER_3_8 = LooseVersion(_PYTHON_VERSION) < LooseVersion("3.8")
  """


  0%|          | 0/3 [00:00<?, ?it/s]

{'rougeL': 0.6333333452542623, 'rouge2': 0.3055555621782939}

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each execution, so edits to project modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [13]:
from toolkit.metric import distinct_n_corpus_level
# Word-level example (whitespace-separated tokens).
# NOTE: this list is overwritten by the assignment right below, so it never
# reaches the metric calls at the end of the cell.
sentences = [
            'the cat sat on the mat'.split(),
            'mat the on sat cat the'.split(),
            'i do not know'.split(),
            'Sorry but i do not know'.split(),
        ]
# Character-level example: each passage is split into a list of single characters.
sentences = [
            list('徐怀钰近期参加综艺节目《乘风》复出，享受舞台找回自我。\n徐怀钰的经历充满波折，家庭贫困、爷爷的心理问题，这些年一直在努力付出。虽然在《乘风》的舞台上复出效果不理想，但我们也看到了徐怀钰逐渐找回自我，这比比赛的输赢更重要。在《乘风》中，她与秋瓷炫、黄丽玲等人成为了好友，并携手合作。虽然她没有王心凌的机遇，但《乘风》也为她提供了一个重要的机会。她因一公被抨击而感到难过，好在有Ella等人的安慰与支持，让她挺过了难关。五公时，慢热的徐怀钰终于找到了感觉，秋瓷炫也为她的变化感到高兴，看出她开始享受舞台了。这次，徐怀钰主动和黄丽玲合作，无疑将带来许多火花。\n作为徐怀钰复出的舞台，《乘风》让徐怀钰找回终于找回了自我。让我们一起期待徐怀钰更多精彩表现！'),
            list('上周全国食用农产品和生产资料价格小幅上涨，其中食用农产品市场价格上涨1.6%，生产资料市场价格上涨0.1%。\n粮油批发价格略有波动，大米、豆油和菜籽油价格下降，花生油价格上涨，面粉价格持平。30种蔬菜平均批发价格上涨0.4%，其中黄瓜、生菜和菠菜价格大幅上涨。6种水果平均批发价格小幅下降，西瓜、葡萄和梨价格下降最多。肉类价格小幅波动，牛肉和羊肉价格下降，猪肉价格上涨。禽产品价格小幅上涨，鸡蛋和白条鸡价格上涨。水产品批发价格以降为主，鲤鱼、鲢鱼和草鱼价格下降。\n价格的小幅上涨对我们的生活会带来一定的影响。你会因为上涨的食用农产品价格而改变自己的购买选择吗？还是会选择继续购买自己喜欢的食物？留言和我分享你的想法！\n')
        ]
# Corpus-level metric evaluated with n = 1 and then n = 2.
print(distinct_n_corpus_level(sentences, 1))
print(distinct_n_corpus_level(sentences, 2))

0.44361116955724544
0.7455188513831694


In [105]:
from torchmetrics.functional.text import bleu_score
# Example 1: prediction and reference share no 4-gram; the recorded score is 0.
preds = ['the cat is on the mat']
target = ['there is a cat on the mat']
print(bleu_score(preds, target, 4))

# Example 2: only the first word differs. The bare last expression is displayed
# as the cell's result.
preds = ['the cat is on the mat']
target = ['a cat is on the mat']
bleu_score(preds, target, 4)

tensor(0.)


tensor(0.7598)

In [125]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# Tokenised hypothesis and a single tokenised reference.
preds = 'the cat is on the mat'.split()
target = ['there is a cat on the mat'.split()]
chencherry = SmoothingFunction()
# Uniform weight tuples for orders 1..4: (1,), (1/2, 1/2), (1/3, 1/3, 1/3), (1/4, ...).
# Passing several tuples yields one score per tuple.
ngram_weights = [tuple(1. / order for _ in range(order)) for order in range(1, 5)]
sentence_bleu(target, preds, ngram_weights, smoothing_function=chencherry.method1)

[0.7054014374088451,
 0.4887164517296948,
 0.36973494931036327,
 0.19433094436376075]

In [126]:
# Same hypothesis as before, scored against a reference that differs only in its first word.
preds = 'the cat is on the mat'.split()
target = ['a cat is on the mat'.split()]
chencherry = SmoothingFunction()
# Uniform weight tuples for orders 1..4: (1,), (1/2, 1/2), (1/3, 1/3, 1/3), (1/4, ...).
ngram_weights = [tuple(1. / order for _ in range(order)) for order in range(1, 5)]
sentence_bleu(target, preds, ngram_weights, smoothing_function=chencherry.method1)

[0.8333333333333334, 0.816496580927726, 0.7937005259840998, 0.7598356856515925]

In [127]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
# Two tokenised hypotheses, each paired with a list holding one tokenised reference.
preds = [
    'the cat is on the mat'.split(),
    'If there is no ngrams overlap for any order of n-grams'.split(),
]
target = [
    ['a cat is on the mat'.split()],
    ['for any order of n-grams'.split()],
]
chencherry = SmoothingFunction()
# Uniform weight tuples for orders 1..4: (1,), (1/2, 1/2), (1/3, 1/3, 1/3), (1/4, ...).
ngram_weights = [tuple(1. / order for _ in range(order)) for order in range(1, 5)]
corpus_bleu(target, preds, ngram_weights, smoothing_function=chencherry.method1)

[0.5882352941176471,
 0.5601120336112039,
 0.5251127559361087,
 0.47902287469880817]

In [128]:
# Same call with default weights and no smoothing function; the recorded result
# equals the last (4-gram) entry of the previous cell's output.
corpus_bleu(target, preds,)

0.47902287469880817

In [148]:
from toolkit.metric import self_bleu_one_set

# Score two sentence sets: near-paraphrases first, then unrelated sentences.
for sentence_set in (
    ['the cat is on the mat', 'a cat is on the mat', 'there is a cat on the mat'],
    ['the cat is on the mat', 'a dog is running', 'birds can fly in sky freely'],
):
    print(self_bleu_one_set(sentence_set))

0.24430698313673305
0.9138773566669056


In [149]:
from toolkit.metric import self_bleu_one_set

# Same two sentence sets, now scored with uniform weight tuples for orders 1..4.
for sentence_set in (
    ['the cat is on the mat', 'a cat is on the mat', 'there is a cat on the mat'],
    ['the cat is on the mat', 'a dog is running', 'birds can fly in sky freely'],
):
    # A fresh list per call: (1,), (1/2, 1/2), (1/3, 1/3, 1/3), (1/4, 1/4, 1/4, 1/4).
    ngram_weights = [tuple(1. / order for _ in range(order)) for order in range(1, 5)]
    print(self_bleu_one_set(sentence_set, weights=ngram_weights))

[0.16604713 0.16716345 0.20166345 0.24430698]
[0.56541006 0.73415542 0.85089009 0.91387736]


In [152]:
from toolkit.metric import self_bleu

# Two sentence sets passed together; the bare last expression is displayed as the result.
first_set = ['the cat is on the mat', 'a cat is on the mat', 'there is a cat on the mat']
second_set = ['the cat is on the mat', 'a dog is running', 'birds can fly in sky freely']
self_bleu([first_set, second_set])

0.5790921699018193

In [153]:
from toolkit.metric import self_bleu

# Same two sentence sets, scored with uniform weight tuples for orders 1..4.
first_set = ['the cat is on the mat', 'a cat is on the mat', 'there is a cat on the mat']
second_set = ['the cat is on the mat', 'a dog is running', 'birds can fly in sky freely']
# (1,), (1/2, 1/2), (1/3, 1/3, 1/3), (1/4, 1/4, 1/4, 1/4)
ngram_weights = [tuple(1. / order for _ in range(order)) for order in range(1, 5)]
self_bleu([first_set, second_set], weights=ngram_weights)


array([0.3657286 , 0.45065944, 0.52627677, 0.57909217])