In [2]:
from IPython import get_ipython
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
from transformers import GPTJForCausalLM

In [4]:
from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec

In [5]:
import sys

In [6]:
from transformers.models.gptj.modeling_gptj import GPTJBlock

In [7]:
import torch.nn as nn

In [8]:
class EmbeddingPipe(nn.Embedding):
    """Token-embedding pipeline stage that forwards attention_mask downstream.

    Subclasses ``nn.Embedding`` so that ``self.weight`` and a concrete
    ``forward`` actually exist; a bare ``nn.Module`` base provides neither,
    so the property and ``forward`` below could not work.

    Construct it like ``nn.Embedding``, e.g.
    ``EmbeddingPipe(num_embeddings, embedding_dim)``.
    """

    @property
    def word_embeddings_weight(self):
        """Easy accessor for the pipeline engine to tie embeddings across stages."""
        return self.weight

    def forward(self, args):
        """Embed ``input_ids`` and pass ``attention_mask`` through unchanged.

        Args:
            args: sequence ``(input_ids, position_ids, attention_mask)``.
                ``position_ids`` is accepted to keep the three-item stage
                contract but is not consumed, because ``nn.Embedding`` is a
                pure token lookup.
                NOTE(review): presumably positions are applied later by the
                attention layers (rotary) - confirm.

        Returns:
            Tuple ``(embeddings, attention_mask)``.

        Raises:
            AssertionError: if ``args`` does not hold exactly three items.
        """
        assert (
            len(args) == 3
        ), f"Expected 3 arguments (input_ids, position_ids, attention_mask), but got {len(args)}."

        input_ids, _position_ids, attention_mask = args
        embeddings = super().forward(input_ids)
        return embeddings, attention_mask

In [9]:
class TransformerLayerPipe(GPTJBlock):
    """GPTJBlock adapted to pipeline stages.

    Takes a ``(hidden_states, attention_mask)`` pair and returns a pair of the
    same layout so consecutive stages can be chained.
    """

    def forward(self, args):
        """Run the block on ``hidden_states`` and forward ``attention_mask``.

        Args:
            args: sequence ``(hidden_states, attention_mask)``.

        Returns:
            Tuple ``(hidden_states, attention_mask)``.

        Raises:
            AssertionError: if ``args`` does not hold exactly two items.
        """
        assert (
            len(args) == 2
        ), "TransformerLayerPipe expects 2 arguments - hidden_states and attention_mask"
        hidden_states, attention_mask = args
        # Pass the mask by keyword: the second positional parameter of
        # GPTJBlock.forward is `layer_past`, so a positional mask would be
        # misread as cached key/values.
        block_output = super().forward(hidden_states, attention_mask=attention_mask)
        # GPTJBlock may return a tuple whose first item is the hidden states
        # (optionally followed by cache / attention weights); keep only the
        # hidden states so the next stage gets the layout it expects.
        if isinstance(block_output, tuple):
            block_output = block_output[0]
        # we are returning just [hidden_states, mask]
        return block_output, attention_mask

In [None]:
from megatron.mpu import ParallelRelativePositionBias
from megatron.model.transformer import (
    ParallelTransformerLayerPipe, # √
    NormPipe,
    ParallelLinearPipe,
    parallel_lm_logits,
    ParallelLinear, # √
)
from megatron.model.gmlp import GMLPBlock
from megatron.model.word_embeddings import EmbeddingPipe, SoftEmbedding

In [10]:
import sys

In [11]:
# Make the local gpt-neox checkout importable. Guarded so that re-running this
# cell does not keep appending duplicate entries to sys.path.
NEOX_ROOT = '/nas/shawn_guo/gpt-neox/'
if NEOX_ROOT not in sys.path:
    sys.path.append(NEOX_ROOT)

In [12]:
from gpt2_model_gptj import GPT2ModelPipe

In [None]:
# from deepspeed.comm.comm import init_distributed
# init_distributed(dist_backend="nccl",
#                      auto_mpi_discovery=True,
#                     distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT,
#                     verbose=True,
#                     timeout=default_pg_timeout,
#                      init_method=None,
#                     dist_init_required=None,
#                      config=None):

In [13]:
from dataclasses import dataclass, field
from typing import List, Optional

In [12]:
# One 'global' attention entry for each of the 28 layers.
attention_config = ['global' for _ in range(28)]

In [14]:
@dataclass
class GPTJArgs:
    """Argument bundle passed to ``GPT2ModelPipe`` as ``neox_args``.

    Every attribute is an annotated dataclass field, so ``@dataclass``
    generates ``__init__``/``__repr__``/``__eq__`` and individual values can be
    overridden per instance, e.g. ``GPTJArgs(num_layers=2)``. ``GPTJArgs()``
    with no arguments still yields the values listed below.
    """

    hidden_size: int = 4096
    checkpoint_num_layers: int = 1
    num_layers: int = 28
    # Built by a factory so each instance owns its list instead of sharing one
    # mutable class-level list.
    attention_config: List[str] = field(default_factory=lambda: ['global'] * 28)
    checkpoint_activations: bool = False
    pipe_partition_method: str = 'parameters'  # default
    no_weight_tying: bool = False
    padded_vocab_size: int = 50400
    max_position_embeddings: int = 2048
    hidden_dropout: float = 0.0
    pos_emb: str = "rotary"  # gxh
    num_attention_heads: int = 16
    fp16_lm_cross_entropy: bool = False  # gxh
    is_pipe_parallel: bool = True
    init_method: str = "normal"
    output_layer_init_method: str = "normal"
    # A standard deviation must be numeric; this was previously the string
    # "normal". NOTE(review): 0.02 is believed to be the GPT-NeoX default -
    # confirm against the config you intend to reproduce.
    init_method_std: float = 0.02
    norm: str = "layernorm"
    # NOTE(review): Hugging Face's GPT-J config uses 1e-5 - confirm 1e-4 is intended.
    layernorm_epsilon: float = 1e-4
    # Read by get_model(); None leaves soft prompt tuning disabled.
    soft_prompt_tuning: Optional[dict] = None
    # Read by get_model(), which raises unless this is truthy.
    deepspeed: bool = True

In [15]:
# Argument object handed to get_model(); built with no overrides.
gptj_args = GPTJArgs()

In [16]:
from megatron.utils import print_rank_0
from megatron import mpu

In [17]:
def get_model(gptj_args, use_cache=False):
    """Build the model.

    Args:
        gptj_args: argument object. It is passed to ``GPT2ModelPipe`` as
            ``neox_args`` and read here for ``deepspeed``,
            ``soft_prompt_tuning`` and ``is_pipe_parallel``.
        use_cache: forwarded unchanged to ``GPT2ModelPipe``.

    Returns:
        The constructed model; converted with ``to_sequential()`` when
        ``gptj_args.is_pipe_parallel`` is truthy.

    Raises:
        ValueError: if ``gptj_args.deepspeed`` is missing or falsy.
    """
    # Fail fast: this check used to run only after the whole model had been
    # constructed.
    if not getattr(gptj_args, "deepspeed", False):
        raise ValueError("Must be using deepspeed to run neox")

    print_rank_0("building GPT2 model ...")

    # Build model on cpu.
    model = GPT2ModelPipe(
        neox_args=gptj_args,
        num_tokentypes=0,
        parallel_output=True,
        topology=mpu.get_topology(),
        use_cache=use_cache,
    )

    ### soft prompt tuning stuff ###
    soft_prompt_cfg = getattr(gptj_args, "soft_prompt_tuning", None)
    if soft_prompt_cfg is not None and soft_prompt_cfg.get("enabled", False):
        # Imported here so the function does not depend on a separate import
        # cell having been executed first.
        from megatron.model.word_embeddings import SoftEmbedding

        soft_prompt = SoftEmbedding(
            gptj_args,
            wte=getattr(model, "0").word_embeddings,
            n_tokens=soft_prompt_cfg.get("n_tokens", 10),
            init_string=soft_prompt_cfg.get("init_string", ""),
            init_range=soft_prompt_cfg.get("init_range", 0.5),
        )
        model.insert_layers(
            layers=soft_prompt, idx=1
        )  # insert the soft prompt layer directly after the word embeddings

        # freeze everything but the soft prompt
        for name, param in model.named_parameters():
            if "soft_embedding" not in name:
                param.requires_grad = False

    # NOTE(review): the comment below reads as if the export were meant for the
    # case where pipeline parallelism is NOT in use; the condition is kept as
    # written - confirm whether it should be negated.
    if gptj_args.is_pipe_parallel:
        # Export PipeParallel model to nn.Sequential model to avoid the overhead of deepspeed's pipe parallel training
        model = model.to_sequential()

    # DeepSpeed handles CUDA, FP16, and DDP components.
    return model

In [27]:
# Build the model from gptj_args. The recorded run of this cell ended with
# "RuntimeError: num_stages (4) must divide distributed world size (1)".
model = get_model(gptj_args)

building GPT2 model ...
name normal
name normal
SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None


RuntimeError: num_stages (4) must divide distributed world size (1)

In [17]:
# Inspect the value returned by mpu.get_topology(); no output was recorded for this cell.
mpu.get_topology()

In [19]:
import deepspeed

In [20]:
# Print the built-in help text for DeepSpeed's pipeline topology module.
help(deepspeed.runtime.pipe.topology)

Help on module deepspeed.runtime.pipe.topology in deepspeed.runtime.pipe:

NAME
    deepspeed.runtime.pipe.topology - # Copyright 2019 The Microsoft DeepSpeed Team

CLASSES
    builtins.object
        PipelineParallelGrid
        ProcessTopology
            PipeDataParallelTopology
            PipeModelDataParallelTopology
    
    class PipeDataParallelTopology(ProcessTopology)
     |  PipeDataParallelTopology(num_pp, num_dp)
     |  
     |  A topology specialization for hybrid data and pipeline parallelism.
     |  
     |  Uses data parallelism on the last dimension to encourage gradient
     |  reductions to use high-bandwidth intra-node links and lower-volume
     |  pipeline communications to use low-bandwidth inter-node links.
     |  
     |  Method resolution order:
     |      PipeDataParallelTopology
     |      ProcessTopology
     |      builtins.object
     |  
     |  Methods defined here:
     |  
     |  __init__(self, num_pp, num_dp)
     |      Create a mapping of n

In [24]:
from deepspeed.runtime.pipe.topology import PipelineParallelGrid

In [18]:
from deepspeed import comm as dist

In [26]:
# Rank of this process as reported by deepspeed.comm (recorded output: 0).
dist.get_rank()

0

In [25]:
# init_distributed() is called for its side effect; its return value is not a
# success flag, so `if dist.init_distributed():` never printed anything (the
# recorded log shows the backend being initialized but no 'yes').
# Query the initialization state explicitly instead.
dist.init_distributed()
if dist.is_initialized():
    print('yes')

[2022-09-08 12:55:50,832] [INFO] [comm.py:618:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2022-09-08 12:55:50,950] [INFO] [comm.py:675:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.53.6, master_port=29500
[2022-09-08 12:55:50,952] [INFO] [comm.py:635:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


In [31]:
# Number of processes in the distributed group (recorded output: 1).
dist.get_world_size()

1

In [32]:
# Absolute path of the CSV that is loaded into `df` in the next cell.
file = '/nas/shawn_guo/dataset/text_paragraph/data.csv'

In [33]:
import pandas as pd

In [34]:
# Load the CSV at `file` into a DataFrame.
df = pd.read_csv(file)

In [38]:
# Preview the row slice 10:300 of the loaded frame.
df[10:300]

Unnamed: 0.1,Unnamed: 0,type,name,content,dispatch
10,10,adjacent,第一狂妃：绝色邪王宠妻无度,[EOP]\n***********\n华曦回到落凤居，等在房间里的铃儿便焦急地说：“小姐您...,郭新浩
11,11,adjacent,第一狂妃：绝色邪王宠妻无度,[EOP]\n夜风萧索，冷月孤魂。\n那个被无数少女视为梦中情人的太子殿下，头戴珠花，耳朵上...,郭新浩
12,12,adjacent,第一狂妃：绝色邪王宠妻无度,[EOP]\n******仙逆天下******\n大清早，华曦打了个呵欠舒舒服服起床，吃了早...,郭新浩
13,13,adjacent,第一狂妃：绝色邪王宠妻无度,[EOP]\n125\n125.第125章魔族初现【1】\n而是……\n“丑八怪！”他大骂了...,郭新浩
14,14,adjacent,重生隐婚：Hi，高冷权少！,[EOP]\n订婚宴是在萧家举行的，所以此时下方的大厅里宾客络绎不绝，优美的华尔兹音乐在空中...,郭新浩
...,...,...,...,...,...
295,295,random,你是我戒不掉的心动,[EOP]\n夜斯在经过许欢颜身边的时候，才注意到，他今天穿的是一身白色的家居服。\n特别的...,郭新浩
296,296,random,最强弃少,[EOP]\n因为叶默的电话用完之后，都是放在储物戒指里面，所以宁轻雪和郁妙彤的决定，叶默一...,郭新浩
297,297,random,如意小郎君,[EOP]\n……\n皇宫某殿，王丞相拱了拱手，说道：“陛下，结盟一事，老臣回去之后，再和诸...,郭新浩
298,298,random,完美人生,[EOP]\n被咖啡泡入喧嚣的亭院，\n异族在日坛膜拜古人月亮，\n开元盛世令人神往！\n风...,郭新浩


In [39]:
# Absolute path of a second CSV, loaded into `df1_` in the next cell.
file_ = '/nas/shawn_guo/dataset/novel_text_RL/text_1300.csv'

In [40]:
# Load the CSV at `file_` into a DataFrame.
df1_ = pd.read_csv(file_)

In [41]:
# Display the second frame; the recorded output shows a single `text` column
# next to two unnamed index-like columns.
df1_

Unnamed: 0.1,Unnamed: 0,text
0,0,狂龙团队剩下的六个人脱离了战斗之后。\n在圣殿之火众多玩家的保护下，读取了回城，化作六道白光...
1,1,萧翎不再准备冒险靠近那处冰棺了，操纵着罗格之眼飞了起来，罗格之眼在众多高耸地建筑中穿梭着，飞...
2,2,。\n-----------------分頁-----------------\n第三八六章...
3,3,这种战斗萧翎是不可能召唤泰坦巨人恩特的。\n泰坦巨人恩特地秘密不能曝露，这是萧翎的底牌。\n...
4,4,一行人一路清扫着怪物，朝侍僧僧院方向行进。\n侍僧僧院是一座非常宏伟的建筑。\n坐落于阿卡扎...
...,...,...
1295,1295,正在欣赏美景找美女，突兀的出现一个打扮的如同蛤蟆一般的绿衣汉子就活生生的把这里绝美的景致给毁...
1296,1296,碧落宗的二十多名弟子，将八方宗十名弟子前后夹攻。\n“呵，跑啊，你们倒是继续跑啊！”\n“你...
1297,1297,夜下的涅瓦德格外迷人。\n旅店露台的栏杆上，月光穿过树梢，银晃晃一片。\n远处是波光粼粼的湖...
1298,1298,巍峨黑色血山之巅，此时一座邪魔宫殿上，以各式各样生命俘虏奴隶作为献祭品，正有一个邪魔施展着次...


In [42]:
# New frame without the 'type', 'name' and 'dispatch' columns; `df` itself is not modified.
df_save = df.drop(columns=['type', 'name', 'dispatch'])

In [51]:
def _strip_eop(text, marker='[EOP]'):
    """Remove a leading ``'[EOP]\\n'`` and a trailing ``'\\n[EOP]'`` from ``text``.

    Each end is trimmed only when the marker is actually present, so rows that
    are not wrapped in markers keep their first and last characters (a blind
    ``text[6:-6]`` slice would cut them off). Non-string cells, e.g. NaN for a
    missing value, are returned unchanged instead of raising.
    """
    if not isinstance(text, str):
        return text
    prefix = marker + '\n'
    suffix = '\n' + marker
    # Both bounds are computed on the untouched string, so a fully wrapped row
    # yields exactly what text[6:-6] produced.
    start = len(prefix) if text.startswith(prefix) else 0
    end = len(text) - len(suffix) if text.endswith(suffix) else len(text)
    return text[start:end]


# Series of paragraph texts with the surrounding [EOP] markers removed.
df_save_ = df_save['content'].apply(_strip_eop)

In [59]:
# Look at the raw 'content' of the second row; the recorded value begins with
# '[EOP]\n' and ends with '\n[EOP]'.
df_save.iloc[1]['content']

'[EOP]\n夏天夏天要上山（1）\n知了！知了！知了！知了！\n被热得半死的蝉们趴在树干上有气无力地哀鸣着，灼夏的阳光炙烤着它们的身体，同样也炙烤着大地。\n因为暑假的到来湖大校园显得空旷了许多，然而痴汉攻和温油受不打算回家，以实习/写论文为理由（借口）在这个盛夏也依偎在一起生活。\n然而实在是……太他妈热了。\n“我的几个朋友计划要上山去玩两天，我们也去吧！”痴汉攻如是说。\n温油受刚听到时眼前一亮，随即又默默忧愁了：“你那几个朋友还不认识我吧？我去的话会不会显得很突兀打扰气氛啊？”\n“怎么可能啦！一起玩一玩就认识了啊~”痴汉攻笑了笑，又说道：“我才要做心理建设啦。你这么可爱，难保要遭人惦记，我犹豫了很久要不要去呢。不过我想了想夏天真的是很热，山上很凉快你一定会喜欢啦。所以啊，你不要辜负我的信任，一定一定要乖乖的哦！”\n大概只有痴汉攻这种眼瞎的人才会对如此平凡的自己紧张兮兮的吧。\n这家伙一定不知道自己到底在烦恼什么。\n啊，蠢货的人生真是单纯啊~\n痴汉攻温柔地握住温油受的手，轻声在他耳边说：“不要怕。”\n温油受有种小心思被窥见的羞耻，痴汉攻觉得他这模样实在是可爱，便加了句：“丑媳妇儿，就是见公婆也有老公在啊。不怕不怕~”\n（你讨厌啦谁丑啊！不对不对谁是丑媳妇儿啊！）\n[EOP]'

In [64]:
# df_save_ is a Series (one column selected from df_save), so it has no
# `columns` to assign to; rename the Series itself to 'text' instead.
df_save_ = df_save_.rename('text')

In [71]:
# Wrap the Series in a single-column DataFrame.
dd = pd.DataFrame(df_save_)

In [73]:
# Name the single column 'text' (the same column name the `df1_` output shows).
dd.columns = ['text']