In [1]:
from typing import Tuple
import os
import sys
import torch
import time
import json

from accelerate import Accelerator
from transformers import T5Tokenizer, T5ForConditionalGeneration

from fairscale.nn.model_parallel.initialize import initialize_model_parallel
from llama import ModelArgs, Transformer, Tokenizer, LLaMA

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
model_name = "google/flan-t5-xxl"

accelerator = Accelerator()
device = accelerator.device
tokenizer = accelerator.prepare(
    T5Tokenizer.from_pretrained(model_name)
)

model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto",  torch_dtype=torch.bfloat16,  cache_dir="../models")
model = model.to(device)

In [2]:
def setup_model_parallel() -> Tuple[int, int]:
    local_rank = int(os.environ.get("LOCAL_RANK", -1))
    world_size = int(os.environ.get("WORLD_SIZE", -1))

    torch.distributed.init_process_group("nccl")
    initialize_model_parallel(world_size)
    torch.cuda.set_device(local_rank)

    # seed must be the same in all processes
    torch.manual_seed(1)
    return local_rank, world_size

def load(
    ckpt_dir: str,
    tokenizer_path: str,
    local_rank: int,
    world_size: int,
    max_seq_len: int,
    max_batch_size: int,
) -> LLaMA:
    start_time = time.time()
    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
    assert world_size == len(
        checkpoints
    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"
    ckpt_path = checkpoints[local_rank]
    print("Loading")
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.loads(f.read())

    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
    )
    tokenizer = Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    model = Transformer(model_args)
    torch.set_default_tensor_type(torch.FloatTensor)
    model.load_state_dict(checkpoint, strict=False)

    generator = LLaMA(model, tokenizer)
    print(f"Loaded in {time.time() - start_time:.2f} seconds")
    return generator

In [3]:
prompts = ["Answer with only one number from 1 to 7, considering 1 as the least similar and 7 as the most similar: How similar is Alligator and Alligator?",
           "Answer with only one number from 1 to 7, considering 1 as the least similar and 7 as the most similar: How similar is Alligator and Cat?",
           "Answer with only one number from 1 to 7, considering 1 as the least similar and 7 as the most similar: How similar is Dog and Cat?"]

In [4]:
model_size = '7B'
target_folder = '/mnt/disk-1/llama'

ckpt_dir = f"{target_folder}/{model_size}"
tokenizer_path = f"{target_folder}/tokenizer.model"

local_rank, world_size = setup_model_parallel()

if local_rank > 0:
    sys.stdout = open(os.devnull, "w")

generator = load(
    ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size
)
results = generator.generate(
    prompts, max_gen_len=256, temperature=temperature, top_p=top_p
)
results

ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set

In [None]:
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
outputs = model.generate(input_ids, num_return_sequences=5, num_beams= 25, temperature = 0.5)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))