In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch
import time

# Target device for the input tensors: the GPU when CUDA is usable,
# otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hub repository id shared by the model weights and the tokenizer; kept in
# one place so the two loads below cannot drift apart.
MODEL_ID = "NECOUDBFM/Jellyfish"

# Folder that receives any weights offloaded to disk.
offload_folder = "./offload_weights"

# Time the load with a monotonic, high-resolution clock. time.time() is a
# wall clock and may jump (e.g. NTP adjustments) while the load is running.
start_time = time.perf_counter()

# Load the half-precision weights. device_map="auto" leaves layer placement
# to the loader; anything that does not fit is written to offload_folder.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder=offload_folder,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Report how long loading the model and tokenizer took.
end_time = time.perf_counter()
print(f"Model and tokenizer loaded in {end_time - start_time:.2f} seconds.")

# System preamble placed ahead of the instruction section of the prompt.
system_message = "You are an AI assistant that follows instruction extremely well. Help as much as you can."

# The task itself: two records to compare plus the allowed answers.
user_message = """You are tasked with determining whether two records listed below are the same based on the information provided.
Carefully compare the Company, Location, Industry, Description, Name, Education, Position, Skills for each record before making your decision.
Note: Missing values (N/A or "nan") should not be used as a basis for your decision.
Record A: [Company: CreativeWorks, Location: Seattle, WA, Industry: Design, Description: Creative agency offering graphic design and branding services.]
Record B: [Name: Chris Wilson, Education: B.F.A. in Graphic Design, RISD, Position: Senior Graphic Designer, Skills: ['Adobe Creative Suite', 'Branding'], Role: Graphic Designer, Company: CreativeWorks]
Are record A and record B the same entity? Choose your answer from: [Yes, No]."""

# Assemble the prompt: sections are separated by blank lines, and the
# trailing empty piece leaves a blank line after the "### Response:" header.
prompt = "\n\n".join(
    (system_message, "### Instruction:", user_message, "### Response:", "")
)
# Tokenize the prompt. The attention mask is requested explicitly and moved
# to the same device as the ids so it can be handed to generate(): because
# the pad token is set to the EOS token below, the mask cannot be reliably
# inferred from the ids alone.
inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=True)
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# Define the generation configuration. All sampling and length settings live
# here rather than being split between the config and generate() kwargs.
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.35,
    top_p=0.9,
    max_new_tokens=1024,
    repetition_penalty=1.15,
    pad_token_id=tokenizer.eos_token_id,
)

# Generate the response without tracking gradients.
with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
        return_dict_in_generate=True,
        # Scores are not read in this cell; still requested so that
        # generation_output.scores remains available afterwards.
        output_scores=True,
    )

# The returned sequence starts with the prompt tokens; drop them so only the
# newly generated continuation is decoded.
output = generation_output.sequences
prompt_length = input_ids.shape[-1]
response = tokenizer.decode(
    output[0, prompt_length:], skip_special_tokens=True
).strip()

print(response)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the disk.


Model and tokenizer loaded in 67.59 seconds.
No
