In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Clear GPU cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()

model_name = "meta-llama/Llama-3.1-70B-Instruct"  # Replace with exact path for 8B model
tokenizer = AutoTokenizer.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Check if CUDA is available
device = 0 if torch.cuda.is_available() else -1  # Use GPU if available, otherwise fallback to CPU

# Load the model with the appropriate device settings
model = AutoModelForCausalLM.from_pretrained(
    model_name,                   
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto" if device == 0 else None,  # Automatically distribute across devices or use CPU
    trust_remote_code=True
)


Loading checkpoint shards: 100%|██████████| 30/30 [03:53<00:00,  7.78s/it]
Some parameters are on the meta device because they were offloaded to the cpu and disk.


ValueError: You cannot use both `pipeline(... torch_dtype=..., model_kwargs={"torch_dtype":...})` as those arguments might conflict, use only one.)

In [5]:
# Initialize the text generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    model_kwargs={
        "torch_dtype": torch.bfloat16,
        "quantization_config": {"load_in_4bit": True}
    }, 
    device_map="auto" if device == 0 else None,  # Explicitly set device to GPU if available, else CPU
)

# # Example usage of the pipeline
# output = pipe("Once upon a time,", max_length=50)
# print(output)

In [6]:
# For model_0
for name, param in model.named_parameters():
    print(f"{name} is on {param.device}")


model.embed_tokens.weight is on cuda:0
model.layers.0.self_attn.q_proj.weight is on cuda:0
model.layers.0.self_attn.k_proj.weight is on cuda:0
model.layers.0.self_attn.v_proj.weight is on cuda:0
model.layers.0.self_attn.o_proj.weight is on cuda:0
model.layers.0.mlp.gate_proj.weight is on cuda:0
model.layers.0.mlp.up_proj.weight is on cuda:0
model.layers.0.mlp.down_proj.weight is on cuda:0
model.layers.0.input_layernorm.weight is on cuda:0
model.layers.0.post_attention_layernorm.weight is on cuda:0
model.layers.1.self_attn.q_proj.weight is on cuda:0
model.layers.1.self_attn.k_proj.weight is on cuda:0
model.layers.1.self_attn.v_proj.weight is on cuda:0
model.layers.1.self_attn.o_proj.weight is on cuda:0
model.layers.1.mlp.gate_proj.weight is on cuda:0
model.layers.1.mlp.up_proj.weight is on cuda:0
model.layers.1.mlp.down_proj.weight is on cuda:0
model.layers.1.input_layernorm.weight is on cuda:0
model.layers.1.post_attention_layernorm.weight is on cuda:0
model.layers.2.self_attn.q_proj.w

In [7]:
import pandas as pd
import numpy as np
answer_df = pd.read_csv("../../llama_data/answer_df_part1.csv")
query_df = pd.read_csv("../../llama_data/query_df3.csv")
print(answer_df.shape)

(65140, 5)


In [8]:
from time import time



system_message = """
You are an AI assistant designed to answer questions.
Please restrict your answer to the exact question and use the exact answer format asked.
Answer should not have implied information. If the answer is yes, always provide related phrases. 
"""


def colorize_text(text):
    for word, color in zip(["Reasoning", "Question", "Answer", "Total time"], ["blue", "red", "green", "magenta"]):
        text = text.replace(f"{word}:", f"\n\n**<font color='{color}'>{word}:</font>**")
    return text

def format_prompt(text):
    question_content = "Does the paragraph mention any of the following topics:\n"
    for i in range(len(query_df)):
        question_content += f"  ({i+1}) {query_df.topic[i]}: {query_df.description[i]}.\n"
    answer_content = "Return answer in format:\n"
    for i in range(len(query_df)): 
        answer_content += f"  ({i+1}) {query_df.topic[i]}: [yes/no], related phrases if any: \n"
    paragragh_content = f"Paragraph: '{text}' \n"
    user_message = question_content + answer_content + paragragh_content
    #print(user_message)
    
    return user_message



def query_model_batch(
        system_message,
        user_messages,
        temperature=0,
        max_length=1024
    ):
    start_time = time()
    # Add "Question: ... Answer:" to each user message for clarity
    batched_messages = [
        "Question: " + message + " Answer:" for message in user_messages
    ]
    
    # Construct prompts for each message in batch
    all_prompts = [
        pipe.tokenizer.apply_chat_template(
            [
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            tokenize=False,
            add_generation_prompt=True
        ) for user_message in batched_messages
    ]
    
    # Define the end-of-sequence terminators
    terminators = [
        pipe.tokenizer.eos_token_id,
        pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    
    # Run the batch inference
    sequences = pipe(
        all_prompts,
        do_sample=True,
        top_p=0.5,
        temperature=temperature,
        num_return_sequences=1,
        eos_token_id=terminators,
        max_new_tokens=max_length,
        return_full_text=False,
        pad_token_id=terminators[0]
    )
    
    # Extract generated text for each sequence
    answers = []
    for i, sequence in enumerate(sequences):
        answer = sequence[0]['generated_text']
        total_time = f"Total time: {round(time() - start_time, 2)} sec."
        # Format the response with timing information
        answers.append(batched_messages[i] + " " + answer + " " + total_time)

    return answers

In [9]:
# test example

text = "i have always wondered how people with ana make it when theyre eating very few calorie s. do their organs not shut down. i suffered with an eating disorder for a few year s and i am recovered now but i never struggled with severe anorexia. does the body only begin to cannibalize the organs after all of their fat storage is gone. what is the process."


prompts = [format_prompt(text)]
responses = query_model_batch(
        system_message=system_message,
        user_messages=prompts,
        temperature=0.5,
        max_length=512
    )


# Display and process responses in a loop
for i, response in enumerate(responses):
    print(colorize_text(f"{response}"))



Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.96 GiB. GPU 0 has a total capacity of 10.75 GiB of which 1.27 GiB is free. Process 3684134 has 228.00 MiB memory in use. Process 349965 has 246.00 MiB memory in use. Including non-PyTorch memory, this process has 8.99 GiB memory in use. Of the allocated memory 8.78 GiB is allocated by PyTorch, and 30.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)