In [1]:
import pandas as pd
import os
from tqdm import tqdm
import transformers
import torch
from transformers import AutoTokenizer, LlamaForCausalLM
from huggingface_hub import interpreter_login
from huggingface_hub import login

[2024-04-26 12:44:04,021] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/longju/.cache/huggingface/token
Login successful


In [42]:
# Hugging Face model identifier passed to the text-generation pipeline below.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Build the text-generation pipeline once at module level; `pipeline` is
# reused by later cells (get_completion reads it as a global).
# The model is requested in bfloat16 via model_kwargs, with device_map="auto".
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# Two-turn chat used as a smoke test: one system message and one user message.
messages = [
    {"role": "system", "content": "You are AI assistant designed for in-context learning, you can learn from the few shot examples to generated similar contents"},
    {"role": "user", "content": "Who are you?"},
]

# Render the chat as a single prompt string (tokenize=False returns text, not
# token ids) and append the generation prompt for the assistant's turn.
prompt = pipeline.tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
)

# Generation stops on either the tokenizer's EOS token or the "<|eot_id|>" token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Sampled generation, capped at 256 new tokens.
outputs = pipeline(
    prompt,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
# Drop the first len(prompt) characters so only the newly generated text is
# printed (assumes generated_text starts with the prompt - TODO confirm).
print(outputs[0]["generated_text"][len(prompt):])


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I am an AI assistant designed for in-context learning. I can learn from a few shot examples and generate similar contents. This means that if you provide me with a few examples of a specific topic, style, or tone, I can learn from those examples and generate new content that is similar in nature. I'm here to help you with your queries, provide information, and even assist you in generating content. So, what would you like to know or create?


In [43]:
def get_completion(user_prompt, system_prompt=None, max_new_tokens=256,
                   temperature=0.6, top_p=0.9):
    """Generate one chat completion for ``user_prompt`` with the global ``pipeline``.

    The system and user messages are rendered with the tokenizer's chat
    template, sampled generation is run, and only the text produced after the
    rendered prompt is returned.

    Args:
        user_prompt: Content of the user turn. It is interpolated into an
            f-string, so non-string values are converted with ``str()``.
        system_prompt: Content of the system turn. ``None`` (the default)
            selects the notebook's original in-context-learning system prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to the pipeline.
        top_p: Nucleus-sampling threshold forwarded to the pipeline.

    Returns:
        str: ``generated_text`` of the first pipeline output with the first
        ``len(prompt)`` characters removed.

    Note:
        Relies on the module-level ``pipeline`` object created in an earlier
        cell; that cell must have been executed first.
    """
    if system_prompt is None:
        system_prompt = "You are AI assistant designed for in-context learning, you can learn from the few shot examples to generated similar contents"

    tokenizer = pipeline.tokenizer

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{user_prompt}"},
    ]

    # Render the conversation as plain text, ending with the assistant header
    # so the model continues with its own turn.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Stop on either the tokenizer's EOS token or the "<|eot_id|>" token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    outputs = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        # Passing the pad token explicitly avoids the per-call
        # "Setting `pad_token_id` to `eos_token_id`" warning; the value is the
        # same one that warning reports falling back to.
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )

    # The pipeline output echoes the prompt; keep only what follows it.
    return outputs[0]["generated_text"][len(prompt):]

# Example usage
# Smoke test for get_completion: send a single user message and print the reply.
# Note: this rebinds the module-level name `prompt` to the raw user message.
prompt = "Hey how are you doing today?"
responses = get_completion(prompt)
print(responses)


# from openai import OpenAI
# client = OpenAI()

# completion = client.chat.completions.create(
#   model="gpt-4-turbo-2024-04-09",
#   messages=[
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": "Hello!"}
#   ]
# )

# print(completion.choices[0].message)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I'm doing well, thanks for asking! I'm a large language model, so I don't have emotions like humans do, but I'm always happy to chat with you and help with any questions or tasks you have. How about you? How's your day going?


In [44]:
def load_examples(file_path, encoding="utf-8"):
    """Read the few-shot examples file and return its text.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        str: The file contents with leading and trailing whitespace removed.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read().strip()

In [45]:
def generate_data(csv_input, csv_output, examples_file, max_rows=5):
    """Generate one sentence per input row with the LLM and save them as CSV.

    For every processed row of ``csv_input`` a prompt is built from the
    few-shot examples plus an instruction filled with the row's
    ``#CUE_COLUMN``, ``FRAGMENT_COLUMN`` and ``LABEL_COLUMN`` values. The
    text returned by ``get_completion`` is stored in ``INSTANCE_COLUMN``.

    Args:
        csv_input: Path of the source CSV. It must contain the columns
            ``#CUE_COLUMN``, ``FRAGMENT_COLUMN`` and ``LABEL_COLUMN``.
        csv_output: Path the generated rows are written to (no index column).
        examples_file: Path of the few-shot examples text file, read with
            ``load_examples``.
        max_rows: Process only the first ``max_rows`` rows. The default of 5
            keeps the notebook's original quick-test behaviour; pass ``None``
            to process the whole file.

    Returns:
        None. The result is written to ``csv_output`` and a confirmation line
        is printed.
    """
    # Load the examples for few-shot learning
    examples = load_examples(examples_file)
    # Load the CSV file
    goldstandard = pd.read_csv(csv_input)

    # Optional truncation so a test run does not process the full dataset.
    if max_rows is not None:
        goldstandard = goldstandard.head(max_rows)

    # Fixed column order; also guarantees a header when no rows are processed.
    output_columns = ['#CUE_COLUMN', 'FRAGMENT_COLUMN', 'INSTANCE_COLUMN', 'LABEL_COLUMN']
    generated_data = []

    # Process each row in the DataFrame
    for _, row in tqdm(goldstandard.iterrows(), total=goldstandard.shape[0], desc="Generating Data"):
        prompt = f"{examples}\n\n" \
                 f"INSTRUCTION: Write a sentence with the semantic meaning the same as [{row['#CUE_COLUMN']}], " \
                 f"use the template [{row['FRAGMENT_COLUMN']}], the sentence should be euphemistic not explicit, " \
                 f"the sentence should be [{row['LABEL_COLUMN']}]." \
                 f"\n\n OUTPUT:"
        generated_sentence = get_completion(prompt)
        generated_data.append({
            '#CUE_COLUMN': row['#CUE_COLUMN'],
            'FRAGMENT_COLUMN': row['FRAGMENT_COLUMN'],
            'INSTANCE_COLUMN': generated_sentence,
            'LABEL_COLUMN': row['LABEL_COLUMN']
        })

    # Create a DataFrame from the generated data and write it to the path
    # given by the caller (the csv_output parameter, not a notebook global).
    new_df = pd.DataFrame(generated_data, columns=output_columns)
    new_df.to_csv(csv_output, index=False)
    print("Data generation complete and saved to", csv_output)

In [46]:
# File paths passed to generate_data in the next cell.
# input_csv: source CSV, given relative to the notebook's working directory.
input_csv = '../modified_goldstandard.csv'
# output_csv: destination for the generated rows (currently the test-run file).
output_csv = 'llama_augmented_data_test.csv'
# output_csv = 'llama3_augmented_data.csv'
# examples_file: text file holding the few-shot examples.
examples_file = 'llama_prompt.txt'

In [47]:
# Run the generation using the paths configured in the previous cell.
generate_data(input_csv, output_csv, examples_file)

Generating Data:   0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Generating Data:  20%|██        | 1/5 [00:10<00:41, 10.31s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Generating Data:  40%|████      | 2/5 [00:20<00:30, 10.11s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Generating Data:  60%|██████    | 3/5 [00:28<00:18,  9.29s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Generating Data:  80%|████████  | 4/5 [00:38<00:09,  9.39s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Generating Data: 100%|██████████| 5/5 [00:48<00:00,  9.75s/it]Generating Data: 100%|██████████| 5/5 [00:48<00:00,  9.71s/it]

Data generation complete and saved to llama_augmented_data_test.csv



