In [1]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import os
from tqdm import tqdm

In [2]:
def setup_flan_t5(model_id="google/flan-t5-xxl", device="cuda:0"):
    """Load a FLAN-T5 seq2seq model together with its tokenizer.

    Args:
        model_id: Model identifier handed to ``from_pretrained`` for both the
            model and the tokenizer. Defaults to the checkpoint this notebook
            was originally run with.
        device: Device string the model is moved to (for example ``"cuda:0"``
            or ``"cpu"``). Change as needed based on your GPU setup.

    Returns:
        A ``(model, tokenizer, device)`` tuple. ``device`` is returned so that
        callers can move tokenized inputs to the same device as the model.
    """
    # Load the tokenizer first: a bad model_id then fails before the model
    # weights are loaded and moved to the device.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)
    return model, tokenizer, device

In [3]:
# Load the model and tokenizer once; get_completion below reads these three
# module-level names (model, tokenizer, device) directly.
model, tokenizer, device = setup_flan_t5()



[2024-04-26 06:44:47,857] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [4]:
# Smoke test: tokenize a fixed prompt, move the tensors to the model's device,
# generate with max_length=120, and print the first decoded sequence.
inputs = tokenizer("A step by step recipe to make bolognese pasta:", return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_length=120)
# batch_decode returns one string per generated sequence; only the first is shown.
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

Add the ground beef to a large skillet and cook over medium heat until browned, about 8 minutes. Add the onions and cook until softened, about 5 minutes. Add the garlic and cook for 1 minute. Add the wine and cook until evaporated, about 1 minute. Add the tomatoes, tomato paste, salt, pepper, and thyme. Bring to a boil, then reduce the heat and simmer for 1 hour. Add the parsley and cook for 1 minute. Serve over cooked pasta.


In [5]:
def get_completion(prompt, max_length=120):
    """Generate a single completion for ``prompt`` with the loaded model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device`` created
    by ``setup_flan_t5()``.

    Args:
        prompt: Text to feed to the model.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
            Defaults to 120, the value previously hard-coded here.

    Returns:
        The first decoded sequence as a string, with special tokens removed.
    """
    # Inputs must live on the same device as the model.
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_length=max_length)
    # One prompt in, so only the first decoded sequence is relevant.
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    
# Example usage: run one prompt through get_completion and show the result.
prompt = "do you know incontext learning?"
output_sentence = get_completion(prompt)
# An empty completion is falsy, so `or` substitutes the fallback notice.
print(output_sentence or "Prompt was skipped or an error occurred.")


# from openai import OpenAI
# client = OpenAI()

# completion = client.chat.completions.create(
#   model="gpt-4-turbo-2024-04-09",
#   messages=[
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": "Hello!"}
#   ]
# )

# print(completion.choices[0].message)

a method of learning that is based on the context of the situation


In [6]:
def load_examples(file_path, encoding="utf-8"):
    """Read the few-shot examples file and return its text.

    Args:
        file_path: Path of the text file holding the prompt examples.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file content with leading and trailing whitespace stripped.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read().strip()

In [7]:
def generate_data(csv_input, csv_output, examples_file, limit=None):
    """Generate one sentence per gold-standard row and save the results as CSV.

    For every row of ``csv_input`` a few-shot prompt is built from the examples
    text plus the row's cue, fragment and label, and sent to
    ``get_completion``. The completion is stored in ``INSTANCE_COLUMN`` next to
    the copied ``#CUE_COLUMN``, ``FRAGMENT_COLUMN`` and ``LABEL_COLUMN`` values.

    Args:
        csv_input: Path of the input CSV. It must contain the columns
            ``#CUE_COLUMN``, ``FRAGMENT_COLUMN`` and ``LABEL_COLUMN``.
        csv_output: Path the generated CSV is written to.
        examples_file: Path of the text file with the few-shot examples.
        limit: Optional number of leading rows to process, for quick test
            runs. ``None`` (the default) processes every row.

    Returns:
        None. The result is written to ``csv_output``.
    """
    output_columns = ['#CUE_COLUMN', 'FRAGMENT_COLUMN', 'INSTANCE_COLUMN', 'LABEL_COLUMN']

    # Load the examples for few-shot learning
    examples = load_examples(examples_file)
    # Load the CSV file
    goldstandard = pd.read_csv(csv_input)
    if limit is not None:
        goldstandard = goldstandard.head(limit)

    generated_data = []

    # Process each row in the DataFrame
    for _, row in tqdm(goldstandard.iterrows(), total=len(goldstandard), desc="Generating Data"):
        cue = row['#CUE_COLUMN']
        fragment = row['FRAGMENT_COLUMN']
        label = row['LABEL_COLUMN']
        prompt = (
            f"{examples}\n\n"
            f"INSTRUCTION: Write a sentence with the semantic meaning the same as [{cue}], "
            f"use the template [{fragment}], the sentence should be euphemistic not explicit, "
            f"the sentence should be [{label}]."
            f"\n\n OUTPUT:"
        )
        generated_data.append({
            '#CUE_COLUMN': cue,
            'FRAGMENT_COLUMN': fragment,
            'INSTANCE_COLUMN': get_completion(prompt),
            'LABEL_COLUMN': label,
        })

    # Passing the columns explicitly keeps the header even when no rows were generated.
    new_df = pd.DataFrame(generated_data, columns=output_columns)
    # Write to the path given by the caller, not to a notebook-level global.
    new_df.to_csv(csv_output, index=False)
    print("Data generation complete and saved to", csv_output)

In [8]:
# Paths passed to generate_data in the next cell.
# The input CSV is read from the parent directory of the working directory.
input_csv = '../modified_goldstandard.csv'
# Alternative output name (disabled):
# output_csv = 'flant5_augmented_data_test.csv'
output_csv = 'flant5_augmented_data.csv'
# Text file whose content is prepended to every prompt as few-shot examples.
examples_file = 'flant5_prompt.txt'

In [9]:
# Run the generation over the input CSV; one model call is made per row, and
# the results are saved as a CSV file.
generate_data(input_csv, output_csv, examples_file)

Generating Data:   0%|          | 0/1797 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (940 > 512). Running this sequence through the model will result in indexing errors
Generating Data:   0%|          | 1/1797 [00:01<34:42,  1.16s/it]Generating Data:   0%|          | 2/1797 [00:02<33:59,  1.14s/it]Generating Data:   0%|          | 3/1797 [00:03<35:05,  1.17s/it]Generating Data:   0%|          | 4/1797 [00:04<34:27,  1.15s/it]Generating Data:   0%|          | 5/1797 [00:06<37:52,  1.27s/it]Generating Data:   0%|          | 6/1797 [00:07<40:25,  1.35s/it]Generating Data:   0%|          | 7/1797 [00:08<38:09,  1.28s/it]Generating Data:   0%|          | 8/1797 [00:10<39:04,  1.31s/it]Generating Data:   1%|          | 9/1797 [00:11<41:17,  1.39s/it]Generating Data:   1%|          | 10/1797 [00:13<41:00,  1.38s/it]Generating Data:   1%|          | 11/1797 [00:14<41:22,  1.39s/it]Generating Data:   1%|          

Data generation complete and saved to flant5_augmented_data.csv



