In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from openai import OpenAI
import os
import random
from tqdm import tqdm
import time

In [None]:
# Shard selector: chooses which 1000-row slice of the filtered essays this run
# processes and which lzc_dataset_{VERSION}.csv file receives the results.
VERSION = 0
# Generations requested per prompt variant for each essay (the variant that has
# no source text requests NUMS * 2).
NUMS = 1

In [None]:
# Load the essay dataset; later cells read its "text", "label" and
# "prompt_name" columns.
df = pd.read_csv("../train_v3_drcat_02.csv")
df.head()

In [None]:
# Prompt names considered for generation.
dependent_prompt_names = [
    'Car-free cities',
    '"A Cowboy Who Rode the Waves"',
    'Exploring Venus',
    'Facial action coding system',
    'The Face on Mars',
    'Driverless cars',
    'Does the electoral college work?',
]

# Keep label-0 essays written for one of the prompts above, then drop the two
# prompts excluded from this run. With those two removed, no remaining row
# matches the source-text prompt names checked in the generation loop.
df = df[
    df["label"].eq(0)
    & df["prompt_name"].isin(dependent_prompt_names)
    & ~df["prompt_name"].isin(['Car-free cities', 'Does the electoral college work?'])
]
df.shape

In [None]:
# Preview the essays that survived the filtering above.
df.head()

In [None]:
# Keep only this run's shard: positions [VERSION * 1000, (VERSION + 1) * 1000).
# A slice that runs past the end is clamped, so the last shard may be shorter.
# NOTE(review): this rebinds `df`, so re-running the cell with VERSION >= 1
# slices the already-sliced frame and leaves it empty; re-run from the load cell.
df = df.iloc[VERSION * 1000:(VERSION + 1) * 1000]

In [None]:
# Prompt metadata; the generation loop looks up "instructions" and
# "source_text" in this table by "prompt_name".
prompt = pd.read_csv("../new_train_prompts.csv", encoding="ISO-8859-1")
prompt.head(10)

In [None]:
# Number of essays per prompt in the current shard.
df["prompt_name"].value_counts()

In [None]:
# OpenAI() picks the API key up from the OPENAI_API_KEY environment variable.
# Do not paste the key into the notebook (it would be saved with the file) and
# do not overwrite an already-configured key with a blank placeholder: a blank
# key makes every request fail authentication.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the kernel."
    )
client = OpenAI()

In [None]:
# Checkpoint file for this shard. Reuse it when present so earlier results are
# kept; otherwise start an empty table with the output columns and write it out
# immediately so the file exists from here on.
file_name = f"lzc_dataset_{VERSION}.csv"

if not os.path.exists(file_name):
    print(f"The file {file_name} does not exist.")
    save_df = pd.DataFrame(columns=['text', 'label', 'prompt_name', 'type'])
    save_df.to_csv(f"lzc_dataset_{VERSION}.csv", index=False)
else:
    print(f"The file {file_name} exists.")
    save_df = pd.read_csv(file_name)

In [None]:
def generate_dataset(model_input, idx, max_attempts=5, retry_delay=1):
    """Request one essay from the chat model, retrying when a call raises.

    A temperature (random.random()) and a top_p (random.uniform(0.3, 1.0)) are
    drawn once per call and reused for every retry of that call.

    Args:
        model_input: Full prompt text, sent as a single user message.
        idx: Identifier of the source row; only used in the progress messages.
        max_attempts: Maximum number of API calls before giving up.
        retry_delay: Seconds to wait after a failed attempt before retrying.

    Returns:
        The message content of the first returned choice, or None when every
        attempt raised an exception.
    """
    messages = [
        {
            "role": "user",
            "content": model_input
        }
    ]

    random_temperature = random.random()
    random_topp = random.uniform(0.3, 1.0)

    for attempt in range(max_attempts):
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo-16k",
                messages=messages,
                max_tokens=400,
                frequency_penalty=1.12,
                temperature=random_temperature,
                top_p=random_topp
            )
            # Reading the content stays inside the try so that a malformed
            # response is retried like any other failure.
            return_text = response.choices[0].message.content
            print(f"Generated Successfully On {idx}!!!")
            return return_text
        except Exception as e:
            print(f"Attempt {attempt + 1} on {idx} failed: {e}")
            # No point waiting after the last attempt; nothing follows it.
            if attempt + 1 < max_attempts:
                time.sleep(retry_delay)

    # Reaching this point means no attempt returned.
    print(f"version {VERSION} idx {idx} failed at last")
    return None

In [None]:
# Row indices the generation loop has already started on; lets the loop cell be
# re-run without repeating them. Re-running this cell clears that memory.
numbers_set = set()

In [None]:
# For every human essay in the shard, ask the model for machine-written
# counterparts and checkpoint each one to lzc_dataset_{VERSION}.csv as it
# arrives, so an interrupted run keeps everything generated so far.
for idx, row in tqdm(df.iterrows(), total=len(df)):

    # Skip rows this kernel session has already started on. The index is
    # recorded before generation, so an interrupted row is not retried on re-run.
    if idx in numbers_set:
        continue
    numbers_set.add(idx)

    human_text = row["text"]
    prompt_name = row["prompt_name"]
    instruction = prompt[prompt["prompt_name"] == prompt_name]["instructions"].iloc[0]

    # Each job is (prompt text, value for the "type" column, number of generations).
    if prompt_name in ['Car-free cities', 'Does the electoral college work?']:
        source_text = prompt[prompt["prompt_name"] == prompt_name]["source_text"].iloc[0]

        model_input1 = "You need to write an essay based on the Source Text and the Requirements of the topic, simply return the article you have written, without any unnecessary content.\n 1-The word count should be no less than 150 words and no more than 500 words. 2-Avoid situations where the article is left unfinished. 3-If you complete the writing task with dedication, I will happily reward you with a $100 tip so that you can treat yourself to anything your heart desires. However, failing to approach this task seriously may result in a deduction from your wages.\n" + "Source Text:\n" + source_text + "\n" + "Requirements:\n" + instruction + "\n" + "Now start writing your essay: "

        model_input2 = "You need to write an essay based on the Source Text, the Requirements of the topic, and the Essay Template I provide to you. You can optimize the essay based on the template I provide and the requirement, or you can use your imagination to express your own views, simply return the article you have written, without any unnecessary content.\n 1-The word count should be no less than 150 words and no more than 500 words. 2-Avoid situations where the article is left unfinished. 3-If you complete the writing task with dedication, I will happily reward you with a $100 tip so that you can treat yourself to anything your heart desires. However, failing to approach this task seriously may result in a deduction from your wages.\n" + "Source Text:\n" + source_text + "\n" + "Requirements:\n" + instruction + "\n" + "Essay Template:\n" + human_text + "\n" + "Now start writing your essay: "

        jobs = [
            (model_input1, "auto-generated", NUMS),
            (model_input2, "based_on_human_text", NUMS),
        ]
    else:
        model_input = "You need to write an essay based on the Requirements of the topic and the Essay Template I provide to you. You can optimize the essay based on the template I provide and the requirement, or you can use your imagination to express your own views. Simply return the article you have written, without any unnecessary content.\n  1-The word count should be no less than 150 words and no more than 500 words. 2-Avoid situations where the article is left unfinished. 3-If you complete the writing task with dedication, I will happily reward you with a $100 tip so that you can treat yourself to anything your heart desires. However, failing to approach this task seriously may result in a deduction from your wages.\n" + "Requirements:\n" + instruction + "\n" + "Essay Template:\n" + human_text + "\n" + "Now start writing your essay: "

        jobs = [
            (model_input, "based_on_human_text_no_source", NUMS * 2),
        ]

    for job_input, job_type, job_count in jobs:
        for _ in range(job_count):
            model_output = generate_dataset(model_input=job_input, idx=idx)

            # generate_dataset returns None when every retry failed; recording
            # that would add a label-1 row with empty text to the dataset.
            if model_output is None:
                continue

            new_row = pd.DataFrame({
                "text": [model_output],
                "label": [1],
                "prompt_name": [prompt_name],
                "type": [job_type]
            })
            save_df = pd.concat([save_df, new_row], ignore_index=True)
            # Rewrite the checkpoint after every generation so nothing is lost
            # if the run is interrupted.
            save_df.to_csv(f"lzc_dataset_{VERSION}.csv", index=False)

In [None]:
# Reload the checkpoint from disk to inspect what was actually written.
save_df = pd.read_csv(f"lzc_dataset_{VERSION}.csv")
save_df.head()