In [None]:
import os
import random
import time
from getpass import getpass

import matplotlib.pyplot as plt
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

In [None]:
# Shard selector: decides which 1000-row slice of the filtered frame is
# processed and is embedded in the output file name
# (lzc_dataset_0117_<VERSION>.csv).
VERSION = 0

In [None]:
# Load the source essays; later cells read the "text", "label" and
# "prompt_name" columns of this frame.
df = pd.read_csv("../train_v3_drcat_02.csv")
df.head()

In [None]:
# Prompts whose essays are kept for rewriting.
dependent_prompt_names = [
    'Car-free cities',
    '"A Cowboy Who Rode the Waves"',
    'Exploring Venus',
    'Facial action coding system',
    'The Face on Mars',
    'Driverless cars',
    'Does the electoral college work?',
]

# Keep only label-0 rows that belong to one of the prompts above.
label_is_zero = df["label"] == 0
prompt_is_selected = df["prompt_name"].isin(dependent_prompt_names)
df = df[label_is_zero & prompt_is_selected]
df.shape

In [None]:
# Preview the rows that survived the label / prompt filter.
df.head()

In [None]:
# Take this VERSION's 1000-row shard (positional slice, end clamped to the
# frame length).
# NOTE(review): `df` is overwritten, so re-running this cell slices the
# already-sliced frame again; for VERSION > 0 that yields a different
# (usually empty) result. Re-run from the load cell instead.
shard_start = VERSION * 1000
shard_stop = min(shard_start + 1000, len(df))
df = df[shard_start:shard_stop]

In [None]:
# Prompt metadata, decoded as ISO-8859-1 (presumably the file is not valid
# UTF-8 -- confirm).
# NOTE(review): `prompt` is not referenced by any other visible cell.
prompt = pd.read_csv("../new_train_prompts.csv", encoding="ISO-8859-1")
prompt.head(10)

In [None]:
# Number of rows per prompt in the frame that will be sent to the model.
df["prompt_name"].value_counts()

In [None]:
# Never paste the API key into the notebook. Keep a key that is already
# exported in the environment; prompt for it (without echo) only when it is
# missing. The original cell overwrote any existing key with a literal.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

# OpenAI() is constructed with no arguments, as in the original cell, so it
# relies on the environment variable set or preserved above.
client = OpenAI()

In [None]:
# Checkpoint file for this shard: resume from it when present, otherwise
# create it with just the header row.
file_name = f"lzc_dataset_0117_{VERSION}.csv"

if not os.path.exists(file_name):
    print(f"The file {file_name} does not exist.")
    save_df = pd.DataFrame(columns=['text', 'label', 'prompt_name', 'type'])
    save_df.to_csv(file_name, index=False)
else:
    print(f"The file {file_name} exists.")
    save_df = pd.read_csv(file_name)

In [None]:
def generate_dataset(model_input, idx, max_attempts=5, retry_delay=1.0):
    """Send one prompt to the chat model and return its reply, retrying on errors.

    The sampling temperature and top_p are each drawn once per call, uniformly
    from [0.5, 1.0], and reused for every retry of that call.

    Args:
        model_input: Full prompt text, sent as the single user message.
        idx: Identifier of the source row; only used in the log messages.
        max_attempts: Number of API calls to try before giving up.
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        The message content of the first choice in the response, or ``None``
        when every attempt raised an exception.
    """
    messages = [{"role": "user", "content": model_input}]

    random_temperature = random.uniform(0.5, 1.0)
    random_topp = random.uniform(0.5, 1.0)

    for attempt in range(1, max_attempts + 1):
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo-1106",
                messages=messages,
                max_tokens=1024,
                frequency_penalty=1.12,
                temperature=random_temperature,
                top_p=random_topp
            )
            # Kept inside the try so a malformed response (e.g. no choices)
            # is retried like any other failure.
            return_text = response.choices[0].message.content
        except Exception as e:
            print(f"Attempt {attempt} on {idx} failed: {e}")
            # No point in waiting after the last attempt.
            if attempt < max_attempts:
                time.sleep(retry_delay)
        else:
            print(f"Generated Successfully On {idx}!!!")
            return return_text

    print(f"version {VERSION} idx {idx} failed at last")
    return None

In [None]:
# Row indices already sent to the model during this kernel session.
# Re-running this cell empties the set, so the generation loop would then
# process every row again.
numbers_set = set()

In [None]:
# Instruction prefixes for the three rewrite variants; the article text is
# appended to each one to form the full prompt.
rewrite_instructions = [
    "The following is a human-written article. Now, please go through the following text, optimizing sentence structures, correcting grammatical errors, while ensuring that the meaning of the article remains unchanged. Just return the modified article.\n",
    "The following is a human-written article. Now, please rewrite this article in your writing style, also optimize sentence structures and correct grammatical errors. You must ensure that the meaning of the article remains unchanged. Just return the modified article.\n",
    "The following is a human-written article. Now, please rewrite this article in your writing style. You must ensure that the meaning of the article remains unchanged. Just return the modified article.\n",
]
output_path = f"lzc_dataset_0117_{VERSION}.csv"

for idx, row in tqdm(df.iterrows(), total=len(df)):

    # Skip rows already handled by an earlier (possibly interrupted) run of
    # this cell in the same kernel session.
    if idx in numbers_set:
        continue
    numbers_set.add(idx)

    human_text = row["text"]
    prompt_name = row["prompt_name"]

    for instruction in rewrite_instructions:
        model_output = generate_dataset(
            model_input=instruction + "article: " + human_text,
            idx=idx,
        )

        # generate_dataset returns None when every retry failed; do not store
        # an empty-text row labelled as generated.
        if model_output is None:
            continue

        new_row = pd.DataFrame({
            "text": [model_output],
            "label": [1],
            "prompt_name": [prompt_name],
            "type": ["auto-generated"]
        })
        save_df = pd.concat([save_df, new_row], ignore_index=True)
        # Checkpoint after every generation so an interruption loses at most
        # the request in flight.
        save_df.to_csv(output_path, index=False)

In [None]:
# Read the checkpoint file back from disk to inspect what was actually saved.
save_df = pd.read_csv(f"lzc_dataset_0117_{VERSION}.csv")
save_df.head()