In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from openai import OpenAI
import os
import random
from tqdm import tqdm
import time

In [2]:
# Shard selector: chooses which 1000-row slice of the filtered data this run
# processes and is embedded in the name of the output CSV.
VERSION = 0
# NOTE(review): NUMS is not referenced anywhere else in the visible notebook —
# confirm whether it is still needed.
NUMS = 1

In [3]:
# Load the labelled text dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../train_v3_drcat_02.csv")
df.head()

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,model
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False,human
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False,human
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False,human
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False,human
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False,human


In [4]:
# Prompts to keep; rows whose prompt_name is not listed here are dropped.
dependent_prompt_names = [
    'Car-free cities',
    '"A Cowboy Who Rode the Waves"',
    'Exploring Venus',
    'Facial action coding system',
    'The Face on Mars',
    'Driverless cars',
    'Does the electoral college work?',
]
# Keep only rows with label 0 that belong to one of the prompts above.
label_is_zero = df["label"] == 0
prompt_is_listed = df["prompt_name"].isin(dependent_prompt_names)
df = df[label_is_zero & prompt_is_listed]
df.shape

(14249, 6)

In [5]:
# Preview the rows that survived the filter.
df.head()

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,model
1168,Cars have been around for awhile and they have...,0,Car-free cities,persuade_corpus,True,human
1169,Have you ever thought what it would be like no...,0,Car-free cities,persuade_corpus,True,human
1170,What you are about to read is going to give yo...,0,Car-free cities,persuade_corpus,True,human
1171,cars have many flaws nd and in this day and ag...,0,Car-free cities,persuade_corpus,True,human
1172,There are many advantages of limiting car usag...,0,Car-free cities,persuade_corpus,True,human


In [6]:
# Take this run's shard: rows [VERSION*1000, (VERSION+1)*1000) by position.
# A slice that runs past the end is clamped, so no explicit bound on len(df) is needed.
# Caution: this rebinds `df`, so re-running the cell slices the already-sliced frame.
df = df.iloc[VERSION * 1000:(VERSION + 1) * 1000]

In [7]:
# Load the per-prompt instructions and source texts.
# NOTE(review): the preview below shows "¡°...¡±" where quotation marks are expected, which
# suggests ISO-8859-1 may not be the file's real encoding — confirm against the source file.
# NOTE(review): `prompt` is not used by any later cell in the visible notebook.
prompt = pd.read_csv("../new_train_prompts.csv", encoding="ISO-8859-1")
prompt.head(10)

Unnamed: 0,prompt_id,prompt_name,instructions,source_text
0,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
1,1,Does the electoral college work?,Write a letter to your state senator in which ...,# What Is the Electoral College? by the Office...
2,2,Exploring Venus,"In ""The Challenge of Exploring Venus,"" the aut...",# Unmasking the Face on Mars by Dr. Tony Phill...
3,3,The Face on Mars,You have read the article 'Unmasking the Face ...,
4,4,Facial action coding system,"In the article ""Making Mona Lisa Smile,"" the a...",1. **FACS Definition**:\n - The Facial Actio...
5,5,Driverless cars,"In the article ¡°Driverless Cars are Coming,¡±...","# Autonomous Vehicles: Evolution, Challenges, ..."
6,6,"""A Cowboy Who Rode the Waves""","You have just read the article, 'A Cowboy Who ...",# Seagoing Cowboys: A Historical Overview\n\n1...


In [8]:
# Count how many rows of the current `df` belong to each prompt.
df["prompt_name"].value_counts()

prompt_name
Car-free cities    1000
Name: count, dtype: int64

In [9]:
# Credentials must come from the environment. Assigning the key here would save
# the secret with the notebook and overwrite a key that was exported correctly.
# Fail fast when it is missing so the long generation loop does not start with
# requests that can only fail.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment that launches "
        "the notebook kernel before running this cell."
    )
# OpenAI() reads the key from the OPENAI_API_KEY environment variable.
client = OpenAI()

In [10]:
# Output CSV for this shard. When it already exists, load it so earlier results
# are kept; otherwise start an empty frame with the expected columns and write it.
file_name = f"lzc_dataset_0117_{VERSION}.csv"

if not os.path.exists(file_name):
    print(f"The file {file_name} does not exist.")
    save_df = pd.DataFrame(columns=['text', 'label', 'prompt_name', 'type'])
    save_df.to_csv(file_name, index=False)
else:
    print(f"The file {file_name} exists.")
    save_df = pd.read_csv(file_name)

The file lzc_dataset_0117_0.csv does not exist.


In [11]:
def generate_dataset(model_input, idx, max_attempts=5):
    """Send ``model_input`` to the chat model and return the generated text.

    The sampling parameters are drawn once per call (``temperature`` from
    ``random.random()``, ``top_p`` from ``random.uniform(0.5, 1.0)``) and are
    reused for every retry of that call.

    Args:
        model_input: Full prompt text, sent as a single user message.
        idx: Identifier of the source row; used only in the log messages.
        max_attempts: Number of times the request is tried before giving up.

    Returns:
        The message content of the first returned choice, or ``None`` when
        every attempt raised an exception.
    """
    messages = [{"role": "user", "content": model_input}]

    random_temperature = random.random()
    random_topp = random.uniform(0.5, 1.0)

    for attempt in range(max_attempts):
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo-1106",
                messages=messages,
                max_tokens=1024,
                frequency_penalty=1.12,
                temperature=random_temperature,
                top_p=random_topp
            )
            return_text = response.choices[0].message.content
        except Exception as e:
            # Deliberately broad: this is a best-effort batch job, so any failure
            # of the request or of reading the response is logged and retried.
            print(f"Attempt {attempt + 1} on {idx} failed: {e}")
            # Back off 1s, 2s, 4s, ... (capped at 30s); no wait after the last attempt.
            if attempt + 1 < max_attempts:
                time.sleep(min(2 ** attempt, 30))
            continue

        print(f"Generated Successfully On {idx}!!!")
        return return_text

    # Reached only when every attempt failed.
    print(f"version {VERSION} idx {idx} failed at last")
    return None

In [12]:
# Row indices the generation loop has already handled; the loop skips any index found here.
# Held in memory only: re-running this cell (or restarting the kernel) empties it.
numbers_set = set()

In [13]:
# For every essay in the shard, request three model rewrites and checkpoint them to the CSV.
for idx, row in tqdm(df.iterrows(), total=len(df)):

    # Essays already completed in this kernel session are skipped, so the cell
    # can be re-run after an interruption without repeating finished work.
    if idx in numbers_set:
        continue

    human_text = row["text"]
    prompt_name = row["prompt_name"]

    model_input = "The following is a human-written article. Now, please go through the following text, optimizing sentence structures, correcting grammatical errors, while ensuring that the meaning of the article remains unchanged. Just return the modified article.\n" + \
    "article: " + human_text

    # Collect this essay's generations first. A None result means every API attempt
    # failed; it is dropped so no empty-text row labelled 1 ends up in the dataset.
    generated_rows = []
    for _ in range(3):
        model_output = generate_dataset(model_input=model_input, idx=idx)
        if model_output is None:
            continue
        generated_rows.append({
            "text": model_output,
            "label": 1,
            "prompt_name": prompt_name,
            "type": "auto-generated",
        })

    if not generated_rows:
        # Nothing was produced; leave idx unmarked so a later re-run retries this essay.
        continue

    # One concat and one CSV write per essay instead of one per generation.
    save_df = pd.concat([save_df, pd.DataFrame(generated_rows)], ignore_index=True)
    save_df.to_csv(f"lzc_dataset_0117_{VERSION}.csv", index=False)

    # Mark the essay done only after its rows are on disk, so an interruption
    # mid-essay leads to a clean retry rather than a silently skipped row.
    numbers_set.add(idx)

  0%|          | 0/1000 [00:00<?, ?it/s]

Generated Successfully On 1168!!!
Generated Successfully On 1168!!!


  0%|          | 1/1000 [00:30<8:23:57, 30.27s/it]

Generated Successfully On 1168!!!
Generated Successfully On 1169!!!
Generated Successfully On 1169!!!


  0%|          | 2/1000 [00:45<6:00:31, 21.67s/it]

Generated Successfully On 1169!!!
Generated Successfully On 1170!!!
Generated Successfully On 1170!!!


  0%|          | 3/1000 [01:12<6:37:16, 23.91s/it]

Generated Successfully On 1170!!!
Generated Successfully On 1171!!!
Generated Successfully On 1171!!!


  0%|          | 4/1000 [01:29<5:51:49, 21.19s/it]

Generated Successfully On 1171!!!
Generated Successfully On 1172!!!
Generated Successfully On 1172!!!


  0%|          | 5/1000 [01:54<6:14:47, 22.60s/it]

Generated Successfully On 1172!!!
Generated Successfully On 1173!!!
Generated Successfully On 1173!!!


  1%|          | 6/1000 [02:31<7:36:06, 27.53s/it]

Generated Successfully On 1173!!!
Generated Successfully On 1174!!!
Generated Successfully On 1174!!!


  1%|          | 7/1000 [02:58<7:32:24, 27.34s/it]

Generated Successfully On 1174!!!
Generated Successfully On 1175!!!
Generated Successfully On 1175!!!


  1%|          | 8/1000 [03:30<7:53:03, 28.61s/it]

Generated Successfully On 1175!!!
Generated Successfully On 1176!!!
Generated Successfully On 1176!!!


  1%|          | 9/1000 [04:11<8:58:26, 32.60s/it]

Generated Successfully On 1176!!!
Generated Successfully On 1177!!!
Generated Successfully On 1177!!!


  1%|          | 10/1000 [04:48<9:22:53, 34.11s/it]

Generated Successfully On 1177!!!
Generated Successfully On 1178!!!
Generated Successfully On 1178!!!


  1%|          | 11/1000 [05:18<9:01:09, 32.83s/it]

Generated Successfully On 1178!!!
Generated Successfully On 1179!!!
Generated Successfully On 1179!!!


  1%|          | 12/1000 [05:45<8:29:17, 30.93s/it]

Generated Successfully On 1179!!!
Generated Successfully On 1180!!!
Generated Successfully On 1180!!!


  1%|▏         | 13/1000 [06:31<9:44:07, 35.51s/it]

Generated Successfully On 1180!!!
Generated Successfully On 1181!!!
Generated Successfully On 1181!!!


  1%|▏         | 14/1000 [07:05<9:35:27, 35.02s/it]

Generated Successfully On 1181!!!
Generated Successfully On 1182!!!
Generated Successfully On 1182!!!


  2%|▏         | 15/1000 [07:32<8:57:10, 32.72s/it]

Generated Successfully On 1182!!!
Generated Successfully On 1183!!!
Generated Successfully On 1183!!!


  2%|▏         | 16/1000 [07:55<8:08:26, 29.78s/it]

Generated Successfully On 1183!!!
Generated Successfully On 1184!!!
Generated Successfully On 1184!!!


  2%|▏         | 17/1000 [08:25<8:07:36, 29.76s/it]

Generated Successfully On 1184!!!
Generated Successfully On 1185!!!
Generated Successfully On 1185!!!


  2%|▏         | 18/1000 [08:47<7:28:30, 27.40s/it]

Generated Successfully On 1185!!!
Generated Successfully On 1186!!!
Generated Successfully On 1186!!!


  2%|▏         | 19/1000 [09:12<7:15:50, 26.66s/it]

Generated Successfully On 1186!!!
Generated Successfully On 1187!!!
Generated Successfully On 1187!!!


  2%|▏         | 20/1000 [09:29<6:27:12, 23.71s/it]

Generated Successfully On 1187!!!
Generated Successfully On 1188!!!
Generated Successfully On 1188!!!


  2%|▏         | 21/1000 [10:01<7:07:32, 26.20s/it]

Generated Successfully On 1188!!!
Generated Successfully On 1189!!!
Generated Successfully On 1189!!!


  2%|▏         | 22/1000 [10:41<8:16:02, 30.43s/it]

Generated Successfully On 1189!!!
Generated Successfully On 1190!!!
Generated Successfully On 1190!!!


In [None]:
# Reload the saved dataset from disk to inspect what was actually written.
save_df = pd.read_csv(f"lzc_dataset_0117_{VERSION}.csv")
save_df.head()