## Make a request to a ChatGPT model

The following example is taken from the OpenAI Python quickstart: https://platform.openai.com/docs/quickstart?context=python


In [1]:
# from openai import OpenAI
# client = OpenAI()
# 
# completion = client.chat.completions.create(
#   model="gpt-3.5-turbo",
#   messages=[
#     {"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
#     {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
#   ]
# )

# print(completion.choices[0].message)

## Try generating related tweets

In [2]:
import pandas as pd

# Load the cleaned training data and split it into one frame per class label,
# keeping only the columns needed for augmentation.
df_train = pd.read_csv("./data/train_clean_v03.csv")
# train_ids = df_train['id']
print(f"shape of base data with spillovers fixed and duplicates removed: {df_train.shape}")
df_train_class0 = df_train.loc[df_train['target'] == 0, ['id', 'text', 'target']]
df_train_class1 = df_train.loc[df_train['target'] == 1, ['id', 'text', 'target']]
print(f"shape of same base data with target == 0: {df_train_class0.shape}")
print(f"shape of same base data with target == 1: {df_train_class1.shape}")
print()

# Empty frame with the (id, text, target) schema for class 0.
data = {'id': [], 'text': [], 'target': []}
df_aug_class0 = pd.DataFrame(data, columns = ['id', 'text', 'target'])

# Copy the class-1 rows in one step with a fresh 0..n-1 index, instead of
# appending row by row via `.loc[len(df)]` (each append re-allocates the frame).
df_aug_class1 = df_train_class1[['id', 'text', 'target']].reset_index(drop=True)

# Preview the class-1 rows whose original index label is below 5.
for i, row in df_train_class1.loc[df_train_class1.index < 5].iterrows():
    print(f"index={i} | {row['id']},{row['text']},{row['target']}")

print(f"shape of df_aug_class1: {df_aug_class1.shape}")
print()
df_aug_class1.head(10)

shape of base data with spillovers fixed and duplicates removed: (7485, 5)
shape of same base data with target == 0: (4297, 3)
shape of same base data with target == 1: (3188, 3)

index=0 | 1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
index=1 | 4,Forest fire near La Ronge Sask. Canada,1
index=2 | 5,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
index=3 | 6,13,000 people receive #wildfires evacuation orders in California ,1
index=4 | 7,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ,1
shape of df_aug_class1: (3188, 3)



Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1
5,8,#RockyFire Update => California Hwy. 20 closed...,1
6,10,#flood #disaster Heavy rain causes flash flood...,1
7,13,I'm on top of the hill and I can see a fire in...,1
8,14,There's an emergency evacuation happening now ...,1
9,15,I'm afraid that the tornado is coming to our a...,1


In [3]:
# Pick one reference tweet per class by its id (57 for class 0, 56 for class 1).
is_id_57 = df_train_class0['id'] == 57
is_id_56 = df_train_class1['id'] == 56
base_tweet_class0 = df_train_class0.loc[is_id_57, 'text'].values[0]
base_tweet_class1 = df_train_class1.loc[is_id_56, 'text'].values[0]
print(base_tweet_class0, base_tweet_class1, sep="\n")

Ablaze for you Lord :D
Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...  http://t.co/wDUEaj8Q4J


In [4]:
# Prompt pieces for a single request: a system context plus one instruction
# prefix per class. The two prefixes differ only in the word "activity"
# (class 0) versus "disaster" (class 1).
context = "You are a fiction writer who has observed a disaster and likes to tweet."
prompt_template = (
    "Write me a tweet similar to this one, under 141 characters, "
    "does not use contractions, but refers to a different {subject} and location: "
)
start_prompt_class0 = prompt_template.format(subject="activity")
start_prompt_class1 = prompt_template.format(subject="disaster")

# Full prompt = instruction prefix followed by the reference tweet.
complete_prompt_class0 = start_prompt_class0 + base_tweet_class0
complete_prompt_class1 = start_prompt_class1 + base_tweet_class1
print(complete_prompt_class0, complete_prompt_class1, sep="\n")

Write me a tweet similar to this one, under 141 characters, does not use contractions, but refers to a different activity and location: Ablaze for you Lord :D
Write me a tweet similar to this one, under 141 characters, does not use contractions, but refers to a different disaster and location: Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...  http://t.co/wDUEaj8Q4J


In [5]:
# test batch of 3 tweets
# df_aug_class1_test3 = df_aug_class1.iloc[:3]
# print(context)
# df_aug_class1_test3

In [6]:
# df_train['id'].max()  # 10873, add 20,000 to id of augmented samples

In [7]:
# df_train_chunk = df_train_class1.iloc[:100]
# print(df_train_chunk.shape)
# df_train_chunk.head(3)

In [8]:
from openai import OpenAI
import projtools as pt

# Shared state used by the tweet-generation cells.
client = OpenAI()

# Added to a source tweet's id to form the id of its generated counterpart.
aug_offest = 20000

# Generated tweets keyed by augmented id, one dict per class.
aug_tweets_class0, aug_tweets_class1 = {}, {}

# First 100 rows of each class are used as the source chunk.
df_train_chunk_class0 = df_train_class0.head(100)
df_train_chunk_class1 = df_train_class1.head(100)

In [None]:
import time

# Generate one augmented tweet for each row of the class-0 chunk.
# t0/t1 bracket the whole loop so the elapsed time can be reported afterwards.
t0 = time.time()
rows_processed = 1
for i, row in df_train_chunk_class0.iterrows():
    # Prompt = class-0 instruction prefix followed by the source tweet text.
    prompt_content = start_prompt_class0 + row['text']
    gen_tweet = pt.get_aug_tweet(context, prompt_content)

    # Store the result under the source id shifted by the augmentation offset.
    aug_id = row['id'] + aug_offest
    aug_tweets_class0[aug_id] = gen_tweet

    # Progress line on every 10th row.
    if not rows_processed % 10:
        print(f"processing row {rows_processed} with id {row['id']}")
    rows_processed += 1
t1 = time.time()

In [17]:
# Report how long the class-0 generation loop took, in minutes.
# The count is taken from the chunk that the loop iterates over, so the
# message stays correct if the chunk size is changed.
gen_tweet_count = len(df_train_chunk_class0)
print(f"time to do {gen_tweet_count} is {(t1-t0)/60.} minutes")

time to do 100 is 3.6672049085299174 minutes


In [18]:
# Write the generated class-0 tweets (label 0) to CSV. The call is kept as the
# cell's last expression so that its return value is displayed.
class0_csv_path = "./data/aug_tweets_class0_v01prompt_0000_0100.csv"
pt.write_aug_tweets(aug_tweets_class0, 0, class0_csv_path)

True

In [19]:
# Generate one augmented tweet for each row of the class-1 chunk, report the
# elapsed time, then write the results to CSV.
t0 = time.time()
rows_processed = 1
for i, row in df_train_chunk_class1.iterrows():
    # Prompt = class-1 instruction prefix followed by the source tweet text.
    prompt_content = start_prompt_class1 + row['text']
    gen_tweet = pt.get_aug_tweet(context, prompt_content)

    # Store the result under the source id shifted by the augmentation offset.
    aug_id = row['id'] + aug_offest
    aug_tweets_class1[aug_id] = gen_tweet

    # Progress line on every 10th row.
    if not rows_processed % 10:
        print(f"processing row {rows_processed} with id {row['id']}")
    rows_processed += 1
t1 = time.time()

print(f"time to do {gen_tweet_count} is {(t1-t0)/60.} minutes")

# Last expression of the cell, so the return value of the write is displayed.
class1_csv_path = "./data/aug_tweets_class1_v01prompt_0000_0100.csv"
pt.write_aug_tweets(aug_tweets_class1, 1, class1_csv_path)

processing row 10 with id 15
processing row 20 with id 66
processing row 30 with id 98
processing row 40 with id 126
processing row 50 with id 139
processing row 60 with id 208
processing row 70 with id 222
processing row 80 with id 244
processing row 90 with id 262
processing row 100 with id 289
time to do 100 is 4.634544682502747 minutes


True

In [None]:
import pickle

pickle_file = './data/first3_class1.pickle'

# Store data (serialize) only when `aug_responses` exists in this kernel.
# On a fresh kernel the name is undefined; dumping unconditionally would first
# truncate the existing pickle file and then fail with NameError, destroying
# the cached responses. In that case the cell skips straight to loading.
if 'aug_responses' in globals():
    with open(pickle_file, 'wb') as handle:
        pickle.dump(aug_responses, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Load data (deserialize).
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickle files that were produced by this notebook.
with open(pickle_file, 'rb') as handle:
    aug_responses = pickle.load(handle)

# print(aug_responses == unserialized_data)
print(aug_responses)

In [None]:


# Show the keys of the stored responses, then the message text of the first
# choice of each response.
print(aug_responses.keys())
for key, response in aug_responses.items():
    cgpt_response = response.choices[0].message.content
    print(cgpt_response)