## Make a request to a ChatGPT model

The following working example is from:  https://platform.openai.com/docs/quickstart?context=python


In [1]:
# from openai import OpenAI
# client = OpenAI()
# 
# completion = client.chat.completions.create(
#   model="gpt-3.5-turbo",
#   messages=[
#     {"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
#     {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
#   ]
# )

# print(completion.choices[0].message)

## Try generating related tweets

In [2]:
import pandas as pd
# break out classes from original training data
df_train = pd.read_csv("./data/train_clean_v03.csv", index_col='id')
# train_ids = df_train['id']
print(f"shape of base data with spillovers fixed and duplicates removed: {df_train.shape}")
df_train_class0 = df_train.loc[df_train['target'] == 0, ['text', 'target']]
df_train_class1 = df_train.loc[df_train['target'] == 1, ['text', 'target']]
print(f"shape of base data with target == 0: {df_train_class0.shape}")
print(f"shape of base data with target == 1: {df_train_class1.shape}")
print(f"max index value for class 0 training data: {df_train_class0.index.max()}")
print()
df_train_class0.head()

shape of base data with spillovers fixed and duplicates removed: (7485, 4)
shape of base data with target == 0: (4297, 2)
shape of base data with target == 1: (3188, 2)
max index value for class 0 training data: 10848



Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
23,What's up man?,0
24,I love fruits,0
25,Summer is lovely,0
26,My car is so fast,0
28,What a goooooooaaaaaal!!!!!!,0


In [3]:
print(f"max index value for class 1 training data: {df_train_class1.index.max()}")
df_train_class1.head()

max index value for class 1 training data: 10873


Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Our Deeds are the Reason of this #earthquake M...,1
4,Forest fire near La Ronge Sask. Canada,1
5,All residents asked to 'shelter in place' are ...,1
6,"13,000 people receive #wildfires evacuation or...",1
7,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# 
id_aug_class0 = []
text_aug_class0 = []
target_aug_class0 = []

aug_offset = 20000  # max tweet index is 10873

# load the class 0 training samples in a df to use for augmentation
for index, row in df_train_class0.iterrows():
    id_aug_class0.append(index + aug_offset)
    text_aug_class0.append(row['text'])
    target_aug_class0.append(row['target'])

df_aug_class0 = pd.DataFrame({
    'id': id_aug_class0,
    'text': text_aug_class0,
    'target': target_aug_class0
})

print(f"shape of df_aug_class0: {df_aug_class0.shape}")
print()
df_aug_class0.head(10)

shape of df_aug_class0: (4297, 3)



Unnamed: 0,id,text,target
0,20023,What's up man?,0
1,20024,I love fruits,0
2,20025,Summer is lovely,0
3,20026,My car is so fast,0
4,20028,What a goooooooaaaaaal!!!!!!,0
5,20031,this is ridiculous....,0
6,20032,London is cool ;),0
7,20033,Love skiing,0
8,20034,What a wonderful day!,0
9,20036,LOOOOOOL,0


In [5]:
id_aug_class1 = []
text_aug_class1 = []
target_aug_class1 = []

# load the class 1 training samples in a df to use for augmentation
for index, row in df_train_class1.iterrows():
    id_aug_class1.append(index + aug_offset)
    text_aug_class1.append(row['text'])
    target_aug_class1.append(row['target'])

df_aug_class1 = pd.DataFrame({
    'id': id_aug_class1,
    'text': text_aug_class1,
    'target': target_aug_class1
})

print(f"shape of df_aug_class1: {df_aug_class1.shape}")
print()
df_aug_class1.head(10)

shape of df_aug_class1: (3188, 3)



Unnamed: 0,id,text,target
0,20001,Our Deeds are the Reason of this #earthquake M...,1
1,20004,Forest fire near La Ronge Sask. Canada,1
2,20005,All residents asked to 'shelter in place' are ...,1
3,20006,"13,000 people receive #wildfires evacuation or...",1
4,20007,Just got sent this photo from Ruby #Alaska as ...,1
5,20008,#RockyFire Update => California Hwy. 20 closed...,1
6,20010,#flood #disaster Heavy rain causes flash flood...,1
7,20013,I'm on top of the hill and I can see a fire in...,1
8,20014,There's an emergency evacuation happening now ...,1
9,20015,I'm afraid that the tornado is coming to our a...,1


## Set up the prompts for augmented data generation

In [6]:
import projtools as pt
import pandas as pd

df_prompt_data = pd.read_csv("./data/prompt_log.csv")
df_prompt_data
# get current context and prompt prefix
# context, start_prompt_class0 = pt.get_prompt_setup(prompt_date='2024-02-28', target_class=0)

Unnamed: 0,date,prompt_component,version,class,content,notes
0,2024-02-26,context,1,2,You are a fiction writer who has observed a di...,initial context for both classes used for send...
1,2024-02-26,prompt_prefix,1,0,"Write me a tweet similar to this one, under 14...",initial class 0 prompt used in web interface
2,2024-02-26,prompt_prefix,1,1,"Write me a tweet similar to this one, under 14...",initial class 1 prompt used in web interface
3,2024-02-27,context,2,2,You are a fiction writer who has observed a di...,same as version 1
4,2024-02-27,prompt_prefix,2,0,"Write me a tweet similar to this one, under 14...",avoid having to expand contractions on generat...
5,2024-02-27,prompt_prefix,2,1,"Write me a tweet similar to this one, under 14...",avoid having to expand contractions on generat...
6,2024-02-29,context,3,2,You are a fiction writer who has observed a di...,same as version 2
7,2024-02-29,prompt_prefix,3,0,"Write me a tweet similar to this one, under 14...",fix issue with generated tweet quoting
8,2024-02-29,prompt_prefix,3,1,"Write me a tweet similar to this one, under 14...",fix issue with generated tweet quoting
9,2024-03-23,context,4,1,You are a world class reporter who has observe...,change fiction writer to reporter


In [8]:
df_train_class0.head()

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
23,What's up man?,0
24,I love fruits,0
25,Summer is lovely,0
26,My car is so fast,0
28,What a goooooooaaaaaal!!!!!!,0


In [9]:
# grab a couple of tweets to test
base_tweet_class0 = df_train_class0['text'].loc[df_train_class0.index == 57].values[0]
base_tweet_class1 = df_train_class1['text'].loc[df_train_class1.index == 56].values[0]
print(base_tweet_class0)
print(base_tweet_class1)

Ablaze for you Lord :D
Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...  http://t.co/wDUEaj8Q4J


In [11]:
# set up a single request
# context = "You are a fiction writer who has observed a disaster and likes to tweet."
# start_prompt_class0 = "Write me a tweet similar to this one, under 141 characters, " + \
#                       "does not use contractions, but refers to a different activity and location: "  # "...activity...
# start_prompt_class1 = "Write me a tweet similar to this one, under 141 characters, " + \
#                       "does not use contractions, but refers to a different disaster and location: "  # "...disaster...
current_prompt = pt.get_prompt_setup()
context0 = current_prompt['context0']['text']
context1 = current_prompt['context1']['text']

start_prompt_class0 = current_prompt['prefix_class0']['text']
start_prompt_class1 = current_prompt['prefix_class1']['text']

complete_prompt_class0 = start_prompt_class0 + base_tweet_class0  # base_tweet_... defined in cell 2
complete_prompt_class1 = start_prompt_class1 + base_tweet_class1

print(f"class 0 context: {context0}")
print(f"class 1 context: {context1}")
print()
print(complete_prompt_class0)
print(complete_prompt_class1)

class 0 context: You are social commentator who enjoys diverse opinions and likes to tweet.
class 1 context: You are a world class reporter who has observed a disaster and likes to tweet.

Write me a tweet similar to this one, under 141 characters, does not contain double quotes, but refers to a different activity, feeling and location: Ablaze for you Lord :D
Write me a tweet similar to this one, under 141 characters, does not contain double quotes, but refers to a different disaster and location: Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...  http://t.co/wDUEaj8Q4J


In [None]:
# test batch of 3 tweets
# df_aug_class1_test3 = df_aug_class1.iloc[:3]
# print(context)
# df_aug_class1_test3

In [None]:
# df_train['id'].max()  # 10873, add 20,000 to id of augmented samples

In [28]:
class0_range_starts = tuple(range(0, 4001, 200))
class1_range_starts = tuple(range(0, 2801, 200))
# print(range_starts)
# print(range_ends)
offset = 199
tweet_batches_class0 = [(start0, start0+offset) for start0 in class0_range_starts]
tweet_batches_class1 = [(start1, start1+offset) for start1 in class1_range_starts]
tweet_batches_class0.append((4200, 4297))
tweet_batches_class1.append((3100, 3189))

print(tweet_batches_class0)
print()
print(tweet_batches_class1)

[(0, 199), (200, 399), (400, 599), (600, 799), (800, 999), (1000, 1199), (1200, 1399), (1400, 1599), (1600, 1799), (1800, 1999), (2000, 2199), (2200, 2399), (2400, 2599), (2600, 2799), (2800, 2999), (3000, 3199), (3200, 3399), (3400, 3599), (3600, 3799), (3800, 3999), (4000, 4199), (4200, 4297)]

[(0, 199), (200, 399), (400, 599), (600, 799), (800, 999), (1000, 1199), (1200, 1399), (1400, 1599), (1600, 1799), (1800, 1999), (2000, 2199), (2200, 2399), (2400, 2599), (2600, 2799), (2800, 2999), (3100, 3189)]


In [16]:
print(df_train_class0.shape, df_train_class1.shape)

(4297, 2) (3188, 2)


In [29]:
from openai import OpenAI

client = OpenAI()

aug_offest = 20000
aug_tweets_class0 = {}
aug_tweets_class1 = {}
df_aug_class0_chunk = df_aug_class0.iloc[tweet_batches_class0[0][0]:(tweet_batches_class0[0][1]+1)]
df_aug_class1_chunk = df_aug_class1.iloc[tweet_batches_class1[0][0]:(tweet_batches_class1[0][1]+1)]

In [30]:
df_aug_class0_chunk.head()

Unnamed: 0,id,text,target
0,20023,What's up man?,0
1,20024,I love fruits,0
2,20025,Summer is lovely,0
3,20026,My car is so fast,0
4,20028,What a goooooooaaaaaal!!!!!!,0


In [31]:
print(df_aug_class1_chunk.shape)
df_aug_class1_chunk.tail()

(200, 3)


Unnamed: 0,id,text,target
195,20690,[infowars] Nashville Theater Attack: Will Gun...,1
196,20699,Cop injured in gunfight as militants attack Ud...,1
197,20709,Israeli helicopters that attacked civilians in...,1
198,20710,Christian Attacked by Muslims at the Temple Mo...,1
199,20712,Christian Attacked by Muslims at the Temple Mo...,1


In [32]:
import time

df_aug_class0_chunk = df_aug_class0.iloc[tweet_batches_class0[0][0]:(tweet_batches_class0[0][1]+1)]

# generate 200 class 0 tweets
t0 = time.time()
rows_processed = 1
aug_tweet_ids = []
aug_tweets_texts = []
aug_tweets_targets = []
for i, row in df_aug_class0_chunk.iterrows():
    prompt_content = start_prompt_class0 + row['text']
    gen_tweet = pt.get_aug_tweet(context0, prompt_content)
    aug_tweets_texts.append(gen_tweet)
    aug_tweet_ids.append(row['id'])
    aug_tweets_targets.append(row['target'])
    if rows_processed % 10 == 0:
        print(f"processing row {rows_processed} with id {row['id']}")
    rows_processed += 1
t1 = time.time()

processing row 10 with id 20036
processing row 20 with id 20054
processing row 30 with id 20076
processing row 40 with id 20100
processing row 50 with id 20146
processing row 60 with id 20162
processing row 70 with id 20178
processing row 80 with id 20194
processing row 90 with id 20260
processing row 100 with id 20280
processing row 110 with id 20303
processing row 120 with id 20331
processing row 130 with id 20349
processing row 140 with id 20375
processing row 150 with id 20402
processing row 160 with id 20423
processing row 170 with id 20446
processing row 180 with id 20457
processing row 190 with id 20474
processing row 200 with id 20485


In [33]:
gen_tweet_count = 200
print(f"time to do {gen_tweet_count} is {(t1-t0)/60.} minutes")

time to do 200 is 5.9875851511955265 minutes


In [41]:
dict_aug_tweets = {}
for i in range(0, len(aug_tweet_ids)):
    dict_aug_tweets[aug_tweet_ids[i]] = aug_tweets_texts[i]

print(len(dict_aug_tweets.keys()))
print(list(dict_aug_tweets.keys())[:10])
print(list(dict_aug_tweets.values())[:10])

200
[20023, 20024, 20025, 20026, 20028, 20031, 20032, 20033, 20034, 20036]
['Hey there! How are we feeling today at the beach? 🌊 #BeachDayVibes', 'Obsessed with live music in the park - the vibe, the energy, just everything about it 🎶 #MusicIsLife', '"Autumn vibes in the park are just magical 🍂🌳 #FallFeels #NatureVibes"', 'Speeding through the city in my mind, thoughts racing faster than a Bugatti on the Autobahn! #mentalacceleration', '"Such an exhilarating rush of adrenaline on that mountain peak! #adventure #thrill #nature"', '"Absolutely surreal... experiencing pure bliss in the heart of the city right now. #urbanlife #serenity"', 'Absolutely vibing in Barcelona right now 🌞 #goodvibes #travelgoals', '"Nothing beats that rush of adrenaline on a powder day in the mountains. #skiing"', 'Feeling absolutely content with the world while sipping coffee at my favorite cafe. Today is a gem! #grateful', '"Feeling absolutely ecstatic at the beach right now 🌊🌞 #ParadiseVibes"']


In [37]:
out_file_name0 = f"./data/prompts_v04/aug_tweets_class0_v04prompt_{tweet_batches_class0[0][0]:04}_{(tweet_batches_class0[0][1]):04}.csv"
print(out_file_name0)

./data/prompts_v04/aug_tweets_class0_v04prompt_0000_0199.csv


In [42]:
pt.write_aug_tweets(dict_aug_tweets, 0, out_file_name0)

True

In [43]:
df_aug_class1_chunk = df_aug_class1.iloc[tweet_batches_class1[0][0]:(tweet_batches_class1[0][1]+1)]

# generate 200 class 1 tweets
t0 = time.time()
rows_processed = 1
aug_tweet_ids = []
aug_tweets_texts = []
aug_tweets_targets = []
for i, row in df_aug_class1_chunk.iterrows():
    prompt_content = start_prompt_class1 + row['text']
    gen_tweet = pt.get_aug_tweet(context1, prompt_content)
    aug_tweets_texts.append(gen_tweet)
    aug_tweet_ids.append(row['id'])
    aug_tweets_targets.append(row['target'])
    if rows_processed % 10 == 0:
        print(f"processing row {rows_processed} with id {row['id']}")
    rows_processed += 1
t1 = time.time()

gen_tweet_count = 200
print(f"time to do {gen_tweet_count} is {(t1-t0)/60.} minutes")

processing row 10 with id 20015
processing row 20 with id 20066
processing row 30 with id 20098
processing row 40 with id 20126
processing row 50 with id 20139
processing row 60 with id 20208
processing row 70 with id 20222
processing row 80 with id 20244
processing row 90 with id 20262
processing row 100 with id 20289
processing row 110 with id 20334
processing row 120 with id 20381
processing row 130 with id 20442
processing row 140 with id 20519
processing row 150 with id 20568
processing row 160 with id 20588
processing row 170 with id 20635
processing row 180 with id 20660
processing row 190 with id 20681
processing row 200 with id 20712
time to do 200 is 5.935741257667542 minutes


In [44]:
dict_aug_tweets_class1 = {}
for i in range(0, len(aug_tweet_ids)):
    dict_aug_tweets_class1[aug_tweet_ids[i]] = aug_tweets_texts[i]

In [46]:
out_file_name1 = f"./data/prompts_v04/aug_tweets_class1_v04prompt_{tweet_batches_class1[0][0]:04}_{(tweet_batches_class1[0][1]):04}.csv"
print(out_file_name1)

pt.write_aug_tweets(dict_aug_tweets_class1, 1, out_file_name1)

./data/prompts_v04/aug_tweets_class1_v04prompt_0000_0199.csv


True

In [None]:
import pandas as pd
import projtools as pt
df_class1_mislabeled = pd.read_csv("./data/label_errors_in_train_data/class1_mislabeled_samples.csv")

current_prompt = pt.get_prompt_setup()
context = current_prompt['context']['text']
start_prompt_class1 = current_prompt['prefix_class1']['text']
print(context)
print(start_prompt_class1)

In [None]:
# generate augmented tweets for class 1 mislabeled
import time
t0 = time.time()
rows_processed = 1
aug_offest = 30000  # std offset 20000
aug_tweets_class1_mislabeled = {}
for i, row in df_class1_mislabeled.iterrows():
    prompt_content = start_prompt_class1 + row['text']
    gen_tweet = pt.get_aug_tweet(context, prompt_content)
    aug_id = row['id'] + aug_offest
    aug_tweets_class1_mislabeled[aug_id] = gen_tweet
    if rows_processed % 10 == 0:
        print(f"processing row {rows_processed} with id {row['id']}")
    rows_processed += 1
t1 = time.time()

In [None]:
gen_tweet_count = df_class1_mislabeled.shape[0]
print(f"time to do {gen_tweet_count} is {(t1-t0)/60.} minutes")
out_file_name_mislabeled = f"./data/aug_tweets_class1_v01prompt_mislabeled.csv"
pt.write_aug_tweets(aug_tweets_class1_mislabeled, 1, out_file_name_mislabeled)

In [None]:
import pickle

pickle_file = './data/first3_class1.pickle'
# Store data (serialize) - commented out because it's already be done so just need to read
with open(pickle_file, 'wb') as handle:
    pickle.dump(aug_responses, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Load data (deserialize)
with open(pickle_file, 'rb') as handle:
    aug_responses = pickle.load(handle)

# print(aug_responses == unserialized_data)
print(aug_responses)

In [None]:


print(aug_responses.keys())
for key in aug_responses.keys():
    cgpt_response = aug_responses[key].choices[0].message.content
    print(cgpt_response)