In [None]:
import os
import re
import time

import numpy as np
import openai
import pandas as pd

In [None]:
# Load the unlabeled tweets and attach placeholder prediction columns:
# label == -1 means "not predicted yet", stance == 'NA' is its text counterpart.
df = pd.read_csv('unlabel_tweets.csv').assign(label=-1, stance='NA')
df.head()

In [None]:
from nltk.tokenize import TweetTokenizer

# Tweet tokenizer used by text_preprocess to clean @users, URLs and (optionally)
# #hashtags out of tweets.
# strip_handles=False keeps @handles as tokens at this stage; text_preprocess
# filters them out itself. preserve_case=True leaves the original casing alone.
tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True, strip_handles=False)

def text_preprocess(text, keep_hashtags=True):
    """Tokenize a tweet and return it with noise tokens removed.

    Tokens beginning with "http" (links) and tokens beginning with "@"
    (user handles) are always dropped. Tokens beginning with "#" are dropped
    only when ``keep_hashtags`` is False. The surviving tokens are joined
    with single spaces.
    """
    kept = [
        token
        for token in tokenizer.tokenize(text)
        if not (
            (token[0] == "#" and not keep_hashtags)
            or token[:4] == "http"
            or token[0] == "@"
        )
    ]
    return " ".join(kept)

In [None]:
# Force the raw text column to strings, then derive the cleaned text
# that is later sent to the model.
df['content'] = df['content'].astype(str)
df['clean_content'] = df['content'].map(text_preprocess)
df.head()

In [None]:
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# saved inside the notebook; the "xxx" placeholder is only a fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "xxx") #your API key here

#prompt for gpt
prompt = "Is this a pro-abortion, anti-abortion, neutral, or off-topic statement? Only say it is neutral when it is completely non-opinionated:"
model = "gpt-3.5-turbo"
# Valid topics are: 'abortion', 'gun', 'vaccine'
topic = 'abortion'
# The begin and end indexes of the tweets to run
range_start = 0
range_end = len(df.clean_content)
print_interval = 2000


# Advanced settings
# NOTE: the 'labled' spelling is kept on purpose; checkpoint files written by
# earlier runs use it and the resume logic looks them up by this prefix.
output_file_prefix = 'labled_tweets_'
output_dir = "chatgpt_output"
# Create the checkpoint directory; exist_ok makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)
save_interval = 2000 # Changing this means you have to restart the process!!!
error_dump_interval = 100
error_dump_file_name = 'error_dump_'+topic+'.txt'
# Make sure the error dump file exists without truncating a previous one.
with open(error_dump_file_name, 'a'):
    pass

In [None]:
# Get previous run progress
# THIS IS OPTIONAL TO RUN. ONLY RUN THIS IF YOU WANT TO RESUME A PREVIOUSLY STOPPED KERNEL
#
# Checkpoint files are named "<prefix><begin>-<end>.csv". The largest <end>
# found in the output directory becomes the new range_start.
import os
directory = output_dir
prefix = output_file_prefix+topic+'_'
# Match checkpoint names exactly so unrelated files in the directory (backups,
# editor temp files, other topics) are skipped instead of crashing int().
checkpoint_name = re.compile(re.escape(prefix) + r'(\d+)-(\d+)\.csv')
max_index = 0
for filename in os.listdir(directory):
    if not os.path.isfile(os.path.join(directory, filename)):
        continue
    match = checkpoint_name.fullmatch(filename)
    if match is None:
        continue
    max_index = max(max_index, int(match.group(2)))
range_start = max_index
print('Processing range: ', range_start, '-', range_end)

In [None]:
# This is restartable. If it crashes or gets rate limited, you can run it again and it will resume where it left off.
def abortionParser(text):
    """Map a lower-cased model reply about abortion to a numeric label.

    Checks run in order and the first match wins.

    Returns:
        2  -- the reply contains "neutral"
        3  -- the reply says off-topic ("off" / "offtopic" as a whole word)
        1  -- against abortion ("pro-life", "anti-abortion", "anti-choice")
        0  -- in favour ("pro-abortion", "anti-life", "pro-choice")
        -2 -- none of the above (unparseable reply)
    """
    if 'neutral' in text:
        return 2
    # Whole-word match: a plain substring test on "off" also fires on words
    # such as "offensive" or "offers" and would hide a clear stance.
    if re.search(r'\boff(?:topic)?\b', text):
        return 3
    if 'pro-life' in text or 'anti-abortion' in text or 'anti-choice' in text:
        return 1
    if 'pro-abortion' in text or 'anti-life' in text or 'pro-choice' in text:
        return 0
    return -2
def gunParser(text):
    """Map a lower-cased model reply about guns to a numeric label.

    Checks run in order and the first match wins.

    Returns:
        2  -- the reply contains "neutral"
        3  -- the reply says off-topic ("off" / "offtopic" as a whole word)
        0  -- in favour ("pro" as its own word, e.g. "pro-gun", or "progun")
        1  -- against (a word starting with "anti", e.g. "anti-gun")
        -2 -- none of the above (unparseable reply)
    """
    if 'neutral' in text:
        return 2
    # Whole-word matches below: plain substring tests on "off" / "pro" also
    # fire on ordinary words ("offers", "provided", "promotes") and would
    # turn e.g. "the statement provided is anti-gun" into a FAVOR label.
    if re.search(r'\boff(?:topic)?\b', text):
        return 3
    if re.search(r'\bpro(?:\b|gun)', text):
        return 0
    if re.search(r'\banti', text):
        return 1
    return -2
def vaccineParser(text):
    """Map a lower-cased model reply about vaccines to a numeric label.

    Checks run in order and the first match wins.

    Returns:
        2  -- the reply contains "neutral"
        3  -- the reply says off-topic ("off" / "offtopic" as a whole word)
        0  -- in favour ("pro" as its own word, e.g. "pro-vaccine", or the
              fused forms "provacc..." / "provax")
        1  -- against (a word starting with "anti", e.g. "anti-vaccine")
        -2 -- none of the above (unparseable reply)
    """
    if 'neutral' in text:
        return 2
    # Whole-word matches below: plain substring tests on "off" / "pro" also
    # fire on ordinary words ("offers", "provided", "promotes") and would
    # turn e.g. "the tweet promotes an anti-vaccine view" into a FAVOR label.
    if re.search(r'\boff(?:topic)?\b', text):
        return 3
    if re.search(r'\bpro(?:\b|vacc|vax)', text):
        return 0
    if re.search(r'\banti', text):
        return 1
    return -2

# Label every not-yet-labelled tweet in [range_start, range_end) with the chat
# model, checkpointing finished chunks to CSV and logging unparseable replies.

# Text stance written next to each numeric label; anything else is an ERROR.
STANCE_BY_LABEL = {0: 'FAVOR', 1: 'AGAINST', 2: 'NEUTRAL', 3: 'OFF'}
# Any topic other than 'gun' / 'abortion' is handled by the vaccine parser.
parse_label = {'gun': gunParser, 'abortion': abortionParser}.get(topic, vaccineParser)


def _chunk_path(begin, end):
    """Return the checkpoint CSV path for rows [begin, end)."""
    return os.path.join(output_dir, output_file_prefix+topic+'_'+str(begin)+'-'+str(end)+'.csv')


def _flush_errors(pending):
    """Append each pending (index, reply, tweet) tuple to the error dump, one per line."""
    if not pending:
        return
    with open(error_dump_file_name, 'a') as dump:
        for entry in pending:
            dump.write(str(entry)+'\n')


rate_limit_count = 0
errors = []
# Cells are written with df.at, which assigns straight into df, so the global
# chained-assignment warning no longer needs to be silenced.
for i in range(range_start,range_end):
    if df.at[i, 'label'] != -1: #Only predict if the quote does not have a label yet
        continue
    quote = df.at[i, 'clean_content']
    response = None
    # Retry until the API answers; every failure is retried after a short pause.
    # A time.sleep(0.01) per request can be added here to avoid rate limiting
    # on the free tier; with the paid version it is not needed.
    while response is None:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[{"role": "user", "content": prompt + "\n'" + str(quote) + "'"}],
                temperature=0,
                max_tokens=8)
        except Exception as error:
            if 'overloaded' in str(error):
                rate_limit_count += 1
                time.sleep(0.05) # servers are busy, wait briefly and try again
            else:
                print('ERROR encountered:',error)
                time.sleep(0.5) # Account is rate limited. Wait slightly longer and try again

    text = response['choices'][0]['message']['content'].lower()
    label = parse_label(text)
    df.at[i, 'label'] = label
    stance = STANCE_BY_LABEL.get(label)
    if stance is None:
        # Reply could not be parsed: keep it for the error dump.
        errors.append((i, text, quote))
        stance = 'ERROR'
    df.at[i, 'stance'] = stance

    if i % print_interval == 0: #print just for you to see if it is working
        print('Progress:', i, 'Rate limits encountered this chunk:', rate_limit_count)
        rate_limit_count = 0
    # Checkpoint the chunk that just finished. Skip it when i == range_start:
    # after a resume that chunk was labelled by an earlier run, and writing it
    # now would overwrite the good file with unlabelled rows.
    if i % save_interval == 0 and i != 0 and i > range_start:
        begin = i-save_interval
        end = i
        df[begin:end].to_csv(_chunk_path(begin, end), index=True)
    if i % error_dump_interval == 0 and i != 0:
        _flush_errors(errors)
        errors = []

# Finished!!!
# Write out errors collected since the last periodic dump.
_flush_errors(errors)
errors = []
# Save the rows after the last in-loop checkpoint. The start is derived from
# range_end - 1 so a row count that is an exact multiple of save_interval still
# gets its last chunk written. Nothing is saved if this run processed no rows.
if range_end > range_start:
    begin = ((range_end - 1) // save_interval) * save_interval
    df[begin:range_end].to_csv(_chunk_path(begin, range_end), index=True)