In [None]:
import openai
import os
import pandas as pd
import random
import tiktoken
import time

from dotenv import load_dotenv
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Load the processed train and test splits from CSV.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/processed/train.csv", "data/processed/test.csv")
)

# Create prompts

Template used for prompting:

Here is an example of a positive movie review, associated with a rating of 10:
...

Here is an example of a negative movie review, associated with a rating of 2:
...

What about the following review? Was its author feeling positive or negative about the movie? Answer with a single word.

In [None]:
# Review length, in whitespace-separated words.
# Measured on the review text itself: the 'sentiment' column only holds the
# label ('positive' / 'negative'), so counting its words would always give 1
# and make the length filter used for example selection meaningless.
train['nb_words'] = train['text'].apply(lambda x: len(x.split()))

In [None]:
# Build one few-shot prompt per test review: a short positive example and a
# short negative example drawn at random from train (in random order),
# followed by the review to classify.

# The candidate pools do not depend on the test row, so filter and convert
# them once instead of re-filtering `train` twice on every iteration.
short_reviews = train[train['nb_words'] < 100]
positive_pool = short_reviews[short_reviews['sentiment'] == 'positive'].to_dict('records')
negative_pool = short_reviews[short_reviews['sentiment'] == 'negative'].to_dict('records')

prompts = []

# Iterate over the test dataset
for _, test_row in tqdm(test.iterrows(),
                        total=test.shape[0],
                        desc = "Generating Prompts",
                        unit = "prompts"):

    # Select a random positive and negative sentiment review from train
    positive_row = random.choice(positive_pool)
    negative_row = random.choice(negative_pool)

    positive_example = f"Here is an example of a positive movie review, associated with a rating of {positive_row['rating']}:\n{positive_row['text']}"
    negative_example = f"Here is an example of a negative movie review, associated with a rating of {negative_row['rating']}:\n{negative_row['text']}"

    # Randomize the order of positive and negative examples
    examples = random.sample(
        [(positive_example, negative_example),
         (negative_example, positive_example)
        ], 1)[0]

    # Construct the prompt
    # Note: the template lines are indented inside the f-string, so that
    # leading whitespace is part of the prompt text that gets sent.
    prompt = f"""
    {examples[0]}

    {examples[1]}

    What about the following review? Was its author feeling positive or negative about the movie? Answer with a single word.
    {test_row['text']}
    """

    prompts.append({
        'review_id': test_row['review_id'],
        'prompt': prompt
    })

prompts = pd.DataFrame(prompts)

In [None]:
# Save the prompts without the DataFrame index, so the file read back later
# contains only the 'review_id' and 'prompt' columns.
# `index=False` must be a keyword argument, not part of the path string.
prompts.to_csv("data/processed/GTP_prompts.csv", index=False)

# Estimate cost

In [None]:
# Count the tokens of every prompt with tiktoken's cl100k_base encoding
enc = tiktoken.get_encoding("cl100k_base")
prompts['tokens'] = [len(enc.encode(prompt_text)) for prompt_text in prompts['prompt']]
total_tokens = prompts['tokens'].sum()
print(f"Total number of tokens: {total_tokens}")

In [None]:
# Histogram of per-prompt token counts
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(prompts['tokens'], bins=30, color='cornflowerblue', edgecolor='midnightblue')
ax.set_title('Distribution of Token Counts')
ax.set_xlabel('Number of Tokens')
ax.set_ylabel('')
plt.show()

In [None]:
# Rough cost estimate per model, in USD.
# Every prompt token is billed at the input price; each prompt is counted as
# producing one output token (the prompt asks for a single-word answer),
# billed at the output price. Prices are USD per 1M tokens.
# NOTE(review): the second coefficient must be the *output* price for every
# model (not the cached-input price) — confirm the figures below against the
# current OpenAI pricing page before relying on them.
n_prompts = len(prompts)

gpt_4o = (total_tokens * 2.5 + n_prompts * 10.0) * 1e-6
gpt_35_turbo = (total_tokens * 0.5 + n_prompts * 1.5) * 1e-6
gpt_4o_mini = (total_tokens * 0.15 + n_prompts * 0.60) * 1e-6

print(f"Cost with gpt-4o: {gpt_4o:.2f} $")
print(f"Cost with gpt-3.5-turbo: {gpt_35_turbo:.2f} $")
print(f"Cost with gpt-4o-mini: {gpt_4o_mini:.2f} $")

# Send prompts

In [None]:
# Read the prompts back from the CSV file.
prompts_path = "data/processed/GTP_prompts.csv"
prompts = pd.read_csv(prompts_path)

In [None]:
# Load variables from a .env file, then pass OPENAI_API_KEY to the openai module
# (the value is None when the variable is not set).
load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [None]:
# The notebook imports the `openai` module but never the bare `OpenAI` name,
# so the client class is referenced through the module.
client = openai.OpenAI()

def get_openai_answer(review_id, df):
    """Send the prompt stored for one review to gpt-3.5-turbo.

    Args:
        review_id: Value matched against ``df['review_id']``; the first
            matching row is used.
        df: DataFrame with ``'review_id'`` and ``'prompt'`` columns.

    Returns:
        The ``message`` of the first completion choice, or ``None`` when the
        request (or reading its first choice) raised an exception; the error
        is printed rather than re-raised.

    Note:
        The prompt lookup happens outside the ``try`` block, so a
        ``review_id`` with no matching row is not caught here.
    """
    matches = df['review_id'] == review_id
    prompt = df.loc[matches, 'prompt'].iloc[0]
    messages = [{"role": "user", "content": prompt}]

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
        )
        first_choice = response.choices[0]
        return first_choice.message
    except Exception as e:
        # Best effort: report the failure and let the caller carry on.
        print(f"Error: {e} for review #{review_id}")
        return None

In [None]:
# Try the request on a single review (review_id 1).
answer = get_openai_answer(review_id=1, df=prompts)

In [None]:
# process in batches for prompts without answers
# do not retry in case of failure
# start over

In [None]:
# Write the collected answers to disk.
# NOTE(review): `results` is not defined in any cell visible in this notebook;
# the batch-processing step described in the comments of the previous cell
# must build it before this line can run on a fresh kernel.
results.to_csv("output/GTP.csv")