# Sentiment analysis using OpenAI

Install the required packages and import the common libraries

In [None]:
!pip install openai pandas

In [None]:
import openai
import csv
import pandas as pd
import time
import random

Check whether the API key exists (it should be set in the `OPENAI_API_KEY` environment variable)

In [None]:

# %%
# Initialize the OpenAI client with the API key read from the
# OPENAI_API_KEY environment variable (the key is never hard-coded here).
import os
api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
    # Surface a missing/empty key right away instead of only discovering it
    # when the first API request fails.
    print("Warning: OPENAI_API_KEY is not set; OpenAI API calls are likely to fail.")
openai.api_key = api_key

# %% [markdown]
# Define the sentiment analysis prompt template

# %%
# Prompt sent to the model as the user message. The `{title}` and `{comment}`
# placeholders are filled in with str.format() by get_sentiment_and_explanation,
# which then looks for the "Sentiment:" and "Explanation:" lines requested in
# the "Output format" section of the reply.
prompt_template = '''
Analyze the sentiment of the Hacker News comment with the context of the AI-related story title found below under "Input text:" whether the comment is negative (-1), neutral (0), or positive (1) towards AI. First understand the meaning of the story title, then understand the meaning of the comment in the context of the story title, then analyze the sentiment of the comments in the context of the story title whether the comment is negative (-1), neutral (0), or positive (1) towards AI, and conclude the sentiment of the comment to only one best sentiment type.

Sentiment can contain only a single number either:
  -1 (for negative towards AI)
  0 (for neutral towards AI)
  1 (for positive towards AI)

# Input text:
Story title: """
{title}
"""
Comment: """
{comment}
"""

# Output format:
Sentiment: your_sentiment
Explanation: your_explanation_for_the_sentiment
'''

In [None]:
# Function to get sentiment using OpenAI API
def get_sentiment_and_explanation(title, comment):
    """Classify a comment's sentiment towards AI with the OpenAI chat API.

    The prompt is built from the module-level ``prompt_template`` and the
    reply is expected to contain a ``Sentiment:`` line and an
    ``Explanation:`` line. Failed or malformed responses are retried.

    Args:
        title: Story title that gives the comment its context.
        comment: Text of the comment to classify.

    Returns:
        A ``(sentiment, explanation)`` tuple; ``sentiment`` is the integer
        -1, 0 or 1 and ``explanation`` is the text following the
        ``Explanation:`` label.

    Raises:
        RuntimeError: If no valid response was obtained after all attempts.
            The last underlying error is attached as ``__cause__``.
    """
    prompt = prompt_template.format(title=title, comment=comment)
    max_attempts = 3
    last_error = None

    for attempt in range(1, max_attempts + 1):
        try:
            # NOTE(review): `openai.ChatCompletion` is the pre-1.0 interface
            # of the openai package - confirm the installed version still
            # provides it.
            response = openai.ChatCompletion.create(
                model="gpt-4",  # Choose the appropriate model
                messages=[
                    {"role": "system", "content": "You are an expert at analyzing comments."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=150,  # Adjust max tokens if needed
                temperature=0.5
            )

            # Extract the response text and parse it; a malformed reply
            # raises ValueError and is retried like any other failure.
            result = response['choices'][0]['message']['content'].strip()
            return _parse_sentiment_response(result)

        except Exception as e:
            last_error = e
            if attempt == max_attempts:
                # No further attempt follows, so pausing would only waste time.
                print(f"API call failed: {e}. Giving up after {max_attempts} attempts.")
                break
            print(f"API call failed: {e}. Pausing for 1-2 minutes...")
            time.sleep(random.randint(60, 120))

    # Fail explicitly instead of implicitly returning None, which callers
    # that unpack the result would hit as an opaque TypeError.
    raise RuntimeError(
        f"Could not obtain a valid sentiment after {max_attempts} attempts"
    ) from last_error


def _parse_sentiment_response(result):
    """Extract ``(sentiment, explanation)`` from the model's reply text.

    Raises:
        ValueError: If a label line is missing or the sentiment is not one
            of -1, 0 or 1.
    """
    import re

    lines = result.split('\n')
    sentiment_line = next((line for line in lines if "Sentiment:" in line), None)
    explanation_line = next((line for line in lines if "Explanation:" in line), None)
    if not sentiment_line or not explanation_line:
        raise ValueError("Invalid response format")

    # Take the first integer after the label so replies such as
    # "Sentiment: 1 (positive)" are still understood.
    match = re.search(r"-?\d+", sentiment_line.split("Sentiment:", 1)[1])
    if match is None or int(match.group()) not in (-1, 0, 1):
        raise ValueError(f"Invalid sentiment value: {sentiment_line!r}")

    # Split on the label only once so colons inside the explanation text
    # do not truncate it.
    explanation = explanation_line.split("Explanation:", 1)[1].strip()
    return int(match.group()), explanation

# Function to process comments and save periodically
def process_comments(df, output_csv_file, save_interval=20):
    """Fill in missing sentiment/explanation values and checkpoint to CSV.

    Rows that already have both a ``sentiment`` and an ``explanation`` value
    are skipped. For every other row, ``get_sentiment_and_explanation`` is
    called with the row's ``title`` and ``comment_text`` and the result is
    written back into ``df`` in place.

    Args:
        df: DataFrame to annotate; modified in place.
        output_csv_file: Path of the CSV file the DataFrame is written to.
        save_interval: Write a checkpoint each time the 1-based position of
            a freshly processed row is a multiple of this value.

    Raises:
        RuntimeError: If no sentiment could be obtained for a row. Any
            exception from the sentiment lookup also propagates. In both
            cases the rows processed so far are written out first.
    """
    completed = False
    try:
        # Track the positional row number separately from the index label so
        # the checkpoint arithmetic also works for non-integer indexes.
        for position, (i, row) in enumerate(df.iterrows(), start=1):
            # Skip rows that already have a sentiment and explanation value
            if pd.notna(row.get('sentiment')) and pd.notna(row.get('explanation')):
                continue

            # Perform sentiment analysis and get the explanation
            result = get_sentiment_and_explanation(row['title'], row['comment_text'])
            if result is None:
                # Report the failure clearly rather than letting tuple
                # unpacking fail on None.
                raise RuntimeError(f"No sentiment could be obtained for row {position}")
            sentiment, explanation = result
            df.at[i, 'sentiment'] = sentiment
            df.at[i, 'explanation'] = explanation

            # Save progress to CSV after every `save_interval` rows
            if position % save_interval == 0:
                df.to_csv(output_csv_file, index=False)
                print(f"Saved progress at row {position}")
        completed = True
    finally:
        # Always write what has been collected, so an error or interrupt
        # does not discard rows processed since the last checkpoint.
        df.to_csv(output_csv_file, index=False)
        if completed:
            print(f"Final result saved to {output_csv_file}")
        else:
            print(f"Processing stopped early; partial result saved to {output_csv_file}")


Load the CSV file with the Hacker News data

In [None]:
# Input: CSV file holding the Hacker News data
csv_file = 'sentiment_385_sampled_cleanup.csv'  # Replace with the actual path to your CSV file
# Output: CSV file that receives the annotated results
output_csv_file = 'hackernews_comments_with_sentiment_OPENAI.csv'

df = pd.read_csv(csv_file)

# Make sure both result columns are present before processing starts
for result_column in ('sentiment', 'explanation'):
    if result_column not in df.columns:
        df[result_column] = None

# Annotate the comments; intermediate results are written along the way
process_comments(df, output_csv_file)

print(f'Sentiment analysis completed and saved to {output_csv_file}')