In [9]:
import os
import re

import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline

# Fetch the pretrained GPT-2 language model and its matching tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 defines no padding token of its own, so reuse the end-of-sequence
# token for padding when none has been configured
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

ModuleNotFoundError: No module named 'pandas'

In [2]:
# Location of the IMDB reviews CSV. The default is the path this notebook was
# originally written against; set the IMDB_CSV_PATH environment variable to
# run the notebook against a copy stored elsewhere.
IMDB_CSV_PATH = os.environ.get(
    "IMDB_CSV_PATH", "/Users/juliuselemans/Downloads/IMDB Dataset.csv"
)
df = pd.read_csv(IMDB_CSV_PATH)
def remove_non_alphanumeric(text):
    """Replace every character outside a-z, A-Z and 0-9 with a space.

    Each removed character becomes exactly one space (existing spaces are
    "replaced" by themselves), so the result has the same length as `text`.
    """
    non_alnum = re.compile(r'[^a-zA-Z0-9]')
    return non_alnum.sub(' ', text)

# Overwrite each review with its cleaned text (element-wise over the column)
df['review'] = df['review'].map(remove_non_alphanumeric)

In [None]:
# Show the heading followed by the frame's text representation
print("Sample preprocessed reviews:", df, sep="\n")

In [None]:
# Count how many token ids a piece of text encodes to
def token_length(text):
    """Return the number of ids the module-level `tokenizer` produces for `text`."""
    return len(tokenizer.encode(text))

# Work on a copy so the diagnostic column does not leak into the working
# frame `df`. (Previously `df_token_length` was used without ever being
# defined, which raises NameError on a fresh kernel.)
df_token_length = df.copy()

# Token count of every entry in the 'review' column
df_token_length['token_length'] = df_token_length['review'].apply(token_length)

# Save the dataframe with the token lengths to a new CSV
df_token_length.to_csv("token_length_data.csv", index=False)

In [5]:
def get_sentiment(text, max_new_tokens=8):
    """Ask the language model for a one-word sentiment label for `text`.

    Builds a classification prompt around `text`, generates a short
    continuation with the module-level `model`, and returns only that
    continuation (the prompt echo is removed).

    Args:
        text: The review to classify.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt. A one-word label needs only a few tokens.

    Returns:
        The decoded continuation as a string, with special tokens skipped and
        surrounding whitespace stripped. It is the model's raw text and is
        not guaranteed to be exactly "positive" or "negative".
    """
    # Directly asking for a one-word label (positive or negative) to improve clarity
    prompt = f" Please classify the sentiment of this review in one word: positive or negative. Review: '{text}'."

    # 1024 is the token limit the prompt was already truncated to (GPT-2's
    # context size). Reserve part of it for the answer; a prompt that fills
    # the whole window leaves the model nowhere to write.
    context_window = 1024
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=context_window - max_new_tokens,
    )
    prompt_length = len(inputs["input_ids"][0])

    # `max_new_tokens` bounds only the continuation. The previous
    # `max_length=200` bounded prompt + continuation, so any prompt of 200 or
    # more tokens could not produce an answer. No sampling arguments are
    # passed, so generation uses the model's default decoding settings.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
    )

    # The generated sequence starts with the prompt tokens; drop them so the
    # caller receives the answer rather than the question (which itself
    # contains both "positive" and "negative").
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

In [None]:
# Smoke-test get_sentiment on one hand-written review and show what comes back
test_review = "This movie was an excellent portrayal of historical events."
result = get_sentiment(text=test_review)
print("Sentiment Prediction:", result)

In [None]:
# Apply sentiment analysis and display some responses.
# The review text lives in the 'review' column: it is the only text column
# this notebook reads or writes, and no 'prompt' column is created anywhere,
# so indexing df['prompt'] raised KeyError.
df['model_response'] = df['review'].apply(get_sentiment)
print("Sample model responses:")
print(df[['review', 'model_response']].head(5))


In [33]:
# Turn a raw model response into a numeric sentiment: 1, -1, or 0 (unknown)
def interpret_response(response):
    """Map a model response to 1 (positive), -1 (negative) or 0 (unknown).

    Two response formats are understood:

    * a numeric response that converts to the integer 1 or -1, which is
      returned as-is;
    * a text response that mentions exactly one of the words "positive" or
      "negative" (case-insensitive, whole words only).

    Anything else yields 0: other numbers, text mentioning both labels or
    neither, empty strings, and non-string values such as None. Requiring
    exactly one label keeps a response that merely repeats the question
    "positive or negative" from being counted as a prediction.

    Args:
        response: The raw response, normally a string.

    Returns:
        int: 1, -1 or 0.
    """
    try:
        sentiment_score = int(response)  # Convert response to integer
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects, which the
        # "return 0 for unexpected output" contract is meant to include.
        sentiment_score = None
    if sentiment_score in (1, -1):
        return sentiment_score

    if isinstance(response, str):
        mentioned = {
            label.lower()
            for label in re.findall(r'\b(positive|negative)\b', response, flags=re.IGNORECASE)
        }
        if mentioned == {'positive'}:
            return 1
        if mentioned == {'negative'}:
            return -1

    return 0  # Return 0 for any non-numeric or unexpected output

# Convert every stored model response into its numeric sentiment code
df['predicted_sentiment'] = df['model_response'].map(interpret_response)

In [None]:
# Ground-truth labels are expected to be the strings 'positive' / 'negative';
# map them to the same numeric encoding used for the predictions.
# NOTE(review): no cell in this notebook creates a 'completion' column. The
# public IMDB reviews CSV usually names its label column 'sentiment', so fall
# back to that when 'completion' is absent. Confirm against the actual file.
label_column = 'completion' if 'completion' in df.columns else 'sentiment'
df['actual_sentiment'] = df[label_column].map({'positive': 1, 'negative': -1})
assert df['actual_sentiment'].notna().all(), (
    f"unexpected label values in column {label_column!r}"
)

y_true = df['actual_sentiment']
y_pred = df['predicted_sentiment']

# Calculate metrics.
# Predictions can be 0 ("unknown") as well as 1 / -1, which makes the problem
# multiclass; the default average='binary' raises ValueError in that case.
# Restricting to labels=[1] scores the positive class only (an "unknown"
# prediction simply counts as "not positive"), and zero_division=0 returns 0
# rather than warning when the positive class is never predicted.
positive_class = dict(labels=[1], average='macro', zero_division=0)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, **positive_class)
recall = recall_score(y_true, y_pred, **positive_class)
f1 = f1_score(y_true, y_pred, **positive_class)

# Print the calculated metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
# Persist every column of the working frame to disk (row index omitted),
# then confirm where it went
df.to_csv(path_or_buf='model_responses_and_sentiments.csv', index=False)
print("Data has been written to 'model_responses_and_sentiments.csv'")