In [1]:
# Install required libraries (run this in Colab)
!pip install transformers datasets scikit-learn pandas

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [2]:
# Import necessary libraries
import pandas as pd
from transformers import pipeline
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [7]:
# Build the Hugging Face sentiment-analysis pipeline on top of the nlptown
# multilingual BERT checkpoint. truncation=True is supplied here, when the
# pipeline is constructed, rather than on each individual call.
classifier = pipeline(
    task='sentiment-analysis',
    model='nlptown/bert-base-multilingual-uncased-sentiment',
    truncation=True,
)



In [4]:
# Read the comments CSV into a DataFrame. The path is relative to the
# current working directory; update it to match your Colab environment.
df = pd.read_csv(filepath_or_buffer='sentiment_385_sampled_cleanup.csv')

In [8]:
# Classify one text with the sentiment pipeline, tolerating per-row failures
def classify_text(text, index):
    """Return the label the module-level ``classifier`` assigns to ``text``.

    Only ``text`` is passed to ``classifier``; no truncation argument is given
    in this call, so any truncation of long inputs depends on how
    ``classifier`` itself was configured.

    Args:
        text: The input handed to ``classifier``.
        index: Identifier of the row being processed; used only in the
            error message.

    Returns:
        The ``'label'`` entry of the first result returned by ``classifier``,
        or ``None`` if anything raised while classifying (the error and the
        offending text are printed in that case).
    """
    try:
        predictions = classifier(text)
        top_prediction = predictions[0]
        label = top_prediction['label']
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this row
        print(f"Error at index {index}: {e}")
        print(f"Problematic text: {text}")
        label = None
    return label

# Classify every comment in order. Each call also receives the comment's
# 0-based position so classify_text can name the row if it fails.
df['predicted_sentiment'] = list(
    map(classify_text, df['comment_text'], range(len(df)))
)

# Discard rows for which no label was produced (classify_text returned None)
df = df.dropna(subset=['predicted_sentiment'])

# Collapse the model's 1-5 star labels into the ternary scale (-1, 0, 1)
def map_bert_to_custom(sentiment_label):
    """Translate a star-rating label into a ternary sentiment value.

    Args:
        sentiment_label: One of ``'1 star'``, ``'2 stars'``, ``'3 stars'``,
            ``'4 stars'`` or ``'5 stars'``.

    Returns:
        ``-1`` for 1-2 stars (negative), ``0`` for 3 stars (neutral),
        ``1`` for 4-5 stars (positive), and ``None`` for any other value.
    """
    star_groups = (
        (('1 star', '2 stars'), -1),   # Negative
        (('3 stars',), 0),             # Neutral
        (('4 stars', '5 stars'), 1),   # Positive
    )
    for star_labels, polarity in star_groups:
        if sentiment_label in star_labels:
            return polarity
    # Unrecognised label: no mapping available
    return None

# Collapse the raw star labels into the ternary scale (-1, 0, 1)
df['predicted_sentiment_mapped'] = df['predicted_sentiment'].apply(map_bert_to_custom)

# Cast the annotated label to int (1 = positive, 0 = neutral, -1 = negative).
# NOTE: this only converts the dtype; it does not validate that values are in {-1, 0, 1}.
df['consensus_agreement_ternary'] = df['consensus_agreement'].astype(int)

# Exclude rows whose *ground-truth* label is neutral. Predictions are not filtered:
# the model can still output 0 (3 stars) for the remaining rows, and every such
# prediction counts as a misclassification in the metrics below.
filtered_df = df[df['consensus_agreement_ternary'] != 0]

# Calculate evaluation metrics only for rows annotated as positive or negative
y_true = filtered_df['consensus_agreement_ternary']
y_pred = filtered_df['predicted_sentiment_mapped']

# Score only the classes that can occur in y_true. Without `labels`, scikit-learn
# also scores the predicted-only class 0, whose recall is undefined (no true
# samples) and triggers an UndefinedMetricWarning. That class has zero support,
# so it carries zero weight under average='weighted' and the reported numbers
# are unchanged by restricting the label set.
eval_labels = [-1, 1]

# F1 Score (weighted by class support to account for class imbalance)
f1 = f1_score(y_true, y_pred, labels=eval_labels, average='weighted', zero_division=0)
# Precision
precision = precision_score(y_true, y_pred, labels=eval_labels, average='weighted', zero_division=0)
# Recall (support-weighted recall over all true classes equals accuracy)
recall = recall_score(y_true, y_pred, labels=eval_labels, average='weighted', zero_division=0)
# Accuracy
accuracy = accuracy_score(y_true, y_pred)

# Display the results
print(f"F1 Score: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"Accuracy: {accuracy:.2f}")

# Export the full DataFrame (neutral rows included) with the predictions added
output_csv = 'sentiment_analysis_with_predictions.csv'
df.to_csv(output_csv, index=False)

print(f"Results saved to {output_csv}")

F1 Score: 0.74
Precision: 0.81
Recall: 0.68
Accuracy: 0.68
Results saved to sentiment_analysis_with_predictions.csv


  _warn_prf(average, modifier, msg_start, len(result))
