In [1]:
# Install required libraries (run this in Colab)
!pip install stanza scikit-learn pandas

Collecting stanza
  Downloading stanza-1.9.2-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.13.0-py3-none-any.whl.metadata (5.8 kB)
Downloading stanza-1.9.2-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.13.0-py3-none-any.whl (553 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m553.2/553.2 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.13.0 stanza-1.9.2


In [3]:
# Import necessary libraries
import pandas as pd
import stanza
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [4]:
# Download the sentiment model for English
stanza.download('en')  # Make sure to download the appropriate language model
nlp = stanza.Pipeline('en', processors='tokenize,sentiment')


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.9.0/models/default.zip:   0%|          | 0…

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package        |
------------------------------
| tokenize  | combined       |
| mwt       | combined       |
| sentiment | sstplus_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: sentiment
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
INFO:stanza:Done loading processors!


In [5]:
# Load the dataset (update with the correct file path in your Colab environment)
df = pd.read_csv('sentiment_385_sampled_cleanup.csv')


In [6]:
# Function to apply Stanza sentiment analysis with error handling
def classify_text_stanza(text, index):
    try:
        # Apply Stanza sentiment analysis
        doc = nlp(text)
        # Stanza's sentiment values are directly -1, 0, 1
        sentiment = doc.sentences[0].sentiment  # Take sentiment of the first sentence
        return sentiment  # No need for mapping, as Stanza already uses -1, 0, 1
    except Exception as e:
        print(f"Error at index {index}: {e}")
        print(f"Problematic text: {text}")
        return None  # Return None if there's an error

# Apply the Stanza model to the comment_text column with error checking
df['predicted_sentiment'] = [classify_text_stanza(text, idx) for idx, text in enumerate(df['comment_text'])]

# Drop rows where sentiment prediction failed (optional)
df = df.dropna(subset=['predicted_sentiment'])

# Ensure consensus_agreement is ternary (1 = positive, 0 = neutral, -1 = negative)
df['consensus_agreement_ternary'] = df['consensus_agreement'].apply(lambda x: int(x))

# Filter out rows where the model can't predict neutral (optional, if you want to exclude neutrals)
filtered_df = df[df['consensus_agreement_ternary'] != 0]

# Calculate evaluation metrics only for rows that have positive or negative sentiment
y_true = filtered_df['consensus_agreement_ternary']
y_pred = filtered_df['predicted_sentiment']

# F1 Score
f1 = f1_score(y_true, y_pred, average='weighted')  # weighted to handle class imbalance
# Precision
precision = precision_score(y_true, y_pred, average='weighted')
# Recall
recall = recall_score(y_true, y_pred, average='weighted')
# Accuracy
accuracy = accuracy_score(y_true, y_pred)

# Display the results
print(f"F1 Score: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"Accuracy: {accuracy:.2f}")

# Export the DataFrame to a new CSV with predicted sentiment included
output_csv = 'stanza_sentiment_analysis_with_predictions.csv'
df.to_csv(output_csv, index=False)

print(f"Results saved to {output_csv}")

F1 Score: 0.27
Precision: 0.28
Recall: 0.26
Accuracy: 0.26
Results saved to stanza_sentiment_analysis_with_predictions.csv


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
