<a href="https://colab.research.google.com/github/KatrinaH92/git_test/blob/main/Pres_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

plt.style.use('ggplot')

import nltk

In [None]:
# Location of the debate transcript CSV. The author's original local path is
# kept as the default so existing runs behave the same; set the
# DEBATE_CSV_PATH environment variable to point at the file on any other
# machine (e.g. Colab) instead of editing this cell.
DEBATE_CSV_PATH = os.environ.get(
    'DEBATE_CSV_PATH',
    r'C:\Users\katri\OneDrive\Desktop\US presidents\us_election_2020_1st_presidential_debate.csv',
)

# Load the transcript and report its (rows, columns) shape as a sanity check.
presidents = pd.read_csv(DEBATE_CSV_PATH)
print(presidents.shape)


In [None]:
# Preview the first five rows of the loaded transcript.
presidents.head(n=5)

In [None]:
# Register `progress_apply` on pandas objects so row-wise jobs show a progress bar.
tqdm.pandas()

# Download the VADER lexicon through nltk, then build the analyzer used below.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Define a function to calculate sentiment for a given text
def calculate_sentiment(text):
    """Return the VADER scores for ``text`` as a ``(neg, neu, pos, compound)`` tuple.

    The scores come from the module-level ``sia`` analyzer; the tuple order is
    fixed so callers can unpack it straight into four columns.
    """
    vader_scores = sia.polarity_scores(text)
    return tuple(vader_scores[label] for label in ('neg', 'neu', 'pos', 'compound'))

# Score every row of the 'text' column once (with a progress bar); each result
# is a (neg, neu, pos, compound) tuple.
sentiment_columns = ['neg', 'neu', 'pos', 'compound']
vader_scores = presidents['text'].progress_apply(calculate_sentiment)

# Expand the tuples into four columns in one step. Building a single DataFrame
# from the list of tuples avoids `.apply(pd.Series)`, which constructs a
# separate Series object for every row. The original index is reused so the
# new columns line up with the existing rows.
presidents[sentiment_columns] = pd.DataFrame(
    vader_scores.tolist(),
    index=presidents.index,
    columns=sentiment_columns,
)

# Inspect the updated DataFrame
presidents.head()

In [None]:
# Keep only the rows spoken by the two candidates.
candidate_names = ['Vice President Joe Biden', 'President Donald J. Trump']
is_candidate = presidents['speaker'].isin(candidate_names)
filtered_presidents = presidents[is_candidate]

# Bar chart of the VADER compound score for each candidate.
ax = sns.barplot(x='speaker', y='compound', data=filtered_presidents)
ax.set(title='Compound Score by Speaker')
plt.show()

In [None]:
# One panel per VADER component, side by side, each split by speaker.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
vader_panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for axis, (column, title) in zip(axs, vader_panels):
    sns.barplot(data=filtered_presidents, x='speaker', y=column, ax=axis)
    axis.set_title(title)
plt.tight_layout()
plt.show()

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Hugging Face Hub id of the pretrained sentiment checkpoint. This is a plain
# string: the original f-string prefix had no placeholders to interpolate.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Load the tokenizer and the sequence-classification model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
def polarity_scores_roberta(example, max_length=512):
    """Score one text with the module-level RoBERTa ``tokenizer`` and ``model``.

    Args:
        example: The text to classify.
        max_length: Maximum number of tokens sent to the model. Longer inputs
            are truncated instead of raising inside the model, so long
            utterances are still scored rather than dropped by the caller.
            NOTE(review): 512 is the usual input limit for RoBERTa-base
            checkpoints; confirm against this model's config.

    Returns:
        dict: ``'roberta_neg'``, ``'roberta_neu'`` and ``'roberta_pos'``, taken
        from positions 0, 1 and 2 of the softmax-normalised model output (the
        three values sum to 1).
    """
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded_text)
    # output[0] holds the logits for the batch; [0] selects the single example.
    logits = output[0][0].detach().numpy()
    scores = softmax(logits)  # normalise the raw logits into probabilities
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2],
    }

In [None]:
res = []

# (output label, short key) pairs shared by both scorers, in output column order.
score_labels = [('positive', 'pos'), ('negative', 'neg'), ('neutral', 'neu')]

# Score every utterance with both VADER and RoBERTa. Rows for which scoring
# raises a RuntimeError are reported and skipped.
for i, row in tqdm(presidents.iterrows(), total=len(presidents)):
    try:
        text = row['text']
        speaker = row['speaker']

        vader_scores = sia.polarity_scores(text)
        roberta_scores = polarity_scores_roberta(text)

        record = {'speaker': speaker, 'text': text}
        for label, short in score_labels:
            record[f'sia_{label}'] = vader_scores[short]
        record['sia_compound'] = vader_scores['compound']
        for label, short in score_labels:
            record[f'roberta_{label}'] = roberta_scores[f'roberta_{short}']
        # RoBERTa has no built-in compound score; positive minus negative is
        # used as a stand-in.
        record['roberta_compound'] = (
            roberta_scores['roberta_pos'] - roberta_scores['roberta_neg']
        )

        res.append(record)

    except RuntimeError:
        # Show the start of the offending text to help with debugging.
        print(f'Broke for id {i} with text: {text[:50]}')

In [None]:
# Collect the per-utterance score records into one DataFrame.
results_df = pd.DataFrame(res)

# Fail fast with a clear message. Previously this check only printed a warning
# and the very next line raised an opaque pandas KeyError anyway (this happens,
# for example, when `res` is empty and the frame has no columns at all).
if 'speaker' not in results_df.columns:
    raise KeyError(
        "'speaker' column is missing from results_df "
        "(did the scoring loop run and produce any rows?)"
    )

# Filter for Biden and Trump
filtered_df = results_df[results_df['speaker'].isin(['Vice President Joe Biden', 'President Donald J. Trump'])]

# Inspect the filtered results
filtered_df.head()

In [None]:
# Bar chart of the RoBERTa-derived compound score for each candidate.
ax = sns.barplot(x='speaker', y='roberta_compound', data=filtered_df)
ax.set(title='Compound Score by Speaker')
plt.show()

In [None]:
# One panel per RoBERTa class probability, side by side, each split by speaker.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
roberta_panels = [
    ('roberta_positive', 'Positive'),
    ('roberta_neutral', 'Neutral'),
    ('roberta_negative', 'Negative'),
]
for axis, (column, title) in zip(axs, roberta_panels):
    sns.barplot(data=filtered_df, x='speaker', y=column, ax=axis)
    axis.set_title(title)
plt.tight_layout()
plt.show()