In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

In [None]:
# Load the processed sentiment CSV into `df`. The path is relative, so it is
# resolved against the kernel's current working directory.
data_path = Path("data/processed/Sentiment_Analysis_Roberta.csv")
# is_file() (rather than exists()) also rejects a directory sitting at this
# path, which would otherwise fail later inside read_csv with a less clear error.
if not data_path.is_file():
    # Report both the expected relative path and where it actually resolved to,
    # so a wrong working directory is obvious from the message.
    raise FileNotFoundError(
        f"Expected CSV not found. Place file at {data_path} "
        f"(resolved to {data_path.resolve()})."
    )
df = pd.read_csv(data_path)

In [3]:
# Regexes for the literal bracketed placeholder strings to filter out.
# The brackets are escaped so they are matched as plain characters.
remove_patterns = [r'\[deleted\]', r'\[removed\]', r'\[ Removed by Reddit \]']

# One alternation that matches any of the placeholders.
pattern = '|'.join(remove_patterns)

# True where `comment_body` contains a placeholder anywhere (case-insensitive).
# na=False makes missing bodies count as "no match", so those rows are kept.
is_placeholder = df['comment_body'].str.contains(pattern, case=False, na=False, regex=True)

# Independent copy of the surviving rows, so later column assignments on
# `df_clean` do not touch `df`.
df_clean = df.loc[~is_placeholder].copy()

In [4]:
# Fixed ordering of the sentiment labels, from most negative to most positive.
sentiment_order = ['negative', 'neutral', 'positive']

# Re-encode the label column as an ordered categorical using that ordering.
# Values that are not one of the three categories become NaN.
ordered_labels = pd.Categorical(
    df_clean['sentiment_label'],
    categories=sentiment_order,
    ordered=True,
)
df_clean['sentiment_label'] = ordered_labels

In [5]:
# Notebook-wide plotting defaults: seaborn's "whitegrid" style and a 12x8
# default figure size for figures that do not pass their own figsize.
sns.set_style(style="whitegrid")
plt.rc('figure', figsize=(12, 8))

In [None]:
# Within each keyword, the fraction of comments carrying each sentiment label
# (normalize=True makes the values of one keyword sum to 1).
label_shares = df_clean.groupby('keyword')['sentiment_label'].value_counts(normalize=True)

# Pivot to one row per keyword and one column per label, then force the
# columns into negative / neutral / positive order.
source_sentiment_proportions = label_shares.unstack().reindex(columns=sentiment_order)

# Stacked bar chart: one bar per keyword, segmented by sentiment label.
ax = source_sentiment_proportions.plot(kind='bar', stacked=True, figsize=(14, 8), colormap='RdYlGn')
ax.set_title('Composition of Sentiment by Energy Source', fontsize=16)
ax.set_xlabel('Energy Source', fontsize=14)
ax.set_ylabel('Proportion of Sentiment', fontsize=14)
# Anchor the legend outside the axes (to the right) so it does not cover bars.
ax.legend(title='Sentiment Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [7]:
# Rows of `df_clean` whose sentiment label is anything other than 'neutral'.
# Rows with a missing label do not compare equal to 'neutral', so they stay.
is_neutral = df_clean['sentiment_label'] == 'neutral'
df_opinions = df_clean[~is_neutral]

In [None]:
# Share of POSITIVE opinions per keyword, computed over non-neutral comments.
# Rows are keywords; columns are the comment counts per sentiment label.
sentiment_counts = (
    df_opinions
    .groupby(['keyword', 'sentiment_label'], observed=True)
    .size()
    .unstack(fill_value=0)
    # observed=True drops labels that never occur, so pin both columns here;
    # otherwise the 'positive' / 'negative' lookups below could raise KeyError.
    .reindex(columns=['negative', 'positive'], fill_value=0)
)

# Fraction in [0, 1]: positive / (positive + negative) for each keyword.
positive_ratio = sentiment_counts['positive'] / (sentiment_counts['positive'] + sentiment_counts['negative'])
# The same values on a 0-100 scale, so the chart matches its "Percentage"
# axis label and the figures printed below.
positive_percentage = positive_ratio * 100

# Bar chart of the positive share per keyword, with a 50% reference line.
plt.figure(figsize=(10, 6))
sns.barplot(
    x=positive_percentage.index,
    y=positive_percentage,
    hue=positive_percentage.index,
    legend=False,
    palette='viridis',
)
plt.axhline(50, color='red', linestyle='--', label='Neutral Opinion Line (50/50)')
plt.title('Ratio of Positive to Negative Opinions (Excluding Neutral Comments)')
plt.xlabel('Energy Source')
plt.ylabel('Percentage of Positive Opinions')
plt.legend()
plt.show()

# Per-keyword percentages. `.get` yields NaN instead of raising KeyError when
# a keyword is absent from the data.
hydro_percentage = positive_percentage.get('hydro', float('nan'))
solar_percentage = positive_percentage.get('solar', float('nan'))
wind_percentage = positive_percentage.get('wind', float('nan'))

print("Percentage of non-neutral comments that are positive:")
for display_name, percentage in (
    ("Hydropower", hydro_percentage),
    ("Solar", solar_percentage),
    ("Wind", wind_percentage),
):
    print(f"{display_name}: {round(percentage, 1)}%")