### Import Libraries

In [None]:
import os
import glob
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

In [None]:
# Merge every CSV file in 'Opinions' into one DataFrame with the columns
# 'opinion' (mandatory), 'sentiment' and 'score' (optional, NA when absent).

# Recognised source column names for each normalized field, in priority order.
opinion_candidates = ['opinion', 'content', 'text', 'review', 'body', 'comment']
sentiment_candidates = ['sentiment', 'label', 'polarity']
score_candidates = ['score', 'sentiment_score', 'compound', 'rating', 'rating_score']


def normalize_opinion_frame(df):
    """Return *df* reduced to 'opinion', 'sentiment' and 'score' columns.

    Column names are matched case-insensitively after stripping whitespace.
    The first column is used as the opinion text when no opinion candidate
    matches. Rows whose opinion is missing or blank are dropped, and 'score'
    is coerced to numeric (unparseable values become NaN).

    Returns None when *df* has no columns at all.
    """
    df = df.rename(columns=lambda c: str(c).strip().lower())
    if len(df.columns) == 0:
        # no usable columns in this file
        return None

    def first_match(candidates):
        return next((c for c in candidates if c in df.columns), None)

    opinion_col = first_match(opinion_candidates)
    if opinion_col is None:
        # fallback: use first column as opinion if nothing matched
        opinion_col = df.columns[0]
    sentiment_col = first_match(sentiment_candidates)
    score_col = first_match(score_candidates)

    raw_text = df[opinion_col]
    # Record missing values BEFORE the string cast: astype(str) can turn
    # NaN/None into the literal text 'nan'/'None', which would otherwise
    # survive the "drop empty opinions" filter as fake opinions.
    has_text = raw_text.notna()
    text = raw_text.astype(str).str.strip()

    out = pd.DataFrame({'opinion': text})
    out['sentiment'] = df[sentiment_col] if sentiment_col is not None else pd.NA
    out['score'] = df[score_col] if score_col is not None else pd.NA

    # drop missing and blank opinions
    out = out[has_text & (text != '')].copy()

    # coerce score to numeric where possible
    out['score'] = pd.to_numeric(out['score'], errors='coerce')
    return out


opinions_dir = os.path.join(os.getcwd(), 'Opinions')
pattern = os.path.join(opinions_dir, '*.csv')
# glob order depends on the filesystem; sort so the row order is reproducible
files = sorted(glob.glob(pattern))

dfs = []
for f in files:
    try:
        df = pd.read_csv(f, dtype=str, encoding='utf-8', engine='python')
    except Exception as e:
        # deliberate best-effort: one unreadable file must not abort the merge
        print(f'Warning: failed to read {f}: {e}')
        continue

    out = normalize_opinion_frame(df)
    if out is not None:
        dfs.append(out)

if dfs:
    consolidated = pd.concat(dfs, ignore_index=True)
else:
    consolidated = pd.DataFrame(columns=['opinion', 'sentiment', 'score'])
consolidated

In [None]:
# Work on an independent copy of the three analysis columns and lower-case the
# sentiment labels (cast to str first, so missing labels become text as well).
opinions = consolidated.loc[:, ['opinion', 'score', 'sentiment']].copy()
opinions['sentiment'] = opinions['sentiment'].astype(str).str.lower()
opinions

In [None]:
# Score every opinion with VADER; each cell of the new 'vader' column holds
# whatever polarity_scores returns for that row's text.
sa = SentimentIntensityAnalyzer()


def _score_text(text):
    """Return the VADER polarity scores for *text* (cast to str first)."""
    return sa.polarity_scores(str(text))


opinions['vader'] = opinions['opinion'].apply(_score_text)
opinions

In [None]:
# Expand the per-row VADER score dicts into one numeric column per score and
# attach them to the opinions, replacing the raw 'vader' dict column.
# Building a single DataFrame from the list of dicts avoids constructing one
# Series per row (what `apply(pd.Series)` does), and naming the columns
# explicitly keeps 'compound' present even when there are no rows.
vader_scores = pd.DataFrame(
    opinions['vader'].tolist(),
    index=opinions.index,
    columns=['neg', 'neu', 'pos', 'compound'],
)
sentiment = pd.concat([opinions.drop(columns=['vader']), vader_scores], axis=1)
sentiment

In [None]:
# Turn the VADER compound score into a class label:
#   compound >=  0.05 -> 'positive'
#   compound <= -0.05 -> 'negative'
#   anything else (including NaN) -> 'neutral'
# Vectorized masks replace the row-wise apply: same labels, but no Python-level
# call per row and no special behaviour on an empty frame.
compound = sentiment['compound']
sentiment['vader_sentiment'] = 'neutral'
sentiment.loc[compound >= 0.05, 'vader_sentiment'] = 'positive'
sentiment.loc[compound <= -0.05, 'vader_sentiment'] = 'negative'
sentiment

In [None]:
# Distribution of the sentiment labels read from the source files
label_counts = sentiment['sentiment'].value_counts()
label_counts

In [None]:
# Confusion matrix of file-provided labels (rows) vs. VADER labels (columns),
# restricted to the three classes below and in that order.
class_labels = ['positive', 'neutral', 'negative']
y_true = sentiment['sentiment'].astype(str)
y_pred = sentiment['vader_sentiment'].astype(str)
misclassified = confusion_matrix(y_true, y_pred, labels=class_labels)
misclassified

In [None]:
# Plot the confusion matrix with the class names on both axes
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=misclassified,
    display_labels=['positive', 'neutral', 'negative'],
)
cm_display.plot()

In [None]:
# Per-class precision/recall/F1 of VADER against the file-provided labels,
# returned as a nested dict (output_dict=True) instead of a printable string.
report_true = sentiment['sentiment'].astype(str)
report_pred = sentiment['vader_sentiment'].astype(str)
raport = classification_report(
    report_true,
    report_pred,
    labels=['positive', 'neutral', 'negative'],
    output_dict=True,
)

In [None]:
# One row per report entry, one column per metric
raport_df = pd.DataFrame(raport).T

### Save results

In [None]:
# Persist the per-opinion scores and the classification report
sentiment.to_csv('./sentiment_analysis_vader.csv', index=False, encoding='utf-8')
raport_df.to_csv('./classification_vader_report.csv', index=True, encoding='utf-8')

# Build the labelled confusion-matrix frame first and write it afterwards:
# to_csv returns None when given a path, so chaining it onto the constructor
# would leave `misclassified_df` bound to None instead of the DataFrame.
misclassified_df = pd.DataFrame(
    misclassified,
    index=['actual_positive', 'actual_neutral', 'actual_negative'],
    columns=['predicted_positive', 'predicted_neutral', 'predicted_negative'],
)
misclassified_df.to_csv('./confusion_matrix_vader.csv', index=True, encoding='utf-8')

# Render the confusion matrix again and save the figure as an image
ConfusionMatrixDisplay(misclassified, display_labels=['positive', 'neutral', 'negative']).plot().figure_.savefig('./confusion_matrix_vader.png')