In [None]:
import pandas as pd
import re
import contractions
import emoji
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline

# Load the scraped batch. Each frame is tagged with where its rows came from,
# and the comment body column is renamed so both frames expose "text".
posts = pd.read_csv("reddit_posts_batch_500_20250421_190157.csv")
posts["source"] = "post"

comments = pd.read_csv(
    "reddit_comments_batch_500_20250421_190157.csv"
).rename(columns={"comment_body": "text"})
comments["source"] = "comment"

# Stack posts on top of comments, discard rows without text, and force the
# text column to str. Dropped rows leave gaps in the index (it is not reset
# after the dropna).
combined = (
    pd.concat(
        [
            posts.loc[:, ["text", "source"]],
            comments.loc[:, ["text", "source"]],
        ],
        ignore_index=True,
    )
    .dropna(subset=["text"])
    .astype({"text": str})
)

# Lazily-built regex shared by every clean_text call; see _strip_pattern().
_STRIP_PATTERN = None


def _strip_pattern():
    """Return the compiled regex matching characters that clean_text removes.

    A character is removed when it is not a word character, not whitespace,
    and not one of the code points used by an emoji in ``emoji.EMOJI_DATA``.
    The pattern is built on first use and reused afterwards, because
    assembling it from the full emoji table for every row is expensive.
    """
    global _STRIP_PATTERN
    if _STRIP_PATTERN is None:
        # EMOJI_DATA keys may be multi-code-point sequences; a character class
        # only needs the distinct code points. ASCII members (the '#', '*' and
        # digits that appear in keycap emoji) are left out so that ordinary
        # '#' and '*' punctuation is still stripped; digits survive via \w.
        emoji_codepoints = sorted(
            {
                ch
                for sequence in emoji.EMOJI_DATA
                for ch in sequence
                if not ch.isascii()
            }
        )
        _STRIP_PATTERN = re.compile(
            r"[^\w\s" + re.escape("".join(emoji_codepoints)) + "]"
        )
    return _STRIP_PATTERN


def clean_text(text):
    """Normalise one post/comment string for sentiment scoring.

    Steps, in order: expand contractions, remove URLs / @mentions / #hashtags,
    turn line breaks into spaces, strip everything that is not a word
    character, whitespace or emoji, collapse whitespace runs, trim, lowercase.

    Args:
        text: The raw text (must be a ``str``).

    Returns:
        The cleaned, lower-cased string; may be empty.
    """
    expanded = contractions.fix(text)
    cleaned = re.sub(r"http\S+|www\S+|@\w+|#\w+", "", expanded)
    # Replace line breaks with a space (not ""), otherwise the last word of
    # one line is fused with the first word of the next.
    cleaned = re.sub(r"[\r\n]+", " ", cleaned)
    cleaned = _strip_pattern().sub("", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip().lower()

def sentiment_analysis(df, text_col="text"):
    """Label each row as positive / negative / neutral using VADER, with a
    RoBERTa second opinion for rows VADER is unsure about.

    Pipeline:
      1. Clean ``df[text_col]`` with ``clean_text`` and drop rows whose
         cleaned text is empty.
      2. Score with VADER and rescale ``compound`` from [-1, 1] to [0, 1]
         (``vader_scaled``).
      3. Rows with ``0.3 <= vader_scaled <= 0.7`` are flagged
         ``needs_verify`` and re-scored with
         ``cardiffnlp/twitter-roberta-base-sentiment``; their final score is
         ``0.4 * vader_scaled + 0.6 * roberta_score`` and is labelled with the
         0.6 / 0.4 thresholds.
      4. All other rows keep the VADER score, labelled with the 0.7 / 0.3
         thresholds.

    If the RoBERTa step raises, the error is printed and every row keeps its
    VADER-only score, label and ``model_source == 'vader'``.

    Args:
        df: Frame containing the text column. NOTE: a ``cleaned_text`` column
            is written onto this frame in place (behaviour kept from the
            earlier version of this function).
        text_col: Name of the column holding the raw text.

    Returns:
        A new DataFrame (rows with empty cleaned text removed, original index
        labels kept) with the added columns ``cleaned_text``, ``neg``,
        ``neu``, ``pos``, ``compound``, ``vader_scaled``, ``needs_verify``,
        ``model_source``, ``final_sentiment``, ``roberta_score`` and
        ``final_score``. ``roberta_score`` is NaN for rows RoBERTa did not
        score.
    """
    df['cleaned_text'] = df[text_col].apply(clean_text)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a filtered view of the caller's data.
    df = df[df['cleaned_text'].str.strip().ne('')].copy()

    vader = SentimentIntensityAnalyzer()
    # Explicit columns keep the schema intact even when no rows are left.
    vader_results = pd.DataFrame(
        [vader.polarity_scores(text) for text in df['cleaned_text']],
        index=df.index,
        columns=['neg', 'neu', 'pos', 'compound'],
    ).astype(float)
    df = pd.concat([df, vader_results], axis=1)

    df['vader_scaled'] = (df['compound'] + 1) / 2
    df['needs_verify'] = df['vader_scaled'].between(0.3, 0.7)  # inclusive
    df['model_source'] = 'vader'

    df['final_sentiment'] = np.select(
        [
            df['vader_scaled'] >= 0.7,
            df['vader_scaled'] <= 0.3
        ],
        ['positive', 'negative'],
        default='neutral'
    )

    # Always present, so the output schema does not depend on whether any row
    # needed verification or on whether RoBERTa succeeded.
    df['roberta_score'] = np.nan
    df['final_score'] = df['vader_scaled']

    mask = df['needs_verify']
    if not mask.any():
        return df

    # RoBERTa only for the rows VADER left in the uncertain band
    roberta_pipe = pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment",
        device=-1
    )

    try:
        roberta_texts = df.loc[mask, 'cleaned_text'].tolist()
        results = roberta_pipe(
            roberta_texts,
            batch_size=8,
            # Long posts exceed the model's position limit and make the whole
            # batch fail unless they are truncated explicitly.
            truncation=True,
            max_length=512,
            # Ask for the score of every label; the default returns only the
            # top label, which cannot provide both LABEL_0 and LABEL_2.
            top_k=None,
        )

        roberta_scores = []
        for result in results:
            score_dict = {item['label']: item['score'] for item in result}
            # LABEL_2 - LABEL_0 lies in [-1, 1]; rescale to [0, 1] like VADER.
            # NOTE(review): assumes LABEL_0 = negative and LABEL_2 = positive
            # for this checkpoint -- confirm against the model card.
            scaled = (score_dict['LABEL_2'] - score_dict['LABEL_0'] + 1) / 2
            roberta_scores.append(scaled)
    except Exception as e:
        # Deliberate best-effort: report and fall back to the VADER-only
        # results already stored, instead of tagging rows 'Combination' with
        # a NaN score.
        print(f"RoBERTa error: {e}")
        return df

    df.loc[mask, 'roberta_score'] = roberta_scores
    blended = (
        0.4 * df.loc[mask, 'vader_scaled'] + 0.6 * df.loc[mask, 'roberta_score']
    )
    df.loc[mask, 'final_score'] = blended
    df.loc[mask, 'final_sentiment'] = np.select(
        [
            blended >= 0.6,
            blended <= 0.4
        ],
        ['positive', 'negative'],
        default='neutral'
    )
    df.loc[mask, 'model_source'] = 'Combination'

    return df

# Score the combined posts/comments and persist the full result table
# (the DataFrame index is not written to the CSV).
results = sentiment_analysis(combined)
results.to_csv("sentiment_results.csv", index=False)

# Preview the first rows: cleaned text, final label, and which model decided.
print(results.loc[:, ['cleaned_text', 'final_sentiment', 'model_source']].head())


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


RoBERTa error: The expanded size of the tensor (569) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [8, 569].  Tensor sizes: [1, 514]
                                        cleaned_text final_sentiment  \
0  in an lbw it is either hitting the stumps or n...         neutral   
1  like rinku ashutosh shashank sharukh khan nama...         neutral   
3  kkr released gill for venky and gill scored 90...        positive   
4  i have not seen his performance in half of the...        negative   
5  what happened today tewatia duck so he is proa...         neutral   

  model_source  
0  Combination  
1  Combination  
3        vader  
4        vader  
5  Combination  
