<a href="https://colab.research.google.com/github/JamesMartinOU/PublicRedditSentimentAnalysis/blob/main/RedditCommentsSentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Python libraries
!pip install transformers torch emoji
!pip install mysql-connector-python

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvid

In [None]:
# Import Python libraries
import time
import warnings

import emoji
import gspread
import mysql.connector
import openpyxl
import pandas as pd
import torch
from google.auth import default
from google.colab import auth
from google.colab import files
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

In [None]:
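# RDS MySQL connection details: define DB_HOST, DB_USER, DB_PASSWORD and DB_NAME in this cell
# (the cells below require them). Load the values from a secrets store or environment
# variables instead of hardcoding them in the notebook.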

In [None]:
# Function preprocess - converts emojis to their text aliases and cleans text input
def preprocess(text):
    """
    Normalize a raw comment string for the sentiment model.

    Emojis are rewritten by `emoji.demojize`, newlines are replaced with
    single spaces, and leading/trailing whitespace is stripped.
    """
    demojized = emoji.demojize(text)
    single_line = demojized.replace('\n', ' ')
    return single_line.strip()

In [None]:
# Function split_text - segments an input to meet tokenizer limit
def split_text(text, chunk_size=450):
    """
    Clean `text` with preprocess() and cut it into consecutive pieces.

    Each piece holds at most `chunk_size` characters. The cut is made purely
    by character count, so it may fall in the middle of a word. An empty
    cleaned string yields an empty list.
    """
    cleaned = preprocess(text)
    pieces = []
    for start in range(0, len(cleaned), chunk_size):
        pieces.append(cleaned[start:start + chunk_size])
    return pieces

In [None]:
# Load Twitter sentiment model
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# The pipeline returns generic LABEL_n ids; translate them to readable names.
# Built once here instead of on every segment inside the loop.
label_map = {
    'LABEL_0': 'Negative',
    'LABEL_1': 'Neutral',
    'LABEL_2': 'Positive'
}

# Positional window of reddit_comments rows scored in this run (iloc semantics:
# start inclusive, stop exclusive).
# NOTE(review): 899999:1000000 selects 100,001 rows, not 100,000 - confirm the
# intended boundaries against the neighbouring batches.
BATCH_START = 899999
BATCH_STOP = 1000000

# MySQL connection
conn = mysql.connector.connect(
    host=DB_HOST,
    user=DB_USER,
    password=DB_PASSWORD,
    database=DB_NAME
)
try:
    cursor = conn.cursor()
    try:
        # Import all data from reddit_comments
        # NOTE(review): the query has no ORDER BY, so row order (and therefore the
        # positional window below) is whatever the server happens to return.
        cursor.execute("SELECT * FROM reddit_comments;")
        columns = [desc[0] for desc in cursor.description]
        data = cursor.fetchall()
    finally:
        # Release the cursor even when the query or fetch fails
        cursor.close()
finally:
    # Always close the connection so a failed query does not leak it
    conn.close()

# Write reddit_comments query result to df
df = pd.DataFrame(data, columns=columns)
# Keep only the rows at positions BATCH_START..BATCH_STOP-1 of the query result
df = df.iloc[BATCH_START:BATCH_STOP].reset_index(drop=True)

# Create results to store sentiment data (one record per scored segment)
results = []

# Only three columns are needed, so iterate over them directly rather than
# materialising a Series per row with iterrows().
for post_id, comment_id, body in zip(df['post_id'], df['comment_id'], df['body']):
    # Comments without a body cannot be scored
    if pd.isna(body):
        continue

    chunks = split_text(body)

    for segment, chunk in enumerate(chunks, start=1):
        try:
            # truncation/max_length guard against a chunk that still exceeds
            # the tokenizer's token limit after character-based splitting
            result = sentiment_pipeline(chunk, truncation=True, max_length=512)[0]
            sentiment = label_map[result['label']]
            score = round(result['score'], 4)
            results.append({
                'post_id': post_id,
                'comment_id': comment_id,
                'segment': segment,
                'body_segment': chunk,
                'sentiment': sentiment,
                'confidence': score
            })
        except Exception as e:
            # Best effort: report the failing segment and keep processing the rest
            print(f"Error processing comment {comment_id} segment {segment}: {e}")

# Convert to DataFrame
sentiment_df = pd.DataFrame(results)
# Save dataframe locally
sentiment_df.to_csv("reddit_sentiment_results.csv", index=False)
files.download("reddit_sentiment_results.csv")
# Preview
print(sentiment_df.head())

Device set to use cuda:0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

   post_id comment_id  segment  \
0  1iskfi0    mdhr1s8        1   
1  1ismseo    mdhrniv        1   
2  1ismseo    mdhrt06        1   
3  1d7hkfq    mdhspu3        1   
4  1ismseo    mdhu4vj        1   

                                        body_segment sentiment  confidence  
0  What is more concerning is how silent the boar...  Negative      0.8220  
1  Yes, I'm short Tesla, yes I want attention so ...   Neutral      0.4756  
2  If you wanna go short on anything you need som...   Neutral      0.8001  
3  I know what you mean.  I, to point out what yo...  Negative      0.4574  
4  “I bought a single way OTM put and I want copi...   Neutral      0.7890  


In [None]:
# Write sentiment_df to reddit_comments_sentiment table
# Build the URL from its components so that special characters in the
# credentials (e.g. '@', '/', ':') are escaped instead of corrupting the URL.
db_url = URL.create(
    drivername="mysql+mysqlconnector",
    username=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    database=DB_NAME,
)
engine = create_engine(db_url)
# NOTE(review): if_exists='replace' drops and recreates the table on every run,
# so results written by earlier batches are discarded - confirm this is intended.
sentiment_df.to_sql(name='reddit_comments_sentiment', con=engine, if_exists='replace', index=False)
print("Sentiment results written to 'reddit_comments_sentiment' table.")

In [None]:
# Query result set
# Build the URL from its components so that special characters in the
# credentials (e.g. '@', '/', ':') are escaped instead of corrupting the URL.
db_url = URL.create(
    drivername="mysql+mysqlconnector",
    username=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    database=DB_NAME,
)
engine = create_engine(db_url)
# Run query and load into DataFrame
query = "SELECT * FROM reddit_comments_sentiment;"
df_sentiment = pd.read_sql(query, engine)
# Preview result
print(df_sentiment.head())

   post_id comment_id  segment  \
0  10nczjx    j69rt1h        1   
1  10nczjx    j69rtsw        1   
2  10nczjx    j69rtz9        1   
3  10nczjx    j69rucn        1   
4  10nbau4    j69rwbw        1   

                                        body_segment sentiment  confidence  
0                                       Supersize Me   Neutral      0.7779  
1  Market pays what it needs to pay  If they cant...  Negative      0.7331  
2  So that's a fairly large caveat saying McDonal...   Neutral      0.6132  
3  Fuck those jobs then. Excited society doesn't ...  Negative      0.8726  
4  Media members aren't allowed to place bets on ...  Negative      0.7382  
