In [11]:
import os
import time
from datetime import datetime, timezone

import pandas as pd
import praw
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
    
# Report whether a CUDA GPU is visible to PyTorch.
print(torch.cuda.is_available())
# Only query the device name when a GPU exists: torch.cuda.get_device_name
# raises on CPU-only machines, which would abort this cell even though the
# rest of the notebook falls back to the CPU (DEVICE = -1).
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))


True
NVIDIA GeForce RTX 3060 Ti


In [14]:
# Configuration shared by the cells below.

# Device index handed to the HuggingFace pipelines: 0 when CUDA is available,
# otherwise -1.
if torch.cuda.is_available():
    DEVICE = 0
else:
    DEVICE = -1

# Calendar years whose posts/comments are kept (2020 through 2024 inclusive).
YEARS = [year for year in range(2020, 2025)]

# Subreddits to scrape, in processing order.
# NOTE(review): 'assasinscreed' may be a misspelling of 'assassinscreed' —
# confirm which community is intended before changing it.
SUBREDDITS = [
    'dota2',
    'valorant',
    'leagueoflegends',
    'overwatch',
    'fortnite',
    'Genshin_Impact',
    'assasinscreed',
    'amongus',
    'minecraft',
    'monsterhunter',
]

# Maximum number of top posts requested per subreddit.
POST_LIMIT = 100
# Number of buffered texts that triggers a model inference call.
BATCH_SIZE = 32

In [None]:
# Load the two HuggingFace pipelines on DEVICE (GPU index, or -1 for CPU).
# Each pipeline uses the tokenizer published under the same model id.
sentiment_model_id = "cardiffnlp/twitter-roberta-base-sentiment"
emotion_model_id = "j-hartmann/emotion-english-distilroberta-base"

# Sentiment classifier.
twitter_roberta_sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=sentiment_model_id,
    tokenizer=sentiment_model_id,
    device=DEVICE,
)

# Emotion classifier; top_k=1 asks for only the highest-scoring label.
emotion_pipeline = pipeline(
    "text-classification",
    model=emotion_model_id,
    tokenizer=emotion_model_id,
    top_k=1,
    device=DEVICE,
)

Device set to use cuda:0
Device set to use cuda:0


In [15]:
# Reddit auth
# Credentials are read from environment variables so that no secret is stored
# in (or committed with) the notebook. A missing variable raises KeyError
# naming the variable, which fails fast before any scraping starts.
# Required: REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD.
# NOTE(review): the credentials that were previously hardcoded in this cell
# should be considered leaked and rotated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent='script:gaming_trend (by /u/HiGhastlyy)',
    username=os.environ["REDDIT_USERNAME"],
    password=os.environ["REDDIT_PASSWORD"],
)

In [None]:
# Helper Functions

def get_year(utc_timestamp):
    """Return the calendar year, in UTC, of a POSIX timestamp.

    The conversion is pinned to UTC so the result does not depend on the
    timezone of the machine running the notebook; a naive
    ``datetime.fromtimestamp`` would shift timestamps near New Year into the
    neighbouring year on non-UTC machines.

    Args:
        utc_timestamp: Seconds since the Unix epoch (int or float).

    Returns:
        The four-digit year as an int.
    """
    return datetime.fromtimestamp(utc_timestamp, tz=timezone.utc).year

def batch_analyze(texts):
    """Run the sentiment and emotion pipelines over a list of texts.

    Both pipelines are called with truncation enabled and ``max_length=512``.

    Args:
        texts: List of strings to classify.

    Returns:
        A list with one dict per paired prediction, holding the keys
        ``sentiment``, ``sentiment_score``, ``emotion`` and ``emotion_score``.
    """
    tokenizer_kwargs = {"truncation": True, "max_length": 512}
    sentiment_preds = twitter_roberta_sentiment_pipeline(texts, **tokenizer_kwargs)
    emotion_preds = emotion_pipeline(texts, **tokenizer_kwargs)

    # Each emotion prediction is indexed at [0]: presumably a one-element list
    # because the pipeline was built with top_k=1 — verify if that changes.
    return [
        {
            'sentiment': sentiment_pred['label'],
            'sentiment_score': sentiment_pred['score'],
            'emotion': emotion_pred[0]['label'],
            'emotion_score': emotion_pred[0]['score'],
        }
        for sentiment_pred, emotion_pred in zip(sentiment_preds, emotion_preds)
    ]


In [None]:
# Scraping
# For each subreddit: fetch the top posts, keep posts/comments from YEARS,
# classify their text in batches, and write one CSV per subreddit.

# Initialised here so the name exists even when every subreddit is skipped;
# it is rebound to a fresh list for each subreddit that is scraped.
all_data = []

output_dir = "subreddit_outputs"
os.makedirs(output_dir, exist_ok=True)

# Texts shorter than this many characters are not analyzed.
MIN_TEXT_LENGTH = 10
# Number of top-level comments examined per post.
COMMENTS_PER_POST = 10


def _utc_isoformat(utc_timestamp):
    """Return a naive ISO-8601 string for a POSIX timestamp, expressed in UTC."""
    # Convert in UTC (not local time) so the 'created_utc' column really is
    # UTC; tzinfo is dropped to keep the original naive string format.
    moment = datetime.fromtimestamp(utc_timestamp, tz=timezone.utc)
    return moment.replace(tzinfo=None).isoformat()


def _flush_batch(texts, metas, sink):
    """Analyze buffered texts, append merged rows to sink, then empty both buffers."""
    if not texts:
        return
    for meta, res in zip(metas, batch_analyze(texts)):
        sink.append({**meta, **res})
    texts.clear()
    metas.clear()


for sub_name in SUBREDDITS:
    output_path = os.path.join(output_dir, f"subreddit_{sub_name}.csv")

    # Skip if already processed (the CSV acts as a per-subreddit cache).
    if os.path.exists(output_path):
        print(f"Skipping r/{sub_name} (already scraped)")
        continue

    print(f"\nScraping r/{sub_name}...")
    subreddit = reddit.subreddit(sub_name)
    posts = list(subreddit.top(limit=POST_LIMIT, time_filter='all'))

    all_data = []
    post_batch_texts = []
    post_batch_meta = []
    comment_batch_texts = []
    comment_batch_meta = []

    for post in tqdm(posts, desc=f"Posts in r/{sub_name}"):
        post_year = get_year(post.created_utc)
        # Posts outside YEARS are dropped together with all their comments.
        if post_year not in YEARS:
            continue

        post_text = f"{post.title} {post.selftext}".strip()
        if len(post_text) >= MIN_TEXT_LENGTH:
            post_batch_texts.append(post_text)
            post_batch_meta.append({
                'type': 'post',
                'subreddit': sub_name,
                'year': post_year,
                'id': post.id,
                'parent_id': None,
                'text': post_text,
                'created_utc': _utc_isoformat(post.created_utc)
            })

        # Comments: replace_more(limit=0) is called before slicing so the
        # slice below iterates comment objects that expose .body.
        post.comments.replace_more(limit=0)
        for comment in post.comments[:COMMENTS_PER_POST]:
            comment_text = comment.body.strip()
            if len(comment_text) < MIN_TEXT_LENGTH:
                continue
            comment_year = get_year(comment.created_utc)
            if comment_year not in YEARS:
                continue
            comment_batch_texts.append(comment_text)
            comment_batch_meta.append({
                'type': 'comment',
                'subreddit': sub_name,
                'year': comment_year,
                'id': comment.id,
                'parent_id': post.id,
                'text': comment_text,
                'created_utc': _utc_isoformat(comment.created_utc)
            })

        # Run inference once a buffer has reached BATCH_SIZE texts.
        if len(post_batch_texts) >= BATCH_SIZE:
            _flush_batch(post_batch_texts, post_batch_meta, all_data)
        if len(comment_batch_texts) >= BATCH_SIZE:
            _flush_batch(comment_batch_texts, comment_batch_meta, all_data)

    # Final batch: analyze whatever is still buffered (posts first, then comments).
    _flush_batch(post_batch_texts, post_batch_meta, all_data)
    _flush_batch(comment_batch_texts, comment_batch_meta, all_data)

    # Save this subreddit's data
    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)
    print(f"Saved {len(df)} rows to {output_path}")





🚀 Scraping r/dota2...


Posts in r/dota2: 100%|██████████| 100/100 [01:29<00:00,  1.12it/s]


✅ Saved 284 rows to subreddit_outputs\subreddit_dota2.csv

🚀 Scraping r/valorant...


Posts in r/valorant: 100%|██████████| 100/100 [04:53<00:00,  2.93s/it]


✅ Saved 1033 rows to subreddit_outputs\subreddit_valorant.csv

🚀 Scraping r/leagueoflegends...


Posts in r/leagueoflegends: 100%|██████████| 100/100 [03:50<00:00,  2.31s/it]


✅ Saved 658 rows to subreddit_outputs\subreddit_leagueoflegends.csv

🚀 Scraping r/overwatch...


Posts in r/overwatch: 100%|██████████| 100/100 [01:26<00:00,  1.15it/s]


✅ Saved 243 rows to subreddit_outputs\subreddit_overwatch.csv

🚀 Scraping r/fortnite...


Posts in r/fortnite: 100%|██████████| 100/100 [01:06<00:00,  1.50it/s]


✅ Saved 367 rows to subreddit_outputs\subreddit_fortnite.csv

🚀 Scraping r/Genshin_Impact...


Posts in r/Genshin_Impact: 100%|██████████| 100/100 [05:33<00:00,  3.33s/it]


✅ Saved 1039 rows to subreddit_outputs\subreddit_Genshin_Impact.csv

🚀 Scraping r/assasinscreed...


Posts in r/assasinscreed: 100%|██████████| 100/100 [00:15<00:00,  6.46it/s]


✅ Saved 94 rows to subreddit_outputs\subreddit_assasinscreed.csv

🚀 Scraping r/amongus...


Posts in r/amongus: 100%|██████████| 100/100 [05:59<00:00,  3.60s/it]


✅ Saved 1025 rows to subreddit_outputs\subreddit_amongus.csv

🚀 Scraping r/minecraft...


Posts in r/minecraft: 100%|██████████| 100/100 [05:12<00:00,  3.13s/it]


✅ Saved 892 rows to subreddit_outputs\subreddit_minecraft.csv

🚀 Scraping r/monsterhunter...


Posts in r/monsterhunter: 100%|██████████| 100/100 [01:38<00:00,  1.01it/s]


✅ Saved 376 rows to subreddit_outputs\subreddit_monsterhunter.csv
