In [1]:
import praw
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from tqdm import tqdm
from datetime import datetime
import os
    
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


  from .autonotebook import tqdm as notebook_tqdm


True
NVIDIA GeForce RTX 3060 Ti


In [12]:
# Config

DEVICE = 0 if torch.cuda.is_available() else -1
YEARS = list(range(2020, 2025))
SUBREDDITS = [
    'dota2', 'valorant', 'leagueoflegends', 'overwatch', 'fortnite',
    'Genshin_Impact', 'assasinscreed', 'amongus', 'minecraft', 'monsterhunter'
]
POST_LIMIT = 200
BATCH_SIZE = 32

In [13]:
# Load HuggingFace Pipelines (GPU-enabled)
twitter_roberta_sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment",
    device=DEVICE
)



Device set to use cuda:0


In [14]:
# Reddit auth
reddit = praw.Reddit(
    client_id='q_dYyqYYdNNInGsM-lC9Xg',
    client_secret='pLigWA6vX6llH7NjWBhVWmg-gJjKvg',
    user_agent='script:gaming_trend (by /u/HiGhastlyy)',
    username='HiGhastlyy',
    password='0306terror'
)

In [15]:
# Helper Functions

def get_year(utc_timestamp):
    return datetime.fromtimestamp(utc_timestamp).year

def batch_analyze(texts):
    results = []
    sentiments = twitter_roberta_sentiment_pipeline(texts, truncation=True, max_length=512)
   
    
    for sent in sentiments:
        results.append({
            'sentiment': sent['label'],
            'sentiment_score': sent['score'],
        })
    return results


In [19]:
# Scraping
all_data = []

output_dir = "subreddit_outputs"
os.makedirs(output_dir, exist_ok=True)

for sub_name in SUBREDDITS:
    output_path = os.path.join(output_dir, f"subreddit_{sub_name}.csv")
    
    # Skip if already processed
    if os.path.exists(output_path):
        print(f"Skipping r/{sub_name} (already scraped)")
        continue

    print(f"\nScraping r/{sub_name}...")
    subreddit = reddit.subreddit(sub_name)
    posts = list(subreddit.top(limit=POST_LIMIT, time_filter='all'))

    all_data = []
    post_batch_texts = []
    post_batch_meta = []
    

    for post in tqdm(posts, desc=f"Posts in r/{sub_name}"):
        post_year = get_year(post.created_utc)
        if post_year not in YEARS:
            continue

        post_text = f"{post.title} {post.selftext}".strip()
        if len(post_text) >= 10:
            post_batch_texts.append(post_text)
            post_batch_meta.append({
                'type': 'post',
                'subreddit': sub_name,
                'year': post_year,
                'id': post.id,
                'text': post_text,
                'created_utc': datetime.fromtimestamp(post.created_utc).isoformat()
            })

       

    # Final batch
    if post_batch_texts:
        results = batch_analyze(post_batch_texts)
        for meta, res in zip(post_batch_meta, results):
            all_data.append({**meta, **res})


    # Save this subreddit’s data
    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)
    print(f"Saved {len(df)} rows to {output_path}")





Scraping r/dota2...


Posts in r/dota2: 100%|██████████| 200/200 [00:00<00:00, 416928.83it/s]


Saved 61 rows to subreddit_outputs\subreddit_dota2.csv

Scraping r/valorant...


Posts in r/valorant: 100%|██████████| 200/200 [00:00<00:00, 289661.88it/s]


Saved 200 rows to subreddit_outputs\subreddit_valorant.csv

Scraping r/leagueoflegends...


Posts in r/leagueoflegends: 100%|██████████| 200/200 [00:00<00:00, 342952.09it/s]


Saved 144 rows to subreddit_outputs\subreddit_leagueoflegends.csv

Scraping r/overwatch...


Posts in r/overwatch: 100%|██████████| 200/200 [00:00<00:00, 558867.95it/s]


Saved 48 rows to subreddit_outputs\subreddit_overwatch.csv

Scraping r/fortnite...


Posts in r/fortnite: 100%|██████████| 200/200 [00:00<00:00, 480722.52it/s]


Saved 69 rows to subreddit_outputs\subreddit_fortnite.csv

Scraping r/Genshin_Impact...


Posts in r/Genshin_Impact: 100%|██████████| 200/200 [00:00<00:00, 323759.48it/s]


Saved 186 rows to subreddit_outputs\subreddit_Genshin_Impact.csv

Scraping r/assasinscreed...


Posts in r/assasinscreed: 100%|██████████| 200/200 [00:00<00:00, 652809.96it/s]


Saved 19 rows to subreddit_outputs\subreddit_assasinscreed.csv

Scraping r/amongus...


Posts in r/amongus: 100%|██████████| 200/200 [00:00<00:00, 302183.29it/s]


Saved 187 rows to subreddit_outputs\subreddit_amongus.csv

Scraping r/minecraft...


Posts in r/minecraft: 100%|██████████| 200/200 [00:00<00:00, 173497.58it/s]


Saved 165 rows to subreddit_outputs\subreddit_minecraft.csv

Scraping r/monsterhunter...


Posts in r/monsterhunter: 100%|██████████| 200/200 [00:00<00:00, 355901.91it/s]


Saved 73 rows to subreddit_outputs\subreddit_monsterhunter.csv
