In [None]:
# %pip install praw
# %pip install google-generativeai
# %pip install ipywidgets

In [1]:
import google.generativeai as genai
import os

# Configure the google.generativeai client with the API key taken from the
# GEMINI_API_KEY environment variable (a KeyError is raised here if it is unset),
# so the key itself never appears in the notebook.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [11]:
import praw
import pandas as pd

# Build the Reddit API client. Every credential is read from a PRAW_* environment
# variable rather than being written into the notebook; set them before running
# this cell (a missing variable raises KeyError on the corresponding line).
reddit = praw.Reddit(
    client_id=os.environ["PRAW_CLIENT_ID"],
    client_secret=os.environ["PRAW_CLIENT_SECRET"],
    user_agent=os.environ["PRAW_USER_AGENT"],
    username=os.environ["PRAW_USERNAME"],
    password=os.environ["PRAW_PASSWORD"],
)

# Pull up to 1000 entries from Reddit's "popular" listing. The author's rationale for
# the large limit: it should make the sample representative of the largest subreddits
# by subscribers (cross-check: https://gummysearch.com/tools/top-subreddits/).
popular_listing = reddit.subreddits.popular(limit=1000)
subreddits = list(popular_listing)


def _subreddit_record(sub):
    """Return the attributes of one subreddit that the rest of the notebook uses."""
    return {
        "Name": sub.display_name,
        "Subscribers": sub.subscribers,
        "Description": sub.public_description,
        "Over 18": sub.over18,
        "Submission Type": sub.submission_type,
    }


# Build all records first and construct the frame once, then order it from the most
# to the least subscribed subreddit with a fresh 0..n-1 index.
subreddit_records = [_subreddit_record(sub) for sub in subreddits]
subs_df = (
    pd.DataFrame(subreddit_records)
    .sort_values(by="Subscribers", ascending=False, ignore_index=True)
)

# Show the ten most-subscribed subreddits
subs_df.head(10)


Unnamed: 0,Name,Subscribers,Description,Over 18,Submission Type
0,funny,65904846,Reddit's largest humor depository,False,any
1,AskReddit,50382485,r/AskReddit is the place to ask and answer tho...,False,self
2,gaming,44948179,The Number One Gaming forum on the Internet.,False,any
3,worldnews,43556495,"A place for major news from around the world, ...",False,link
4,todayilearned,39198977,You learn something new every day; what did yo...,False,link
5,aww,37356069,"Things that make you go AWW! -- like puppies, ...",False,link
6,Music,35809226,Reddit’s #1 Music Community,False,any
7,memes,35215501,Memes!\n\nA way of describing cultural informa...,False,any
8,movies,34166418,The goal of /r/Movies is to provide an inclusi...,False,any
9,Showerthoughts,33638524,A subreddit for sharing those miniature epipha...,False,self


In [12]:
# Keywords whose presence in a subreddit description is taken as a (heuristic) hint
# of bot-influence potential. They are matched as case-insensitive substrings.
bot_influence_keywords = ['news', 'memes', 'discussion', 'questions', 'share', 'post', 'community']

# Subscriber tiers as (exclusive lower bound, points), ordered from highest to lowest
# so that the first tier a subreddit exceeds decides its points.
_SUBSCRIBER_TIERS = ((10_000_000, 3), (5_000_000, 2), (1_000_000, 1))


def calculate_bot_influence_score(row, keywords=None):
    """Score one subreddit from its subscriber count and description keywords.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide ``row['Subscribers']`` and ``row['Description']``.
    keywords : iterable of str, optional
        Keywords to look for in the description. Defaults to the module-level
        ``bot_influence_keywords``.

    Returns
    -------
    int
        0-3 points for the subscriber tier (more than 1M / 5M / 10M subscribers)
        plus one point for every keyword that occurs as a substring of the
        lower-cased description.

    Notes
    -----
    A missing subscriber count (``None``/``NaN``) earns no tier points, and a
    missing or non-string description (``None``/``NaN``) earns no keyword points,
    instead of raising.
    """
    if keywords is None:
        keywords = bot_influence_keywords

    # Large subscriber base increases potential for bot activity.
    subscribers = row['Subscribers']
    if subscribers is None:
        subscribers = 0  # None cannot be compared with an int; NaN compares False below
    score = next(
        (points for floor, points in _SUBSCRIBER_TIERS if subscribers > floor),
        0,
    )

    # Only real strings can be searched; treat anything else as an empty description.
    description = row['Description']
    if not isinstance(description, str):
        description = ''
    description = description.lower()

    # One point per keyword found anywhere in the description.
    score += sum(1 for keyword in keywords if keyword.lower() in description)

    return score

# Keep only the subreddits whose submission type is 'link' or 'self' (i.e. not 'any'),
# working on a copy so that subs_df itself is left untouched.
single_type_mask = subs_df['Submission Type'].isin(['link', 'self'])
bot_vulnerable_subs = subs_df[single_type_mask].copy()

# Score every remaining subreddit row by row.
bot_vulnerable_subs['bot_score'] = bot_vulnerable_subs.apply(calculate_bot_influence_score, axis=1)

# Report the ten highest-scoring subreddits with just the columns of interest.
report_columns = ['Name', 'Subscribers', 'Submission Type', 'bot_score']
top_vulnerable = bot_vulnerable_subs.nlargest(10, 'bot_score')[report_columns]
print("Top 10 subreddits potentially vulnerable to bot influence:")
print(top_vulnerable)

Top 10 subreddits potentially vulnerable to bot influence:
              Name  Subscribers Submission Type  bot_score
10         science     33418689            link          5
31   UpliftingNews     20188675            link          5
103       AskWomen      5548640            self          5
1        AskReddit     50382485            self          4
3        worldnews     43556495            link          4
5              aww     37356069            link          4
12           Jokes     30343746            self          4
13            news     29220252            link          4
26             Art     22367812            link          4
27          sports     21706259            link          4


In [None]:
import random
# import time

# From the first 100 entries of the popular listing, keep the subreddits whose
# submission type is exactly "self" or "link".
allowed_types = ("self", "link")
eligible_subreddits = [
    candidate
    for candidate in subreddits[:100]
    if candidate.submission_type in allowed_types
]

# Draw 10 of them at random without replacement; when fewer than 10 qualify, use
# them all and say so. No seed is set in this cell, so the draw differs between runs.
sample_size = 10
if len(eligible_subreddits) < sample_size:
    random_subreddits = eligible_subreddits
    print("Warning: Less than 10 subreddits found matching the criteria.")
else:
    random_subreddits = random.sample(eligible_subreddits, sample_size)

# --- POST FETCHING ---
posts_data = []
for subreddit in random_subreddits:
    print(f"Fetching posts from r/{subreddit.display_name}...")
    try:
        # Drain the 'hot' listing completely before recording anything, so that an
        # error while fetching leaves no rows for this subreddit in posts_data.
        # limit=None asks for as many posts as the listing will return.
        subreddit_posts = []
        subreddit_posts.extend(subreddit.hot(limit=None))

        for post in subreddit_posts:
            record = {
                "subreddit": subreddit.display_name,
                "title": post.title,
                # A post without an author is recorded as "[deleted]"
                "author": post.author.name if post.author else "[deleted]",
                "score": post.score,
                "upvote_ratio": post.upvote_ratio,
                "num_comments": post.num_comments,
                "created_utc": post.created_utc,
                "selftext": post.selftext,  # Body text of self-posts
                "url": post.url,  # Target URL of link posts
                "permalink": post.permalink,  # Permalink to the post itself
            }
            posts_data.append(record)
    except Exception as e:
        # Best effort: report the failure and carry on with the next subreddit.
        print(f"   Error fetching posts from r/{subreddit.display_name}: {e}")
        print("   Skipping to the next subreddit...")
    else:
        print(
            f"   Fetched {len(subreddit_posts)} posts from r/{subreddit.display_name}."
        )
    # time.sleep(2)  # Optional delay between subreddits to respect API rate limits

# --- CREATE DATAFRAME ---
# Build the frame once from the accumulated records (one row per post).
posts_df = pd.DataFrame(posts_data)

# --- DISPLAY OR SAVE DATAFRAME ---
print(posts_df)
# Alternatively, persist the result to CSV:
# posts_df.to_csv("reddit_posts.csv", index=False)