In [1]:
# %pip install praw
# %pip install google-generativeai
# %pip install ipywidgets

In [31]:
import google.generativeai as genai
import os

# Authenticate against the Gemini API. The key is read from the environment so
# it never has to be written into the notebook.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Decoding settings, all pinned to their most deterministic values so that
# repeated runs of the notebook produce answers that are as stable as possible.
generation_config = dict(
    temperature=0,  # randomness of sampling; 0 always picks the highest-probability token
    top_p=0,  # nucleus sampling cut-off; only matters once temperature > 0
    top_k=1,  # number of candidate tokens considered; only matters once temperature > 0
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# Model handle shared by the later cells that talk to Gemini.
model = genai.GenerativeModel(
    model_name="gemini-exp-1206",
    generation_config=generation_config,
)

In [3]:
import praw
import pandas as pd

# Build the authenticated Reddit client. Every credential is looked up in the
# environment (PRAW_CLIENT_ID, PRAW_CLIENT_SECRET, PRAW_USER_AGENT,
# PRAW_USERNAME, PRAW_PASSWORD); a missing variable raises KeyError here.
reddit = praw.Reddit(**{
    option: os.environ[f"PRAW_{option.upper()}"]
    for option in ("client_id", "client_secret", "user_agent", "username", "password")
})

# Pull a large slice of Reddit's "popular" listing. With a limit this high the
# result is meant to approximate the biggest subreddits by subscriber count
# (cross-check: https://gummysearch.com/tools/top-subreddits/).
subreddits = list(reddit.subreddits.popular(limit=1000))

# DataFrame column -> attribute read from each subreddit object.
column_to_attribute = {
    "Name": "display_name",
    "Subscribers": "subscribers",
    "Description": "public_description",
    "Over 18": "over18",
    "Submission Type": "submission_type",
}

# Build all rows first and construct the frame once, then order it from the
# most to the least subscribed subreddit with a fresh 0..n-1 index.
subreddit_rows = [
    {column: getattr(subreddit, attribute) for column, attribute in column_to_attribute.items()}
    for subreddit in subreddits
]
subs_df = pd.DataFrame(subreddit_rows).sort_values(by="Subscribers", ascending=False, ignore_index=True)

# Show the ten largest subreddits
subs_df.head(10)

Unnamed: 0,Name,Subscribers,Description,Over 18,Submission Type
0,funny,65915170,Reddit's largest humor depository,False,any
1,AskReddit,50406011,r/AskReddit is the place to ask and answer tho...,False,self
2,gaming,44958493,The Number One Gaming forum on the Internet.,False,any
3,worldnews,43570215,"A place for major news from around the world, ...",False,link
4,todayilearned,39208843,You learn something new every day; what did yo...,False,link
5,aww,37359700,"Things that make you go AWW! -- like puppies, ...",False,link
6,Music,35820412,Reddit’s #1 Music Community,False,any
7,memes,35219610,Memes!\n\nA way of describing cultural informa...,False,any
8,movies,34170973,The goal of /r/Movies is to provide an inclusi...,False,any
9,Showerthoughts,33646166,A subreddit for sharing those miniature epipha...,False,self


In [52]:
import ast

chat_session = model.start_chat()

# Ask Gemini for a flat Python list of keywords that hint at controversial,
# bot-prone communities.
response = chat_session.send_message("What are some keywords I can use to create a list of subreddits which are likely to be influenced by bots because of their controversial nature? These are keywords that I would look for within a subreddit's name or description. For example: \"news\", \"politics\", \"discussion\", \"war\", \"vaccines\", \"controversial\", \"conflict\", etc.\n\nKeep the answer short, only including 50 keywords and saving them in a python list as follows [\"key1\",\"key2\",...]. Send the output as text not as code.")

reply_text = response.candidates[0].content.parts[0].text

# The prompt asks for a bare list, but a reply may still arrive wrapped in a
# Markdown code fence or with a sentence of preamble. Parsing only the span from
# the first '[' to the last ']' keeps those wrappers from breaking literal_eval.
list_start, list_end = reply_text.find("["), reply_text.rfind("]")
if list_start == -1 or list_end < list_start:
    raise ValueError(f"No Python list found in the model reply: {reply_text!r}")
raw_keywords = ast.literal_eval(reply_text[list_start:list_end + 1].replace("\n", ""))

# Normalise the keywords: the scoring step compares them against lower-cased
# names and descriptions, so they must be lower-case themselves. Blank or
# non-string entries are dropped and duplicates removed (first occurrence wins)
# so that no keyword is counted twice.
bot_influence_keywords = list(dict.fromkeys(
    entry.strip().lower()
    for entry in raw_keywords
    if isinstance(entry, str) and entry.strip()
))

# Print the keywords five per line for a quick visual check.
for i in range(0, len(bot_influence_keywords), 5):
    print(*bot_influence_keywords[i:i+5])

news politics discussion war vaccines
controversial conflict debate opinion world
current events election government policy social issues
conspiracy exposed truth censorship freedom
rights activism protest revolution change
reform corruption scandal crime justice
law police military security surveillance
privacy technology media propaganda bias
fake news disinformation misinformation manipulation control
power ideology culture society


In [51]:
# Score subreddits based on subscribers and keywords in description
def _as_lower_text(value):
    """Return `value` lower-cased when it is a string, otherwise an empty string."""
    return value.lower() if isinstance(value, str) else ""


def calculate_bot_influence_score(row, keywords=None):
    """
    Compute a heuristic "bot influence" score for one subreddit.

    The score is the sum of:
      * a size bonus: 5 above 10M subscribers, 4 above 5M, 3 above 1M, else 0;
      * 1 point for every keyword contained in the description;
      * 1 point for every keyword contained in the subreddit name.

    Matching is case-insensitive on the row side (name and description are
    lower-cased) and uses plain substring containment.
    NOTE(review): substring containment means e.g. "war" also matches
    "software"; confirm whether whole-word matching is wanted for descriptions.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'Subscribers', 'Description' and 'Name'.
        keywords: Iterable of keyword strings. When omitted, the notebook-level
            `bot_influence_keywords` list is used, which keeps
            `subs_df.apply(calculate_bot_influence_score, axis=1)` working.

    Returns:
        The integer score.
    """
    if keywords is None:
        keywords = bot_influence_keywords

    # A missing subscriber count (None / NaN / non-numeric) earns no size bonus
    # instead of raising.
    try:
        subscribers = float(row['Subscribers'])
    except (TypeError, ValueError):
        subscribers = 0.0
    if subscribers != subscribers:  # NaN is the only value not equal to itself
        subscribers = 0.0

    score = 0

    # Large subscriber base increases potential for bot activity.
    # Tiers are ordered from largest to smallest; only the first match counts.
    for threshold, bonus in ((10000000, 5), (5000000, 4), (1000000, 3)):
        if subscribers > threshold:
            score += bonus
            break

    # Check for keywords in description and subreddit name; a missing
    # description or name is treated as empty text.
    description = _as_lower_text(row['Description'])
    sub_name = _as_lower_text(row['Name'])
    for keyword in keywords:
        if keyword in description:
            score += 1
        if keyword in sub_name:
            score += 1

    return score

# Score every subreddit; the function receives one row at a time (axis=1).
subs_df['Bot Score'] = subs_df.apply(calculate_bot_influence_score, axis=1)

# Keep the ten highest-scoring subreddits, reduced to the columns worth
# reporting and renumbered from 0.
report_columns = ['Name', 'Subscribers', 'Submission Type', 'Bot Score']
top_vulnerable = (
    subs_df
    .nlargest(10, 'Bot Score')
    .loc[:, report_columns]
    .reset_index(drop=True)
)
top_vulnerable

Unnamed: 0,Name,Subscribers,Submission Type,Bot Score
0,worldnews,43570215,link,9
1,news,29224905,link,9
2,technology,17726644,any,9
3,science,33423652,link,8
4,CryptoCurrency,9372102,any,8
5,politics,8695025,link,8
6,movies,34170973,any,7
7,askscience,25977825,self,7
8,books,25660597,any,7
9,AmItheAsshole,23088653,self,7


In [9]:
# 5. Function to Fetch Posts and Comments
def fetch_posts_and_comments(subreddit_name, num_posts=100, num_comments=100):
    """
    Collect the newest posts of a subreddit together with their top-level comments.

    Uses the notebook-level `reddit` client. Any exception raised while
    listing or reading posts is printed and ends the collection early; the
    posts gathered up to that point are still returned.

    Args:
        subreddit_name: Name of the subreddit to read from.
        num_posts: Upper bound on the number of posts requested.
        num_comments: Upper bound on the number of top-level comments kept per post.

    Returns:
        A list with one dict per post. Each dict holds the post's fields plus a
        "comments" key containing a list of comment dicts.
    """
    subreddit = reddit.subreddit(subreddit_name)
    collected_posts = []

    try:
        # `.new` lists the most recent submissions; use `.hot`, `.rising`, etc.
        # instead for a different listing.
        for post in subreddit.new(limit=num_posts):
            post_record = dict(
                subreddit=subreddit_name,
                post_id=post.id,
                post_title=post.title,
                post_author=str(post.author),
                post_score=post.score,
                post_upvote_ratio=post.upvote_ratio,
                post_url=post.url,
                post_selftext=post.selftext,
                post_created_utc=post.created_utc,
                comments=[],
            )

            # limit=0 discards the "more comments" placeholders without
            # expanding them, so the loop below only sees real comments.
            post.comments.replace_more(limit=0)

            # Keep at most `num_comments` top-level comments.
            for position, comment in enumerate(post.comments):
                if position >= num_comments:
                    break
                post_record["comments"].append(dict(
                    comment_id=comment.id,
                    comment_author=str(comment.author),
                    comment_body=comment.body,
                    comment_score=comment.score,
                    comment_created_utc=comment.created_utc,
                ))

            # Only posts that were read completely end up in the result.
            collected_posts.append(post_record)

            # NOTE(review): no explicit pause between posts; presumably the
            # client's own rate limiting is relied on. Confirm before scaling up.

    except Exception as e:
        print(f"Error fetching data from r/{subreddit_name}: {e}")

    return collected_posts

# 6. Main Data Collection Loop
# Per-subreddit fetch limits; adjust numbers as needed.
fetch_limits = {"num_posts": 50, "num_comments": 50}

# One flat list of post dicts across all selected subreddits.
all_data = []
for subreddit_name in top_vulnerable['Name']:
    print(f"Fetching data from r/{subreddit_name}...")
    all_data += fetch_posts_and_comments(subreddit_name, **fetch_limits)

# 7. Convert to DataFrame
reddit_data_df = pd.DataFrame(all_data)

# Long-format comment table: one row per comment. Each comment dict is tagged
# in place with the id of the post it belongs to, so the relationship survives
# once the comments live in their own frame.
comments_data = []
for post in all_data:
    for comment in post['comments']:
        comment['post_id'] = post['post_id']
        comments_data.append(comment)
comments_df = pd.DataFrame(comments_data)

# Wide-format post table: the single 'comments' list column is replaced by one
# column per comment position (comment_0, comment_1, ...). Each cell holds that
# comment's dict, which by now also carries the 'post_id' added above.
post_columns = reddit_data_df.drop(columns=['comments'])
comment_columns = pd.DataFrame(reddit_data_df['comments'].tolist()).add_prefix('comment_')
reddit_data_df = pd.concat([post_columns, comment_columns], axis=1)

# 8. Save to CSV (or other format)
for frame, csv_path in (
    (reddit_data_df, "reddit_posts_and_comments.csv"),
    (comments_df, "comments.csv"),
):
    frame.to_csv(csv_path, index=False)

print("Data collection complete!")

Fetching data from r/worldnews...
Fetching data from r/news...
Fetching data from r/AmItheAsshole...
Fetching data from r/Art...
Fetching data from r/sports...
Fetching data from r/personalfinance...
Fetching data from r/UpliftingNews...
Fetching data from r/politics...
Fetching data from r/stocks...
Fetching data from r/AskReddit...
Data collection complete!
