# Getting Started with PRAW: The Python Reddit API Wrapper
- Documentation can be found here: https://praw.readthedocs.io/en/stable/

In [1]:
import praw

import os

# Set up your Reddit application credentials.
# The secrets are read from the environment instead of being written into the
# notebook, so they are never committed or shared along with the file.
# Export REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before running this cell;
# a missing variable raises KeyError here rather than failing later on the
# first API request.
# NOTE(review): any credentials that were previously stored in this file
# should be treated as leaked and regenerated in the Reddit app settings.
client_id = os.environ["REDDIT_CLIENT_ID"]
client_secret = os.environ["REDDIT_CLIENT_SECRET"]
# The user agent is not a secret; "User" stays as the fallback value.
user_agent = os.environ.get("REDDIT_USER_AGENT", "User")

# Authenticate with the Reddit API
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)

In [11]:
# Define a function to scrape hot posts (10 by default) from any subreddit
def scrape_posts(subreddit_name, limit=10):
    """Collect basic metadata for the hot posts of a subreddit.

    Args:
        subreddit_name: Name of the subreddit, without the ``r/`` prefix.
        limit: Maximum number of hot submissions to request.

    Returns:
        A list of dicts with the keys ``title``, ``author``, ``score``,
        ``url`` and ``created_utc``, one per submission, in listing order.
        ``author`` is the author's user name, or ``"[deleted]"`` when the
        submission no longer has an author.
    """
    subreddit = reddit.subreddit(subreddit_name)
    posts = []

    for submission in subreddit.hot(limit=limit):
        # ``submission.author`` is None for posts whose account was deleted;
        # reading ``.name`` on it would abort the whole scrape.
        author = submission.author
        post = {
            "title": submission.title,
            "author": author.name if author is not None else "[deleted]",
            "score": submission.score,
            "url": submission.url,
            "created_utc": submission.created_utc,
        }
        posts.append(post)

    return posts

# Pull the default number of hot posts from r/flask
subreddit_name = "flask"
posts = scrape_posts(subreddit_name=subreddit_name)

# Show each post as a numbered headline followed by an indented byline
for i, post in enumerate(posts, start=1):
    headline = f"{i}. {post['title']} (Score: {post['score']})"
    byline = f"   by {post['author']} - {post['url']}\n"
    print(headline)
    print(byline)


1. A Compilation of the Best Flask Tutorials for Beginners (Score: 229)
   by gandhiN - https://www.reddit.com/r/flask/comments/pqjq9d/a_compilation_of_the_best_flask_tutorials_for/

2. Flask is Great! (Score: 52)
   by the_nine_muses_9 - https://www.reddit.com/r/flask/comments/10sevmy/flask_is_great/

3. [AF] Minimum Viable EC2 Instance Type? (Score: 4)
   by charliegriefer - https://www.reddit.com/r/flask/comments/11wu7xi/af_minimum_viable_ec2_instance_type/

4. Trying to pass a date but instead it gets calculated (Score: 2)
   by Parazitul - https://www.reddit.com/r/flask/comments/11wq8p3/trying_to_pass_a_date_but_instead_it_gets/

5. What is the correct method to re-use a navbar across multiple pages? (Score: 4)
   by Zestyclose_Car1088 - https://www.reddit.com/r/flask/comments/11wjrtr/what_is_the_correct_method_to_reuse_a_navbar/

6. Get values from a dropdown button (Score: 3)
   by Asynchronousx - https://www.reddit.com/r/flask/comments/11wivtp/get_values_from_a_dropdown_button/

## Scraping the main text of posts

In [12]:
def scrape_subreddit(subreddit_name, num_posts):
    """Print the title, URL, score and body text of a subreddit's hot posts.

    Args:
        subreddit_name: Name of the subreddit, without the ``r/`` prefix.
        num_posts: Maximum number of hot submissions to request.

    Returns:
        None. The fields are written to standard output, one labelled line
        per field, with blank lines separating consecutive posts.
    """
    for submission in reddit.subreddit(subreddit_name).hot(limit=num_posts):
        print(f"Title: {submission.title}")
        print(f"URL: {submission.url}")
        print(f"Score: {submission.score}")
        # The trailing "\n" plus the extra print below leave a visible gap
        # between one post's text and the next post's title.
        print(f"Text: {submission.selftext}\n")
        print("\n")

In [13]:
# Subreddit of interest and the number of posts to fetch from it
subreddit_name, num_posts = "flask", 10
scrape_subreddit(subreddit_name=subreddit_name, num_posts=num_posts)

Title: A Compilation of the Best Flask Tutorials for Beginners
URL: https://www.reddit.com/r/flask/comments/pqjq9d/a_compilation_of_the_best_flask_tutorials_for/
Score: 224
Text: I have made a list of the [best Flask tutorials](https://medium.com/quick-code/top-online-tutorials-to-learn-flask-python-c2723df5326c) for beginners to learn web development. Beginners will benefit from it.



Title: Flask is Great!
URL: https://www.reddit.com/r/flask/comments/10sevmy/flask_is_great/
Score: 52
Text: I just wanted to say how much I love having a python backend with flask. I have a background in python from machine learning. However, I am new to backend development outside of PHP and found flask to be intuitive and overall very easy to implement. I've already been able to integrate external APIs like Chatgpt into web applications with flask, other APIs, and build my own python programs. Python has been such a useful tool for me I'm really excited to see what flask can accomplish!



Title: [AF]

In [19]:
import csv

def scrape_subreddit(subreddit_name, num_posts):
    """Gather the newest posts of a subreddit as a list of dictionaries.

    Args:
        subreddit_name: Name of the subreddit, without the ``r/`` prefix.
        num_posts: Maximum number of recent submissions to request.

    Returns:
        A list with one dict per submission, in listing order, holding the
        keys ``Title``, ``Score``, ``URL`` and ``Text``.
    """
    newest = reddit.subreddit(subreddit_name).new(limit=num_posts)

    # One record per submission; the keys match the CSV column names used
    # when the records are saved.
    return [
        {
            'Title': submission.title,
            'Score': submission.score,
            'URL': submission.url,
            'Text': submission.selftext,
        }
        for submission in newest
    ]

def save_posts_to_csv(posts, file_name):
    """Write post records to a UTF-8 CSV file with a header row.

    Args:
        posts: Iterable of dicts keyed by ``Title``, ``Score``, ``URL`` and
            ``Text``.
        file_name: Path of the CSV file to create or overwrite.

    Returns:
        None.
    """
    columns = ['Title', 'Score', 'URL', 'Text']

    # newline='' hands line-ending control to the csv module.
    with open(file_name, 'w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(posts)


# Run configuration: subreddit of interest, number of posts to fetch,
# and the CSV file that receives them
subreddit_name, num_posts = "flask", 100
output_file = "reddit_posts.csv"

# Fetch the posts, then persist them
post_data = scrape_subreddit(subreddit_name=subreddit_name, num_posts=num_posts)
save_posts_to_csv(posts=post_data, file_name=output_file)

# PerspectiveAPI Analyses

## Set up PerspectiveAPI

In [24]:
from googleapiclient import discovery
import json

import os

# API key for the 'Perspective Classifier' Google Cloud project.
# It is read from the PERSPECTIVE_API_KEY environment variable so the secret
# is not stored in the notebook; a missing variable raises KeyError here.
# NOTE(review): a key that was previously stored in this file should be
# treated as leaked and rotated in the Google Cloud console.
API_KEY = os.environ["PERSPECTIVE_API_KEY"]

# Build the Comment Analyzer (Perspective API) client from its discovery document
client = discovery.build(
    "commentanalyzer",
    "v1alpha1",
    developerKey=API_KEY,
    discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    static_discovery=False,
)

## Functions to analyze the toxicity of the text in the scraped subreddit file

In [48]:
import pandas as pd
import time # to slow down API requests

def analyze_text_toxicity(text):
    """Request a TOXICITY score for one piece of text.

    Args:
        text: The comment text to send to the analyzer.

    Returns:
        The ``value`` of the TOXICITY ``summaryScore`` found in the response.
    """
    payload = {
        'comment': {'text': text},
        'requestedAttributes': {'TOXICITY': {}},
    }
    result = client.comments().analyze(body=payload).execute()

    # Only the overall summary score for the requested attribute is used.
    summary = result['attributeScores']['TOXICITY']['summaryScore']
    return summary['value']

def analyze_posts_toxicity(dataframe, input_file, output_file, delay_seconds=1.0):
    """Score the ``Text`` column of a posts table and save the result as CSV.

    Each row with non-empty text is sent to ``analyze_text_toxicity``. The
    scores are stored in a ``Toxicity`` column of ``dataframe``, which is
    modified in place, and the table is then written to ``output_file``.
    Rows whose text is missing or blank are not sent and keep whatever
    ``Toxicity`` value they had (NaN when the column is new).

    Args:
        dataframe: Table of posts with a ``Text`` column. When None, the
            table is loaded from ``input_file``.
        input_file: Path of the posts CSV; only read when ``dataframe`` is
            None.
        output_file: Path of the CSV file to write.
        delay_seconds: Pause between consecutive API requests, used to stay
            under the API's request quota.

    Returns:
        None.
    """
    # Score the same table that receives the results, so row labels always
    # line up; the file is only a fallback source.
    if dataframe is None:
        dataframe = pd.read_csv(input_file)

    toxicity_values = []
    valid_indices = []

    for index, row in dataframe.iterrows():
        text = row['Text']

        # The analyzer needs a non-empty string: skip missing values first,
        # then coerce so a cell parsed as a number cannot break .strip().
        if pd.isna(text):
            continue
        text = str(text)
        if not text.strip():
            continue

        # Throttle only between requests; no pause is needed before the
        # first one or after the last one.
        if valid_indices and delay_seconds > 0:
            time.sleep(delay_seconds)

        toxicity_values.append(analyze_text_toxicity(text))
        valid_indices.append(index)

    # Create the column explicitly so it exists even when no row was scored.
    if 'Toxicity' not in dataframe.columns:
        dataframe['Toxicity'] = float('nan')
    if valid_indices:
        dataframe.loc[valid_indices, 'Toxicity'] = toxicity_values

    # Save the updated table without the index column
    dataframe.to_csv(output_file, index=False)


### Notes on the above function:
We had to do a couple of things for this to work:
1) I had to ignore rows/observations that were empty (NaN) since the toxicity analyzer needs non-empty strings.

2) Moreover, because of Perspective API's [quota limit](https://developers.perspectiveapi.com/s/about-the-api-limits-and-errors?language=en_US), I had to throttle my requests to at most one per second; otherwise I would exceed the quota.

## Analyze toxicity of Reddit posts

In [50]:
# Source CSV of scraped posts and destination CSV for the scored posts
input_file, output_file = "reddit_posts.csv", "reddit_posts_toxicity.csv"

# Read the scraped posts into a pandas DataFrame
dataframe = pd.read_csv(input_file)

# Score the posts and write the result to the output file
analyze_posts_toxicity(
    dataframe=dataframe,
    input_file=input_file,
    output_file=output_file,
)

In [35]:
# Reload the scraped posts for inspection
df = pd.read_csv(filepath_or_buffer=input_file)

In [45]:
# Report the table's dimensions: rows first, then columns
n_rows = df.shape[0]
n_cols = df.shape[1]
print(n_rows)
print(n_cols)

100
4
