In [1]:
import csv
import glob
import os
import time

import pandas as pd
import praw
from textblob import TextBlob

This notebook fetches data from a set of Reddit posts and stores it in pandas DataFrames. The primary steps are:

- Establish a connection to the Reddit API using the PRAW library and the provided credentials.
- Define a function called `fetch_subreddit_data()`. This function accepts a subreddit name and a list of post URLs. For each post, it records the subreddit, the post title, and the post body, and appends them to a DataFrame (earlier versions also captured the author, score, and creation time; see the note on the functions below). It then records the same fields for every comment on the post that has not been deleted, using `praw.models.Comment` to make sure each item is an actual comment.
- Define a list of subreddit names and a dictionary that maps each subreddit to its list of post URLs.
- Iterate over the subreddit names, fetch the data with `fetch_subreddit_data()`, and save each resulting DataFrame to a CSV file named after the subreddit. Passing `index=False` to `to_csv` keeps the DataFrame index out of the CSV file.

In [2]:
# Setting up PRAW with our Reddit app credentials.
# The credentials are read from environment variables so that secrets are not
# stored in the notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before
# running this cell; a missing variable raises KeyError here.
# NOTE(review): the client id/secret that used to be hardcoded in this cell
# should be treated as leaked and rotated in the Reddit app settings.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'MoT-Group9'),
)

In [3]:
# Put the client into read-only mode.
# NOTE(review): this flag is not a rate-limit setting; presumably it only
# controls how the client authorizes - confirm against the PRAW docs.
reddit.read_only = True

### Functions necessary to retrieve data from pre-selected sub-reddit posts

### They have been trimmed since the first commit to extract fewer fields per post, so that more post links can be processed.

In [4]:
# This function determines the sentiment of a piece of text (e.g. a post title)
def get_sentiment(text):
    """Label ``text`` by the sign of its TextBlob polarity score.

    Args:
        text: The text to analyse; passed straight to ``TextBlob``.

    Returns:
        ``'positive'`` when the polarity is above zero, ``'negative'`` when it
        is below zero, and ``'neutral'`` otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity

    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'

In [5]:
def fetch_comments(submission, subreddit_name):
    """Collect the non-deleted comments of a submission into a DataFrame.

    Args:
        submission: Submission object whose ``comments`` forest is read. The
            forest is first passed through ``replace_more(limit=None)``
            (presumably expanding every "load more comments" placeholder,
            which can take many API requests - see the PRAW docs).
        subreddit_name: Value written to the ``subreddit`` column of each row.

    Returns:
        DataFrame with the columns ``subreddit``, ``post_title`` and
        ``comment_body``, one row per kept comment, in the order returned by
        ``submission.comments.list()``. When no comment is kept the frame is
        empty but still has these three columns.
    """
    columns = ['subreddit', 'post_title', 'comment_body']
    submission.comments.replace_more(limit=None)

    # Rows are gathered in a list and the DataFrame is built once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    records = [
        {
            'subreddit': subreddit_name,
            'post_title': submission.title,
            'comment_body': comment.body,
        }
        for comment in submission.comments.list()
        # Keep real comments only and skip those whose body is '[deleted]'.
        if isinstance(comment, praw.models.Comment) and comment.body != '[deleted]'
    ]

    return pd.DataFrame(records, columns=columns)

In [6]:
# Creating a function to fetch subreddit data - takes in subreddit name and list of post URLs provided below.
def fetch_subreddit_data(subreddit_name, post_urls):
    """Build one DataFrame holding each post and its comments.

    Args:
        subreddit_name: Value written to the ``subreddit`` column of every row.
        post_urls: Iterable of post URLs; each is resolved with
            ``reddit.submission(url=...)``.

    Returns:
        DataFrame with the columns ``subreddit``, ``post_title`` and
        ``comment_body``. For every URL, in order, there is one row for the
        post itself (its ``selftext`` stored in ``comment_body``) followed by
        the rows produced by ``fetch_comments`` for that post. An empty
        ``post_urls`` yields an empty frame with the same columns.
    """
    columns = ['subreddit', 'post_title', 'comment_body']

    # Rows are accumulated in a list and the DataFrame is built once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    records = []

    # Looping through the provided post URLs
    for post_url in post_urls:
        submission = reddit.submission(url=post_url)

        # The post itself is stored as the first row of its group
        records.append({
            'subreddit': subreddit_name,
            'post_title': submission.title,
            'comment_body': submission.selftext,
        })

        # Followed by its comments and their replies
        records.extend(fetch_comments(submission, subreddit_name).to_dict('records'))

    return pd.DataFrame(records, columns=columns)

In [7]:
#'https://www.reddit.com/r/singularity/comments/12983il/the_reason_i_dont_fear_artificial_intelligence/', 'https://www.reddit.com/r/singularity/comments/wnzsyz/why_everyones_afraid_of_ai/', 'https://www.reddit.com/r/singularity/comments/11i6m4j/what_scenarios_do_you_fear_the_most_in_the_age_of/', 'https://www.reddit.com/r/singularity/comments/x8io8i/why_does_everyone_want_to_fear_ai/', 'https://www.reddit.com/r/singularity/comments/12q02q1/is_anxiety_about_your_future_and_ai_destroying/', 'https://www.reddit.com/r/singularity/comments/11nyzj9/rapid_ai_progress_makes_me_feel_really_anxious/', 'https://www.reddit.com/r/singularity/comments/12doywh/ai_anxiety/', 'https://www.reddit.com/r/singularity/comments/16j8kyb/if_you_worry_about_humanity_you_should_be_more/', 'https://www.reddit.com/r/singularity/comments/11rjyrm/why_has_this_sub_been_filling_up_with_people_who/', 'https://www.reddit.com/r/singularity/comments/14le86c/what_is_everyone_so_scared_of/', 'https://www.reddit.com/r/singularity/comments/8mrnyj/how_frightened_should_we_be_of_ai/', 'https://www.reddit.com/r/singularity/comments/10oos73/im_scared_of_unemployment_in_a_world_ruled_by_ai/'

### Applying the functions to extract data

In [17]:
# Subreddits to process, in processing order. Every name listed here must also
# be a key of `post_urls_for_each_subreddit`, because the fetch step looks the
# URLs up by this name.
subreddit_names = ['singularity', 'Futurology', 'ArtificialInteligence','technology', 'AskReddit', 'privacy', 'RandomThoughts', 'AskEngineers','changemyview', 'Ecommerce', 'ChatGPT', 'YouShouldKnow',  'ArtistLounge', 'CasualConversation', 'Freelance', 'graphic_design', 'GenX', 'intj', 'ControlProblem','FinancialCareers','StableDiffusion', 'INTP'   ]
# Maps each subreddit name to the list of post URLs to fetch for it.
post_urls_for_each_subreddit = {
    'singularity': ['https://www.reddit.com/r/singularity/comments/132kgur/i_dont_fear_malicious_or_rogue_ai_i_fear_how_ai/'],
    'Futurology': ['https://www.reddit.com/r/Futurology/comments/9z9g0j/why_do_we_fear_artificial_intelligence_andor/'],
    'ArtificialInteligence': ['https://www.reddit.com/r/ArtificialInteligence/comments/17ddumf/what_is_expected_to_gain_from_an_ai_safety_summit/'], 
    'technology': ['https://www.reddit.com/r/technology/comments/15wnpav/police_in_england_installed_an_ai_camera_system/'], 
    'AskReddit': ['https://www.reddit.com/r/AskReddit/comments/135ixrr/how_scared_are_you_of_ai_replacing_your_career/'], 
    'privacy': ['https://www.reddit.com/r/privacy/comments/7itwrl/facial_recognition_for_public_surveillance_is/'], 
    'RandomThoughts': ['https://www.reddit.com/r/RandomThoughts/comments/125y0vu/ai_is_going_to_ruin_so_many_peoples_lives_so/'], 
    'AskEngineers': ['https://www.reddit.com/r/AskEngineers/comments/13xgz8z/whats_with_the_ai_fear/'], 
    'changemyview': ['https://www.reddit.com/r/changemyview/comments/84exk2/cmv_the_ai_scare_is_rooted_in_sensational_fear/'],
    'Ecommerce': ['https://www.reddit.com/r/ecommerce/comments/13ojidl/be_honest_are_you_scared_of_ai_taking_over_your/'], 
    'ChatGPT': ['https://www.reddit.com/r/ChatGPT/comments/135z6jw/what_are_ai_developers_seeing_privately_that_they/'], 
    'YouShouldKnow': ['https://www.reddit.com/r/YouShouldKnow/comments/120m3f6/ysk_the_future_of_monitoring_how_large_language/'], 
    'ArtistLounge': ['https://www.reddit.com/r/ArtistLounge/comments/1701von/what_scares_me_the_most_about_ai_art_is_that_it/'], 
    'CasualConversation': ['https://www.reddit.com/r/CasualConversation/comments/121vk3k/my_parents_have_always_said_new_technology_scares/'], 
    'Freelance': ['https://www.reddit.com/r/freelance/comments/11wekfi/anxiety_around_ai/'], 
    'graphic_design': ['https://www.reddit.com/r/graphic_design/comments/wre0zb/am_i_the_only_one_scared_from_ai_replacing_his_job/'], 
    'GenX': ['https://www.reddit.com/r/GenX/comments/17fl2n1/anyone_else_low_key_terrified_of_ais_impact_on/'], 
    'intj': ['https://www.reddit.com/r/intj/comments/11mxwjy/is_the_future_of_ai_terrifying_to_anybody_else/'], 
    'ControlProblem': ['https://www.reddit.com/r/ControlProblem/comments/189vy8r/terrified_about_ai_and_agiasi/'], 
    'FinancialCareers':['https://www.reddit.com/r/FinancialCareers/comments/122c3qk/im_so_scared_of_ai/'], 
    'StableDiffusion': ['https://www.reddit.com/r/StableDiffusion/comments/16q0hun/seriously_whats_with_the_rampant_aiphobia_that/'], 
    'INTP': ['https://www.reddit.com/r/INTP/comments/12xpv9e/im_honestly_a_little_baffled_as_to_why_theres/']
}

# Folder that receives every generated CSV file. The trailing slash matters:
# file paths are built from this value by plain f-string concatenation.
github_linked_folder = '/work/GitHub_ML_Deepnote/Machine Learning/1. Extracted Reddit Data/'

### Intervention

In [18]:
# Function to make a Reddit API request
def make_reddit_api_request(subreddit_name):
    """Fetch one subreddit's data, save it as CSV, and throttle the calls.

    Throttling relies on three module-level variables that must exist before
    the first call:

    - ``queries_per_minute``: maximum number of calls per 60-second window.
    - ``query_count``: calls made so far in the current window.
    - ``last_query_time``: ``time.time()`` stamp marking the start of the
      current window.

    A window runs for 60 seconds from ``last_query_time``. When it has
    expired the counter is reset; when the counter has reached the limit
    inside the window the function sleeps until the window ends and then
    starts a new one. (Previously the stamp was overwritten after every call
    and the counter was only reset after a sleep, so once the limit had been
    reached the window logic no longer described a real per-minute budget.)

    NOTE(review): one call here counts as a single "query", while
    ``fetch_subreddit_data`` handles several posts and their comments per
    call, so the number of HTTP requests is presumably much higher than
    ``query_count`` - this throttle alone may not prevent HTTP 429 responses.

    Args:
        subreddit_name: Key into ``post_urls_for_each_subreddit``; also used
            to name the output file ``<subreddit_name>_comments_data.csv``
            inside ``github_linked_folder``.

    Raises:
        KeyError: If ``subreddit_name`` has no entry in
            ``post_urls_for_each_subreddit``.
    """
    global query_count, last_query_time

    window_seconds = 60

    # Time elapsed since the current window started
    time_elapsed = time.time() - last_query_time

    if time_elapsed >= window_seconds:
        # The window is over: start a fresh one without waiting
        query_count = 0
        last_query_time = time.time()
    elif query_count >= queries_per_minute:
        # Budget used up inside the window: wait for the window to end
        time.sleep(window_seconds - time_elapsed)
        query_count = 0
        last_query_time = time.time()

    # Make the API request here
    comments_data = fetch_subreddit_data(subreddit_name, post_urls_for_each_subreddit[subreddit_name])

    # Specify the path for saving CSV files within the GitHub-linked folder
    csv_file_path = f'{github_linked_folder}{subreddit_name}_comments_data.csv'

    # Save the CSV file
    comments_data.to_csv(csv_file_path, index=False, encoding='utf-8')

    # Count this call against the current window
    query_count += 1


In [16]:
# Throttle settings and counters read by make_reddit_api_request
queries_per_minute = 100
query_count = 0
last_query_time = time.time()

# Application: fetch every subreddit and write one CSV file per subreddit
for subreddit_name in subreddit_names:
    make_reddit_api_request(subreddit_name)

# Read the per-subreddit CSV files back in, in the same order
all_dataframes = [
    pd.read_csv(f'{github_linked_folder}{name}_comments_data.csv')
    for name in subreddit_names
]

# Stack them into a single DataFrame with a fresh 0..n-1 index
all_comments_data = pd.concat(all_dataframes, ignore_index=True)

TooManyRequests: received 429 HTTP response

In [11]:
# The combined CSV is written to the same folder as the per-subreddit files
combined_csv_file_path = f'{github_linked_folder}0_all_comments_data.csv'

# Write all rows out, without the DataFrame index
all_comments_data.to_csv(
    path_or_buf=combined_csv_file_path,
    encoding='utf-8',
    index=False,
)

### End of intervention

### Creating a summary of subreddit posts

In [12]:
# Create a summary DataFrame with one row per (subreddit, post title) pair.
# NOTE(review): size() counts every row of the group, including the row that
# fetch_subreddit_data adds for the post itself, so 'total_comments' is
# presumably the number of comments plus one - confirm whether that is intended.
summary_data = all_comments_data.groupby(['subreddit', 'post_title']).size().reset_index(name='total_comments')

# Add the 'url' column based on the predefined URLs.
# NOTE(review): the lookup is keyed by subreddit only, so each row receives the
# whole URL list of its subreddit rather than the URL of that specific post.
summary_data['url'] = summary_data['subreddit'].map(post_urls_for_each_subreddit)

# Add a 'post_id' column: the 1-based row position, zero-padded to three digits
summary_data['post_id'] = summary_data.index.map(lambda x: f'{x+1:03d}')

# Classify each post title with get_sentiment and store the label
summary_data['post_sentiment'] = summary_data['post_title'].apply(get_sentiment)

# Rearrange the order of columns
summary_data = summary_data[['subreddit', 'post_id', 'post_sentiment', 'post_title', 'total_comments', 'url']]

# Path of the overall summary CSV file. It is built from github_linked_folder,
# like every other output path, so that the folder is defined in one place only.
overall_summary_csv_file_path = f'{github_linked_folder}0_subreddits_summary.csv'

# Save the overall summary CSV file (explicit encoding, matching the other CSV writes)
summary_data.to_csv(overall_summary_csv_file_path, index=False, encoding='utf-8')

# Display the overall summary data
print(summary_data)

               subreddit post_id post_sentiment  \
0  ArtificialInteligence     001        neutral   
1  ArtificialInteligence     002        neutral   
2  ArtificialInteligence     003       positive   
3             Futurology     004       positive   
4             Futurology     005       positive   
5             Futurology     006       positive   
6             Futurology     007       negative   
7             Futurology     008       negative   
8            singularity     009       negative   

                                          post_title  total_comments  \
0                      "AI is gonna ruin the world!"             100   
1  I think AI should be removed from public use b...              39   
2  What is expected to gain from an AI safety sum...              59   
3  AI fear-mongering is irrational panic, and it’...             423   
4  An Honest admission, I fear the upcoming techn...             755   
5  Is doomsday talk about AI a result of billiona...     

In [13]:
# Number of distinct subreddits present in the summary (missing values are not counted)
subreddit_column = summary_data['subreddit']
unique_subreddits_count = subreddit_column.nunique(dropna=True)
print(f'Total unique subreddits: {unique_subreddits_count}')

Total unique subreddits: 3


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f64215d6-debc-46bd-b273-63565459a66d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>