In [None]:
import csv
import glob
import os

import pandas as pd
import praw
from textblob import TextBlob

This code fetches data from various Reddit posts and stores them into pandas DataFrame. The primary steps involved are:

- Establish a connection to the Reddit API using the PRAW library and the provided credentials.
- Define a function called `fetch_subreddit_data()`. This function accepts a subreddit name and a list of post URLs. For each post, it records the subreddit, the post title, and the post body, and appends them to a DataFrame. It then calls `fetch_comments()` to add the body of every comment on the post that is not marked `[deleted]`, using an `isinstance` check against `praw.models.Comment` to ensure the replies are indeed comments. (Author, score, and creation time are no longer extracted.)
- Define a list of subreddit names and a dictionary of post URLs for each subreddit.
- Iterate over the subreddit names, fetch data using the `fetch_subreddit_data()` function, and save the output DataFrame to a CSV file named after the subreddit. The `index=False` argument to `to_csv` ensures that the row index is not included in the output CSV file.

In [None]:
# Setting up PRAW with our Reddit app credentials.
# The credentials are read from the environment instead of being written into
# the notebook, so they are not committed or shared along with it. Set
# REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before starting the kernel; a
# missing variable raises KeyError naming the variable.
# NOTE(review): the client id/secret that used to be hard-coded here have been
# exposed in the notebook history and should be revoked and regenerated.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='MoT-Group9',
)

### Functions necessary to retrieve data from pre-selected sub-reddit posts

### They have been trimmed since the first commit to extract fewer fields, so that more post links can be processed.

In [None]:
# Labels a piece of text (e.g. a post title) by the sign of its TextBlob polarity
def get_sentiment(text):
    """Classify ``text`` as 'positive', 'negative' or 'neutral'.

    The label is taken from the sign of the polarity score TextBlob assigns
    to the text: above zero is 'positive', below zero is 'negative', and
    anything else is 'neutral'.
    """
    polarity = TextBlob(text).sentiment.polarity

    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'

In [None]:
def fetch_comments(submission, subreddit_name):
    """Collect the body of every non-deleted comment on a submission.

    Parameters
    ----------
    submission
        Submission object whose ``comments`` forest is expanded and flattened,
        and whose ``title`` is copied onto every row.
    subreddit_name
        Value written to the 'subreddit' column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per kept comment with the columns 'subreddit', 'post_title'
        and 'comment_body'. The frame is empty (but still has these columns)
        when no comment is kept.
    """
    columns = ['subreddit', 'post_title', 'comment_body']

    # Expand every "more comments" placeholder so that the flattened list
    # below is built from the fully loaded comment forest.
    submission.comments.replace_more(limit=None)

    # Gather plain records and build the frame once. DataFrame.append was
    # removed in pandas 2.0, and it copied the whole frame on every call.
    # Only the comment body is kept; id, parent, author, score and creation
    # time are intentionally not extracted.
    rows = [
        {
            'subreddit': subreddit_name,
            'post_title': submission.title,
            'comment_body': comment.body,
        }
        for comment in submission.comments.list()
        # Skip anything that is not a real comment, and comments whose body
        # is the '[deleted]' marker.
        if isinstance(comment, praw.models.Comment) and comment.body != '[deleted]'
    ]

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Creating a function to fetch subreddit data - takes in subreddit name and list of post URLs provided below.
def fetch_subreddit_data(subreddit_name, post_urls):
    """Build one DataFrame holding each post and its comments.

    For every URL the submission is looked up through the module-level
    ``reddit`` client. The post itself contributes one row (its self-text goes
    in the 'comment_body' column), followed by the rows returned by
    ``fetch_comments`` for that submission.

    Parameters
    ----------
    subreddit_name
        Value written to the 'subreddit' column of every row.
    post_urls
        Iterable of submission URLs to fetch.

    Returns
    -------
    pandas.DataFrame
        Columns 'subreddit', 'post_title' and 'comment_body', with a fresh
        0..n-1 index. Empty (but with these columns) when ``post_urls`` is
        empty.
    """
    columns = ['subreddit', 'post_title', 'comment_body']

    # Collect the pieces and concatenate once at the end. DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame on every call.
    frames = []

    # Looping through the provided post URLs
    for post_url in post_urls:
        submission = reddit.submission(url=post_url)

        # Row describing the post itself
        frames.append(pd.DataFrame([{
            'subreddit': subreddit_name,
            'post_title': submission.title,
            'comment_body': submission.selftext,
        }], columns=columns))

        # Rows for the post's comments and their replies. Empty frames are
        # left out so they cannot influence the dtypes of the concatenation.
        post_comments = fetch_comments(submission, subreddit_name)
        if not post_comments.empty:
            frames.append(post_comments)

    if not frames:
        # pd.concat raises on an empty list; return the empty schema instead.
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)

### Applying the functions to extract data

In [None]:
# Example usage: the posts to extract, grouped by the subreddit they belong to
post_urls_for_each_subreddit = {
    'singularity': [
        'https://www.reddit.com/r/singularity/comments/132kgur/i_dont_fear_malicious_or_rogue_ai_i_fear_how_ai/',
        'https://www.reddit.com/r/singularity/comments/12983il/the_reason_i_dont_fear_artificial_intelligence/',
    ],
    'Futurology': [
        'https://www.reddit.com/r/Futurology/comments/9z9g0j/why_do_we_fear_artificial_intelligence_andor/',
    ],
    'ArtificialInteligence': [
        'https://www.reddit.com/r/ArtificialInteligence/comments/17ddumf/what_is_expected_to_gain_from_an_ai_safety_summit/',
    ],
}

# Subreddits are processed in the order their keys appear in the dict above
subreddit_names = list(post_urls_for_each_subreddit)

# Folder (synced with GitHub) that receives every CSV written by this notebook
github_linked_folder = '/work/GitHub_ML_Deepnote/Machine Learning/1. Extracted Reddit Data/'

In [None]:
# Creating CSV files categorising data by subreddit (one file per subreddit)
for subreddit_name in subreddit_names:
    # Post and comment rows for every URL configured for this subreddit
    comments_data = fetch_subreddit_data(
        subreddit_name,
        post_urls_for_each_subreddit[subreddit_name],
    )

    # The file goes into the GitHub-linked folder and is named after the subreddit
    csv_file_path = '{}{}_comments_data.csv'.format(github_linked_folder, subreddit_name)

    # Write the rows without the DataFrame index
    comments_data.to_csv(csv_file_path, encoding='utf-8', index=False)



In [None]:
# Creating a CSV file containing the data for all subreddits
all_dataframes = []

for subreddit_name in subreddit_names:
    # Post and comment rows for every URL configured for this subreddit
    comments_data = fetch_subreddit_data(
        subreddit_name,
        post_urls_for_each_subreddit[subreddit_name],
    )
    all_dataframes.append(comments_data)

    # Per-subreddit file inside the GitHub-linked folder
    csv_file_path = '{}{}_comments_data.csv'.format(github_linked_folder, subreddit_name)
    comments_data.to_csv(csv_file_path, encoding='utf-8', index=False)

# Stack the per-subreddit frames into one frame with a fresh index
all_comments_data = pd.concat(all_dataframes, ignore_index=True)

# Combined file; the leading '0_' is part of the file name
combined_csv_file_path = '{}0_all_comments_data.csv'.format(github_linked_folder)
all_comments_data.to_csv(combined_csv_file_path, encoding='utf-8', index=False)

**TODO:** We will likely want to get rid of `comment_author` and use comment IDs instead.

### Creating a summary of subreddit posts

In [None]:
# Create a summary DataFrame with the total number of comments for each post.
# Every (subreddit, post_title) group contains one row for the post itself
# (added by fetch_subreddit_data) in addition to its comments, so that row is
# subtracted from the group size.
summary_data = (
    all_comments_data
    .groupby(['subreddit', 'post_title'])
    .size()
    .sub(1)
    .reset_index(name='total_comments')
)

# Add the 'url' column based on the predefined URLs.
# Mapping on the subreddit alone would give every post the full list of that
# subreddit's URLs, so each URL is matched to its own post through the
# submission title instead. Reading `.title` makes PRAW fetch the submission,
# i.e. one request per configured URL.
post_url_by_key = {
    (subreddit_name, reddit.submission(url=post_url).title): post_url
    for subreddit_name, post_urls in post_urls_for_each_subreddit.items()
    for post_url in post_urls
}
summary_data['url'] = [
    post_url_by_key.get(key)  # None if a title cannot be matched to a URL
    for key in zip(summary_data['subreddit'], summary_data['post_title'])
]

# Add a 'post_id' column using the index as a unique identifier (001, 002, ...)
summary_data['post_id'] = summary_data.index.map(lambda x: f'{x+1:03d}')

# Apply the get_sentiment function to each post title and store the label
summary_data['post_sentiment'] = summary_data['post_title'].apply(get_sentiment)

# Rearrange the order of columns
summary_data = summary_data[['subreddit', 'post_id', 'post_sentiment', 'post_title', 'total_comments', 'url']]

# Specifying the path for saving the overall summary CSV file (same folder as
# the other CSV files written by this notebook)
overall_summary_csv_file_path = f'{github_linked_folder}0_subreddits_summary.csv'

# Save the overall summary CSV file
summary_data.to_csv(overall_summary_csv_file_path, index=False, encoding='utf-8')

# Display the overall summary data
print(summary_data)

               subreddit post_id post_sentiment  \
0  ArtificialInteligence     001       positive   
1             Futurology     002       negative   
2            singularity     003       negative   
3            singularity     004       negative   

                                          post_title  total_comments  \
0  What is expected to gain from an AI safety sum...              59   
1  Why do we fear artificial intelligence and/or ...              42   
2  I don't fear malicious or rogue AI - I fear ho...             153   
3    The reason I don’t fear artificial intelligence              32   

                                                 url  
0  [https://www.reddit.com/r/ArtificialInteligenc...  
1  [https://www.reddit.com/r/Futurology/comments/...  
2  [https://www.reddit.com/r/singularity/comments...  
3  [https://www.reddit.com/r/singularity/comments...  


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f64215d6-debc-46bd-b273-63565459a66d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>