# Sentiment Analysis - Part I (Web Scraping)

In [1]:
import json
import os
import pandas as pd
import praw
import time
from datetime import datetime, timezone
from langdetect import detect
from prawcore.exceptions import RequestException # To catch rate limit exceptions

In [2]:
# Import the config file
config_file = "config.json"
if os.path.exists(config_file):
    with open(config_file) as file:
        config = json.load(file)
        if 'client_id' in config:
            client_id = config['client_id']
            client_secret = config['client_secret']
            user_agent = config['user_agent']
            username = config['username']
            password = config['password']
        else:
            raise KeyError("API credentials not found in config file.")
else:
    raise FileNotFoundError(f"Config file {config_file} not found")

In [3]:
# Initialize the Reddit API client
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
    username=username,
    password=password,
)

In [12]:
# Scraping subreddit 'queensgambit'

# Define subreddits and query
subreddit = 'queensgambit'

# Create a set to store already-seen content
seen_posts = set()

# Initialize counters
post_counter = 0
comment_counter = 0

# Initialize list to store posts
posts = []

# Fetch posts from the subreddit
subreddit = reddit.subreddit(subreddit)
subreddit_posts = subreddit.new(limit=15000)

# Scraping loop
for submission in subreddit_posts:
    try:
        # Check if the post has already been seen
        post_identifier = submission.selftext.strip().lower()

        # If the post hasn't been seen, add it
        if post_identifier not in seen_posts:
            seen_posts.add(post_identifier)
            # Convert unix timestamp to timezone-aware datetime
            created_time = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
            created_time_str = created_time.strftime('%Y-%m-%d %H:%M:%S')

            # Store post information
            posts_data = {
                "title": submission.title,
                "id": submission.id,
                "text": submission.selftext,
                "score": submission.score,
                "url": submission.url,
                "created": created_time_str,
                "num_comments": submission.num_comments,
                "comments": [],                              # Placeholder for comments
            }

            # Fetch comments
            submission.comments.replace_more(limit=0)        # Replace MoreComments objects with actual comments
            for comment in submission.comments.list():

                    # Check if the comment has been seen
                    comment_identifier = comment.body.strip().lower()
                    if comment_identifier not in seen_posts:
                        seen_posts.add(comment_identifier)

                        # Convert unix timestamp to timezone-aware datetime
                        comment_time = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)
                        comment_time_str = comment_time.strftime('%Y-%m-%d %H:%M:%S')
                        
                        # Check, if comment has an author
                        if comment.author:
                            author = comment.author.name
                            try:
                                author_karma = comment.author.comment_karma
                            except AttributeError:
                                author_karma = None
                        else:
                            author = 'Deleted'
                            author_karma = None
                        
                        # Store comment information
                        comment_data = {
                            "comment_id": comment.id,
                            "post_id": submission.id,
                            "comment_text": comment.body,
                            "comment_score": comment.score,
                            "comment_created": comment_time_str,
                            "comment_author": author,
                            "comment_author_karma": author_karma,
                        }

                        # Append comment data to the post
                        posts_data["comments"].append(comment_data)

                        # Increment comment counter
                        comment_counter += 1

            # Append post data to the list
            posts.append(posts_data)

            # Increment post counter
            post_counter += 1

            # Print progress
            print(f"Processed {post_counter} posts and {comment_counter} comments (Total: {len(seen_posts)}).", end='\r')
        
    except prawcore.exceptions.TooManyRequests as e:
        print(f"Rate limit error occurred: {e}")
        print("Waiting before retrying...")
        time.sleep(60)
    
    # Save the data to a JSON file
    with open("./data/reddit_data.json", "w") as outfile:
        json.dump(posts, outfile, indent=4)
    
    # Sleep for 2 seconds to avoid hitting the Reddit API rate limit
    time.sleep(2)


Processed 47 posts and 354 comments (Total: 401). Current subreddit: netflix.

NameError: name 'prawcore' is not defined

In [None]:
# Define subreddits and query
subreddits = ['netflix', 'NetflixBestOf', 'television', 'TvShows']
query = 'Queen\'s Gambit'

# Create a set to store already-seen content
seen_posts = set()

# Initialize counters
post_counter = 0
comment_counter = 0

# Initialize list to store posts
posts = []

# Scraping loop
for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    # Search in the current subreddit
    for submission in subreddit.search(query, limit=15000, sort='new'):
        try:
            # Check if the post has already been seen
            post_identifier = submission.selftext.strip().lower()

            # If the post hasn't been seen, add it
            if post_identifier not in seen_posts:
                seen_posts.add(post_identifier)
                # Convert unix timestamp to timezone-aware datetime
                created_time = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                created_time_str = created_time.strftime('%Y-%m-%d %H:%M:%S')

                # Store post information
                posts_data = {
                    "title": submission.title,
                    "id": submission.id,
                    "text": submission.selftext,
                    "score": submission.score,
                    "url": submission.url,
                    "created": created_time_str,
                    "num_comments": submission.num_comments,
                    "comments": [],                              # Placeholder for comments
                }

                # Fetch comments
                submission.comments.replace_more(limit=0)        # Replace MoreComments objects with actual comments
                for comment in submission.comments.list():

                        # Check if the comment has been seen
                        comment_identifier = comment.body.strip().lower()
                        if comment_identifier not in seen_posts:
                            seen_posts.add(comment_identifier)

                            # Convert unix timestamp to timezone-aware datetime
                            comment_time = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)
                            comment_time_str = comment_time.strftime('%Y-%m-%d %H:%M:%S')
                            
                            # Check, if comment has an author
                            if comment.author:
                                author = comment.author.name
                                try:
                                    author_karma = comment.author.comment_karma
                                except AttributeError:
                                    author_karma = None
                            else:
                                author = 'Deleted'
                                author_karma = None
                            
                            # Store comment information
                            comment_data = {
                                "comment_id": comment.id,
                                "post_id": submission.id,
                                "comment_text": comment.body,
                                "comment_score": comment.score,
                                "comment_created": comment_time_str,
                                "comment_author": author,
                                "comment_author_karma": author_karma,
                            }

                            # Append comment data to the post
                            posts_data["comments"].append(comment_data)

                            # Increment comment counter
                            comment_counter += 1

                # Append post data to the list
                posts.append(posts_data)

                # Increment post counter
                post_counter += 1

                # Print progress
                print(f"Processed {post_counter} posts and {comment_counter} comments (Total: {len(seen_posts)}). Current subreddit: {subreddit_name}.", end='\r')
            
        except prawcore.exceptions.TooManyRequests as e:
            print(f"Rate limit error occurred: {e}")
            print("Waiting before retrying...")
            time.sleep(60)
        
        # Save the data to a JSON file
        with open("./data/reddit_data.json", "a") as outfile:
            json.dump(posts, outfile, indent=4)
        
        # Sleep for 2 seconds to avoid hitting the Reddit API rate limit
        time.sleep(2)


Processed 178 posts and 2043 comments (Total: 2221). Current subreddit: netflix.mbit.

NameError: name 'prawcore' is not defined

In [17]:
import praw
import prawcore
import time
import json
from datetime import datetime, timezone

# Initialize the Reddit API client
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
    username=username,
    password=password,
)

# Define the subreddits and query
subreddits = ['queensgambit', 'netflix', 'NetflixBestOf', 'television', 'TvShows']
query = 'Queen\'s Gambit'  # Only used for multiple subreddits
limit = 15000

# Create a set to store already-seen content
seen_posts = set()

# Initialize counters
post_counter = 0
comment_counter = 0

# Initialize list to store posts
posts = []

# Function to scrape a subreddit
def scrape_subreddit(subreddit_name, limit, use_query=False, query=None):
    global post_counter, comment_counter

    # Access the subreddit
    subreddit = reddit.subreddit(subreddit_name)

    # Fetch posts (with or without query)
    if use_query:
        subreddit_posts = subreddit.search(query, limit=limit, sort='new')
    else:
        subreddit_posts = subreddit.new(limit=limit)

    # Scraping loop
    for submission in subreddit_posts:
        try:
            # Check if the post has already been seen
            post_identifier = submission.selftext.strip().lower()

            # If the post hasn't been seen, add it
            if post_identifier not in seen_posts:
                seen_posts.add(post_identifier)
                # Convert unix timestamp to timezone-aware datetime
                created_time = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                created_time_str = created_time.strftime('%Y-%m-%d %H:%M:%S')

                # Store post information
                posts_data = {
                    "title": submission.title,
                    "id": submission.id,
                    "text": submission.selftext,
                    "score": submission.score,
                    "url": submission.url,
                    "created": created_time_str,
                    "num_comments": submission.num_comments,
                    "comments": [],  # Placeholder for comments
                }

                # Fetch comments
                submission.comments.replace_more(limit=0)  # Replace MoreComments objects with actual comments
                for comment in submission.comments.list():
                    # Check if the comment has been seen
                    comment_identifier = comment.body.strip().lower()
                    if comment_identifier not in seen_posts:
                        seen_posts.add(comment_identifier)

                        # Convert unix timestamp to timezone-aware datetime
                        comment_time = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)
                        comment_time_str = comment_time.strftime('%Y-%m-%d %H:%M:%S')

                        # Check if the comment has an author
                        if comment.author:
                            author = comment.author.name
                            try:
                                author_karma = comment.author.comment_karma
                            except AttributeError:
                                author_karma = None
                        else:
                            author = 'Deleted'
                            author_karma = None

                        # Store comment information
                        comment_data = {
                            "comment_id": comment.id,
                            "post_id": submission.id,
                            "comment_text": comment.body,
                            "comment_score": comment.score,
                            "comment_created": comment_time_str,
                            "comment_author": author,
                            "comment_author_karma": author_karma,
                        }

                        # Append comment data to the post
                        posts_data["comments"].append(comment_data)

                        # Increment comment counter
                        comment_counter += 1

                # Append post data to the list
                posts.append(posts_data)

                # Increment post counter
                post_counter += 1

                # Print progress
                print(f"Processed {post_counter} posts and {comment_counter} comments (Total: {len(seen_posts)}). Subreddit: {subreddit_name}.", end='\r')

        except prawcore.exceptions.TooManyRequests as e:
            print(f"Rate limit error occurred: {e}")
            print("Waiting before retrying...")
            time.sleep(60)
        
        # Save the data to a JSON file
        with open("./data/reddit_data.json", "w") as outfile:
            json.dump(posts, outfile, indent=4)

        # Sleep for 2 seconds to avoid hitting the Reddit API rate limit
        time.sleep(2)

# Main scraping logic
for subreddit_name in subreddits:
    if subreddit_name == 'queensgambit':
        scrape_subreddit(subreddit_name, limit, use_query=False)  # No query for 'queensgambit'
    else:
        scrape_subreddit(subreddit_name, limit, use_query=True, query=query)  # Use query for other subreddits


Rate limit error occurred: received 429 HTTP responseSubreddit: netflix.mbit.
Waiting before retrying...
Rate limit error occurred: received 429 HTTP responseSubreddit: NetflixBestOf.
Waiting before retrying...
Rate limit error occurred: received 429 HTTP response Subreddit: NetflixBestOf.
Waiting before retrying...
Rate limit error occurred: received 429 HTTP response. Subreddit: NetflixBestOf.
Waiting before retrying...
Rate limit error occurred: received 429 HTTP response. Subreddit: NetflixBestOf.
Waiting before retrying...
Rate limit error occurred: received 429 HTTP response. Subreddit: NetflixBestOf.
Waiting before retrying...
Rate limit error occurred: received 429 HTTP response. Subreddit: NetflixBestOf.
Waiting before retrying...
Processed 711 posts and 17684 comments (Total: 18409). Subreddit: TvShows.on.Of.