# Sentiment Analysis - Part I (Web Scraping)

In [5]:
import json
import os
import pandas as pd
import praw
import time
from datetime import datetime, timezone
from langdetect import detect
from prawcore.exceptions import RequestException # To catch rate limit exceptions

In [6]:
# Import the config file
config_file = "config.json"
if os.path.exists(config_file):
    with open(config_file) as file:
        config = json.load(file)
        if 'client_id' in config:
            client_id = config['client_id']
            client_secret = config['client_secret']
            user_agent = config['user_agent']
            username = config['username']
            password = config['password']
        else:
            raise KeyError("API credentials not found in config file.")
else:
    raise FileNotFoundError(f"Config file {config_file} not found")

In [7]:
# Initialize the Reddit API client
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
    username=username,
    password=password,
)

In [None]:
# Define subreddits and query
subreddits = ['queensgambit', 'netflix', 'NetflixBestOf', 'television', 'TvShows']
query = 'Queen\'s Gambit'

# Create a set to store already-seen content
seen_posts = set()

# Initialize counters
post_counter = 0
comment_counter = 0

# Initialize list to store posts
posts = []

# Scraping loop
for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)

    # Search in the current subreddit
    for submission in subreddit.search(query, limit=15000, sort='new'):
        try:
            # Ensure the post text is non-empty and detect the language of the post text
            if submission.selftext and detect(submission.selftext) == 'en':
                try:
                    # Check if the post has already been seen
                    post_identifier = submission.selftext.strip().lower()

                    # If the post hasn't been seen, add it
                    if post_identifier not in seen_posts:
                        seen_posts.add(post_identifier)
                        # Convert unix timestamp to timezone-aware datetime
                        created_time = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                        created_time_str = created_time.strftime('%Y-%m-%d %H:%M:%S')

                        # Store post information
                        posts_data = {
                            "title": submission.title,
                            "id": submission.id,
                            "text": submission.selftext,
                            "score": submission.score,
                            "url": submission.url,
                            "created": created_time_str,
                            "num_comments": submission.num_comments,
                            "comments": [],                              # Placeholder for comments
                        }

                        # Fetch comments
                        submission.comments.replace_more(limit=0)        # Replace MoreComments objects with actual comments
                        for comment in submission.comments.list():
                            # Check if the comment is non-empty and in English
                            if comment.body and detect(comment.body) == 'en':
                                # Check if the comment has been seen
                                comment_identifier = comment.body.strip().lower()
                                if comment_identifier not in seen_posts:
                                    seen_posts.add(comment_identifier)

                                    # Convert unix timestamp to timezone-aware datetime
                                    comment_time = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)
                                    comment_time_str = comment_time.strftime('%Y-%m-%d %H:%M:%S')
                                    
                                    # Check, if comment has an author
                                    if comment.author:
                                        author = comment.author.name
                                        try:
                                            author_karma = comment.author.comment_karma
                                        except AttributeError:
                                            author_karma = None
                                    else:
                                        author = 'Deleted'
                                        author_karma = None
                                    
                                    # Store comment information
                                    comment_data = {
                                        "comment_id": comment.id,
                                        "post_id": submission.id,
                                        "comment_text": comment.body,
                                        "comment_score": comment.score,
                                        "comment_created": comment_time_str,
                                        "comment_author": author,
                                        "comment_author_karma": author_karma,
                                    }

                                    # Append comment data to the post
                                    posts_data["comments"].append(comment_data)

                                    # Increment comment counter
                                    comment_counter += 1

                        # Append post data to the list
                        posts.append(posts_data)

                        # Increment post counter
                        post_counter += 1

                        # Print progress
                        print(f"Processed {post_counter} posts and {comment_counter} comments (Total: {len(seen_posts)}). Current subreddit: {subreddit_name}.", end='\r')
                    
                except prawcore.exceptions.TooManyRequests as e:
                    print(f"Rate limit error occurred: {e}")
                    print("Waiting before retrying...")
                    time.sleep(60)
                
                # Save the data to a JSON file
                with open("./data/reddit_posts.json", "w") as outfile:
                    json.dump(posts, outfile, indent=4)
                
                # Sleep for 2 seconds to avoid hitting the Reddit API rate limit
                time.sleep(2)
        
        except Exception as e:
            print(f"Error detecting language: {e}")
            continue


Error detecting language: No features in text. Current subreddit: queensgambit.
Error detecting language: name 'prawcore' is not definedsubreddit: queensgambit.
Error detecting language: name 'prawcore' is not definedt subreddit: queensgambit.
Error detecting language: name 'prawcore' is not definedent subreddit: netflix.it.
Error detecting language: name 'prawcore' is not definedent subreddit: netflix.
Error detecting language: name 'prawcore' is not definedrent subreddit: netflix.
Error detecting language: name 'prawcore' is not defined
Error detecting language: name 'prawcore' is not definedrent subreddit: netflix.
Error detecting language: name 'prawcore' is not definedrent subreddit: netflix.
Error detecting language: name 'prawcore' is not definedrent subreddit: netflix.
Error detecting language: name 'prawcore' is not defined
Error detecting language: name 'prawcore' is not definedrent subreddit: netflix.
Error detecting language: name 'prawcore' is not defined
Error detecting l