# Sentiment Analysis Project

In [29]:
import json
import os
import pandas as pd
import praw
import time
from datetime import datetime, timezone
from prawcore.exceptions import RequestException # To catch rate limit exceptions

In [30]:
config_file = "config.json"
if os.path.exists(config_file):
    with open(config_file) as file:
        config = json.load(file)
        if 'client_id' in config:
            client_id = config['client_id']
            client_secret = config['client_secret']
            user_agent = config['user_agent']
            username = config['username']
            password = config['password']
        else:
            raise KeyError("API credentials not found in config file.")
else:
    raise FileNotFoundError(f"Config file {config_file} not found")

In [31]:
# Initialize the Reddit API client
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
    username=username,
    password=password,
)

Comment: Access works only with username, not with email address!

In [32]:
# Define the subreddit and the number of posts to fetch
subreddit_name = 'queensgambit'
post_limit = 10000

# Fetch posts from the subreddit
subreddit = reddit.subreddit(subreddit_name)
posts = subreddit.new(limit=post_limit) # .new fetches the most recent submissions

In [34]:
# Initialize an empty list to store the data
reddit_data = []
post_counter = 0
comment_counter = 0
# Process the posts
for post in posts:
    try:
        # Convert the Unix timestamp to a timezone-aware datetime
        created_time = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)
        # Format the datetime as a string
        created_time_str = created_time.strftime('%Y-%m-%d %H:%M:%S')

        post_data = {
            "title": post.title,
            "score": post.score,
            "id": post.id,
            "url": post.url,
            "created": created_time_str,
            "text": post.selftext,
            "num_comments": post.num_comments,
            "comments": []

        }

        '''print(f"Title: {post.title}")
        print(f"Score: {post.score}")
        print(f"ID: {post.id}")
        print(f"URL: {post.url}")
        print(f"Created: {created_time_str}")
        print(f"Text: {post.selftext}")
        print(f"Number of Comments: {post.num_comments}")
        print("-" * 80)'''

        # Fetch comments
        post.comments.replace_more(limit=0)  # Ensures that all 'MoreComments' objects are replaced, allowing access to all comments

        for comment in post.comments.list():
            comment_time = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)
            comment_time_str = comment_time.strftime('%Y-%m-%d %H:%M:%S')
            # Check, if author exists
            if comment.author:
                author = comment.author.name
                # Attempt to fetch author's comment karma
                try:
                    author_karma = comment.author.comment_karma
                except AttributeError:
                    author_karma = None
            else:
                author_karma = None
                author = "Deleted"

            comment_data = {
                "comment_id": comment.id,
                "post_id": post.id,             # To link the comment to its parent post
                "comment_text": comment.body,
                "comment_score": comment.score,
                "comment_created": comment_time_str,
                "comment_author": author,
                "author_karma": author_karma
            }

            '''print(f"  Comment ID: {comment.id}")
            print(f"  Comment Text: {comment.body}")
            print(f"  Comment Score: {comment.score}")
            print(f"  Comment Created: {comment_time_str}")
            print(f"  Comment Author: {comment.author}")
            print(f"  Comment Author: {author}")
            print(f"  Author Karma: {author_karma}")
            print("-" * 80)'''

            post_data["comments"].append(comment_data)
            comment_counter += 1

        reddit_data.append(post_data)
        post_counter += 1
        print(f"Processed {post_counter} posts and {comment_counter} comments (Total: {post_counter + comment_counter}).", end="\r")
    
    except RequestException as e:
        print(f"Rate limit error occurred: {e}")
        print("Waiting before retrying...")
        time.sleep(60)

    except praw.exceptions.PRAWException as e:
        print(f"An error occurred: {e}")
        time.sleep(60)
    
    # Save the data to a JSON file
    with open("reddit_data.json", "w") as outfile:
        json.dump(reddit_data, outfile, indent=4)
    
    time.sleep(2)  # Sleep for 2 seconds to avoid hitting the Reddit API rate limit

Processed 967 posts and 6574 comments (Total: 7541).

In [19]:
# Load JSON data
with open("reddit_data.json", "r") as file:
    data = json.load(file)

# Normalize the JSON data
posts_df = pd.json_normalize(data)

# Normalize comments data
comments_data = []
for post in data:
    for comment in post['comments']:
        #comment['post_id'] = post['post_id']
        comments_data.append(comment)

comments_df = pd.DataFrame(comments_data)

# Display DataFrames
posts_df.head()

Unnamed: 0,title,score,id,url,created,text,num_comments,comments
0,Late to Play,43,1futic9,https://www.reddit.com/r/queensgambit/comments...,2024-10-02 23:00:44,"I know I’m a bit late to the game, but The Que...",13,"[{'comment_id': 'lq7ugjb', 'comment_text': 'I ..."
1,Funny question,10,1fs558p,https://www.reddit.com/r/queensgambit/comments...,2024-09-29 14:06:51,So I have OCD and one of my triggers is people...,14,"[{'comment_id': 'lphutpg', 'comment_text': 'So..."
2,Alma and Beth,50,1fqc156,https://www.reddit.com/r/queensgambit/comments...,2024-09-27 01:07:02,Just an appreciation post for Alma and Beth's ...,11,"[{'comment_id': 'lp4bmil', 'comment_text': 'Th..."
3,Beth will forever live rent free in my head🥰,319,1fojobd,https://www.reddit.com/gallery/1fojobd,2024-09-24 18:31:14,,20,"[{'comment_id': 'lorh42m', 'comment_text': 'Th..."
4,Queen gambit,7,1foc7bl,https://www.reddit.com/r/queensgambit/comments...,2024-09-24 13:14:50,I just started watching it is it worth it?,9,"[{'comment_id': 'lopun1k', 'comment_text': 'Ye..."


In [20]:
comments_df.head()

Unnamed: 0,comment_id,comment_text,comment_score,comment_created,comment_author,author_karma
0,lq7ugjb,I recommend watching the Queens Gambit again,10,2024-10-03 23:19:30,SirZacharia,50248.0
1,lq24ob7,The Queen’s Gambit is soooooo good!! I recomm...,4,2024-10-02 23:48:00,Lost_As_Alice_,14735.0
2,lq3ot0r,"It’s not a series, but if you liked her (can’t...",3,2024-10-03 07:04:53,ButterflyBelleFL,2546.0
3,lqq8uv0,"I’m reading the book before watching the show,...",3,2024-10-07 04:06:15,I-Am-Baldy,981.0
4,lqaqh57,I like Searching for Bobby Fischer.\n\n-Seven-...,1,2024-10-04 13:31:55,Numerous-Editor-8778,228.0


## Access Subreddit 'Netflix' Query 'Queen's Gambit'

In [None]:
from langdetect import detect  # Import langdetect

# Define subreddit and search query
subreddit = reddit.subreddit('netflix')
query = 'Queen\'s Gambit'

# Scrape posts
posts = []
for submission in subreddit.search(query, limit=10000, sort='new'):
    try:
        # Detect the language of the post text
        if detect(submission.title + ' ' + submission.selftext) == 'en':
            posts.append(submission.title + ' ' + submission.selftext)
    except Exception as e:
        # In case of a detection error, skip the post
        print(f"Error detecting language: {e}")
        continue

In [38]:
from langdetect import detect  # Import langdetect

# Define subreddit and search query
subreddit = reddit.subreddit('netflix')
query = 'Queen\'s Gambit'

# Scrape posts
posts_data = []

# Initialize counters
post_counter = 0
comment_counter = 0

for submission in subreddit.search(query, limit=10000, sort='new'):
    try:
        # Detect the language of the post text
        if detect(submission.title + ' ' + submission.selftext) == 'en':
            try:

                # Convert the Unix timestamp to a timezone-aware datetime
                created_time = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                created_time_str = created_time.strftime('%Y-%m-%d %H:%M:%S')

                # Store post information
                post_data = {
                    "title": submission.title,
                    "score": submission.score,
                    "id": submission.id,
                    "url": submission.url,
                    "created": created_time_str,
                    "text": submission.selftext,
                    "num_comments": submission.num_comments,
                    "comments": []  # Will be filled with comment data
                }

                # Fetch comments
                submission.comments.replace_more(limit=0)  # Ensures all comments are fetched
                for comment in submission.comments.list():
                    comment_time = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)
                    comment_time_str = comment_time.strftime('%Y-%m-%d %H:%M:%S')

                    # Check if comment has an author
                    if comment.author:
                        author = comment.author.name
                        try:
                            author_karma = comment.author.comment_karma
                        except AttributeError:
                            author_karma = None
                    else:
                        author = "Deleted"
                        author_karma = None

                    # Store comment information
                    comment_data = {
                        "comment_id": comment.id,
                        "post_id": submission.id,  # Link comment to its parent post
                        "comment_text": comment.body,
                        "comment_score": comment.score,
                        "comment_created": comment_time_str,
                        "comment_author": author,
                        "author_karma": author_karma
                    }

                    # Append comment data to post's comment list
                    post_data["comments"].append(comment_data)

                    # Increment comment counter
                    comment_counter += 1

                # Append the post data with its comments to the main list
                posts_data.append(post_data)
                # Increment post counter
                post_counter += 1
                print(f"Processed {post_counter} posts and {comment_counter} comments (Total: {post_counter + comment_counter}).", end="\r")
            
            except RequestException as e:
                print(f"Rate limit error occurred: {e}")
                print("Waiting before retrying...")
                time.sleep(60)
            
            # Save the data to a JSON file
            with open("netflix_reddit_data.json", "w") as outfile:
                json.dump(posts_data, outfile, indent=4)
            
            # Sleep for 2 seconds to avoid hitting the Reddit API rate limit
            time.sleep(2)

    except Exception as e:
        # In case of a detection error, skip the post
        print(f"Error detecting language: {e}")
        continue

Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Processed 138 posts and 3671 comments (Total: 3809).

In [39]:
# Load JSON data
with open("netflix_reddit_data.json", "r") as file:
    data = json.load(file)

# Normalize the JSON data
nf_posts_df = pd.json_normalize(data)

# Normalize comments data
comments_data = []
for post in data:
    for comment in post['comments']:
        #comment['post_id'] = post['post_id']
        comments_data.append(comment)

nf_comments_df = pd.DataFrame(comments_data)

## Access Subreddit 'Television' Query 'Queen's Gambit'

In [40]:
from langdetect import detect  # Import langdetect

# Define subreddit and search query
subreddit = reddit.subreddit('television')
query = 'Queen\'s Gambit'

# Scrape posts
posts_data = []

# Initialize counters
post_counter = 0
comment_counter = 0

for submission in subreddit.search(query, limit=10000, sort='new'):
    try:
        # Detect the language of the post text
        if detect(submission.title + ' ' + submission.selftext) == 'en':
            try:

                # Convert the Unix timestamp to a timezone-aware datetime
                created_time = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                created_time_str = created_time.strftime('%Y-%m-%d %H:%M:%S')

                # Store post information
                post_data = {
                    "title": submission.title,
                    "score": submission.score,
                    "id": submission.id,
                    "url": submission.url,
                    "created": created_time_str,
                    "text": submission.selftext,
                    "num_comments": submission.num_comments,
                    "comments": []  # Will be filled with comment data
                }

                # Fetch comments
                submission.comments.replace_more(limit=0)  # Ensures all comments are fetched
                for comment in submission.comments.list():
                    comment_time = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)
                    comment_time_str = comment_time.strftime('%Y-%m-%d %H:%M:%S')

                    # Check if comment has an author
                    if comment.author:
                        author = comment.author.name
                        try:
                            author_karma = comment.author.comment_karma
                        except AttributeError:
                            author_karma = None
                    else:
                        author = "Deleted"
                        author_karma = None

                    # Store comment information
                    comment_data = {
                        "comment_id": comment.id,
                        "post_id": submission.id,  # Link comment to its parent post
                        "comment_text": comment.body,
                        "comment_score": comment.score,
                        "comment_created": comment_time_str,
                        "comment_author": author,
                        "author_karma": author_karma
                    }

                    # Append comment data to post's comment list
                    post_data["comments"].append(comment_data)

                    # Increment comment counter
                    comment_counter += 1

                # Append the post data with its comments to the main list
                posts_data.append(post_data)
                # Increment post counter
                post_counter += 1
                print(f"Processed {post_counter} posts and {comment_counter} comments (Total: {post_counter + comment_counter}).", end="\r")
            
            except RequestException as e:
                print(f"Rate limit error occurred: {e}")
                print("Waiting before retrying...")
                time.sleep(60)
            
            # Save the data to a JSON file
            with open("television_reddit_data.json", "w") as outfile:
                json.dump(posts_data, outfile, indent=4)
            
            # Sleep for 2 seconds to avoid hitting the Reddit API rate limit
            time.sleep(2)

    except Exception as e:
        # In case of a detection error, skip the post
        print(f"Error detecting language: {e}")
        continue

Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Error detecting language: received 429 HTTP response
Processed 79 posts and 6483 comments (Total: 6562).

## Access Subreddit 'NetflixBestOf' Query 'Queen's Gambit'

In [None]:
from langdetect import detect  # Import langdetect

# Define subreddit and search query
subreddit = reddit.subreddit('NetflixBestOf')
query = 'Queen\'s Gambit'

# Scrape posts
posts_data = []

# Initialize counters
post_counter = 0
comment_counter = 0

for submission in subreddit.search(query, limit=10000, sort='new'):
    try:
        # Detect the language of the post text
        if detect(submission.title + ' ' + submission.selftext) == 'en':
            try:

                # Convert the Unix timestamp to a timezone-aware datetime
                created_time = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                created_time_str = created_time.strftime('%Y-%m-%d %H:%M:%S')

                # Store post information
                post_data = {
                    "title": submission.title,
                    "score": submission.score,
                    "id": submission.id,
                    "url": submission.url,
                    "created": created_time_str,
                    "text": submission.selftext,
                    "num_comments": submission.num_comments,
                    "comments": []  # Will be filled with comment data
                }

                # Fetch comments
                submission.comments.replace_more(limit=0)  # Ensures all comments are fetched
                for comment in submission.comments.list():
                    comment_time = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)
                    comment_time_str = comment_time.strftime('%Y-%m-%d %H:%M:%S')

                    # Check if comment has an author
                    if comment.author:
                        author = comment.author.name
                        try:
                            author_karma = comment.author.comment_karma
                        except AttributeError:
                            author_karma = None
                    else:
                        author = "Deleted"
                        author_karma = None

                    # Store comment information
                    comment_data = {
                        "comment_id": comment.id,
                        "post_id": submission.id,  # Link comment to its parent post
                        "comment_text": comment.body,
                        "comment_score": comment.score,
                        "comment_created": comment_time_str,
                        "comment_author": author,
                        "author_karma": author_karma
                    }

                    # Append comment data to post's comment list
                    post_data["comments"].append(comment_data)

                    # Increment comment counter
                    comment_counter += 1

                # Append the post data with its comments to the main list
                posts_data.append(post_data)
                # Increment post counter
                post_counter += 1
                print(f"Processed {post_counter} posts and {comment_counter} comments (Total: {post_counter + comment_counter}).", end="\r")
            
            except RequestException as e:
                print(f"Rate limit error occurred: {e}")
                print("Waiting before retrying...")
                time.sleep(60)
            
            # Save the data to a JSON file
            with open("nfbo_reddit_data.json", "w") as outfile:
                json.dump(posts_data, outfile, indent=4)
            
            # Sleep for 2 seconds to avoid hitting the Reddit API rate limit
            time.sleep(2)

    except Exception as e:
        # In case of a detection error, skip the post
        print(f"Error detecting language: {e}")
        continue

## Access Subreddit 'all' Query 'Queen's Gambit'