# SC4021 - Data Collection, Cleaning and Analysis

## Reddit Data Collection

In [1]:
# Importing the necessary libraries
import praw  # Python Reddit API Wrapper
import pandas as pd
import os

In [2]:
# Define the constants and the Reddit instance

# Create a Reddit instance
reddit = praw.Reddit(user_agent=True, client_id='kqB2Mfaq32Jax9LAmdsr3A',
                     client_secret='7HR4TNjSVDXZrgEwlsrjF0Pcwzdc2w', username='KrisCholakov',
                     password='kyhnoh-pixci0-Bedgit')

# Define the columns for the submissions and comments dataframes
submission_columns = ["author", "created_utc", "distinguished", "id", "name", "num_comments", "score", "selftext", "title", "upvote_ratio", "url"]
comment_columns = ["author", "body", "body_html", "created_utc", "distinguished", "id", "link_id", "parent_id", "score"]

# Define the directory to save the data for the subreddits
data_directory = "subreddits"

In [3]:
# Define the functions to craw the data from Reddit

# Function to get the submissions and the corresponding comments for a subreddit
def get_submissions_and_comments(reddit, subreddit_name, limit=None):
    # Create the lists to store the submissions and comments
    submissions_list = []
    comments_list = []
    
    # Define the counter for the submissions and comments
    submission_cnt, comment_cnt = 1, 1
    
    # Browse the submissions
    for submission in reddit.subreddit(subreddit_name).top(limit=limit):
        # Print the progress - submission title, submission cnt
        print(f'{subreddit_name}-{submission_cnt}', submission.title)
        # Define the submission
        new_submission = {
        "author": submission.author,
        "created_utc": submission.created_utc,
        "distinguished": submission.distinguished,
        "id": submission.id,
        "name": submission.name,
        "num_comments": submission.num_comments,
        "score": submission.score,
        "selftext": submission.selftext,
        "title": submission.title,
        "upvote_ratio": submission.upvote_ratio,
        "url": submission.url
    }
        # Add the submission to the list
        submissions_list.append(new_submission)
        # Get the comments
        submission.comments.replace_more(limit=0)
        # Browse the comments
        for comment in submission.comments.list():
            # Print the progress - comment cnt
            print(f'comment #{comment_cnt}')
            # Define the comment
            new_comment = {
            "author": comment.author,
            "body": comment.body,
            "body_html": comment.body_html,
            "created_utc": comment.created_utc,
            "distinguished": comment.distinguished,
            "id": comment.id,
            "link_id": comment.link_id,
            "parent_id": comment.parent_id,
            "score": comment.score
        }
            # Add the comment to the list
            comments_list.append(new_comment)
            comment_cnt += 1
        submission_cnt += 1

    # Convert the lists to dataframes
    submissions = pd.DataFrame(submissions_list, columns=submission_columns)
    comments = pd.DataFrame(comments_list, columns=comment_columns)
    
    return submissions, comments

In [4]:
# Define the functions to save the data to csv files in the corresponding directory

# Function to save the submissions and comments to csv files
def save_submissions_and_comments(submissions, comments, subreddit_name):
    # Create the directory if it does not exist
    if not os.path.exists(f'{data_directory}/{subreddit_name}'):
        os.makedirs(f'{data_directory}/{subreddit_name}')
    
    # Save the submissions to a csv file
    submissions.to_csv(f'{data_directory}/{subreddit_name}/submissions.csv', index=False)
    # Save the comments to a csv file
    comments.to_csv(f'{data_directory}/{subreddit_name}/comments.csv', index=False)
    
# Function to check if the subreddit directory exists and if the data is already collected
def check_subreddit_data(subreddit_name):
    # Check if the subreddit directory exists
    if not os.path.exists(f'{data_directory}/{subreddit_name}'):
        return False
    # Check if the submissions and comments csv files exist
    if not os.path.exists(f'{data_directory}/{subreddit_name}/submissions.csv') or not os.path.exists(f'{data_directory}/{subreddit_name}/comments.csv'):
        return False
    
    return True

In [5]:
# Define the subreddits to crawl the data from
subreddits = ["VisionPro", "virtualreality", "augmentedreality", "MetaQuestVR", "oculus", "OculusQuest"]

# Crawl the data for the subreddits
for subreddit_name in subreddits:
    # Check if the data is already collected
    if check_subreddit_data(subreddit_name):
        continue
    # Get the submissions and comments
    submissions, comments = get_submissions_and_comments(reddit, subreddit_name, limit=1000)
    # Save the data to csv files
    save_submissions_and_comments(submissions, comments, subreddit_name)

## Analyzing the data

In [6]:
# Importing the necessary libraries
import matplotlib.pyplot as plt

In [7]:
# Define the functions to load the data and analyze it

# Function to load the submissions and comments dataframes
def load_submissions_and_comments(subreddit_name):
    # Load the submissions and comments dataframes
    submissions = pd.read_csv(f'{data_directory}/{subreddit_name}/submissions.csv')
    comments = pd.read_csv(f'{data_directory}/{subreddit_name}/comments.csv')
    
    return submissions, comments

def simple_analyze_submissions_and_comments(submissions, comments):
    # Create a dictionary to store the results
    results = {}

    # Calculate and store the results in the dictionary
    results["Number of submissions"] = len(submissions)
    results["Number of comments"] = len(comments)
    results["Number of unique authors in submissions"] = len(submissions["author"].unique())
    results["Number of unique authors in comments"] = len(comments["author"].unique())
    results["Number of unique submissions"] = len(submissions["id"].unique())
    results["Number of unique comments"] = len(comments["id"].unique())
    comments["word_length"] = comments["body"].apply(lambda x: len(str(x).split()))
    results["Average word length of comments"] = comments["word_length"].mean()
    results["Number of comments that have more than 50 words"] = len(comments[comments["word_length"] > 50])
    results["Number of submissions that have more than 50 words in the selftext"] = len(submissions[submissions["selftext"].apply(lambda x: len(str(x).split())) > 50])
    results["Average score of submissions"] = submissions["score"].mean()
    results["Average score of comments"] = comments["score"].mean()
    results["Average number of comments per submission"] = submissions["num_comments"].mean()

    # Convert the dictionary to a DataFrame
    results = pd.DataFrame(list(results.items()), columns=['Description', 'Data'])
    
    return results

In [8]:
# Perform analysis on the combined data for all subreddits

# Load the data for the subreddits and analyze it
submissions_list = []
comments_list = []
for subreddit_name in subreddits:
    # Load the submissions and comments dataframes
    submissions, comments = load_submissions_and_comments(subreddit_name)
    # Add the subreddit name to the submissions and comments dataframes
    submissions["subreddit"] = subreddit_name
    comments["subreddit"] = subreddit_name
    # Add the submissions and comments to the lists
    submissions_list.append(submissions)
    comments_list.append(comments)

# Concatenate the submissions and comments dataframes
all_submissions = pd.concat(submissions_list)
all_comments = pd.concat(comments_list)

# Simple analyze the data
simple_analyze_submissions_and_comments(all_submissions, all_comments)

Unnamed: 0,Description,Data
0,Number of submissions,5990.0
1,Number of comments,635630.0
2,Number of unique authors in submissions,3875.0
3,Number of unique authors in comments,120496.0
4,Number of unique submissions,5990.0
5,Number of unique comments,635629.0
6,Average word length of comments,28.200571
7,Number of comments that have more than 50 words,91634.0
8,Number of submissions that have more than 50 w...,752.0
9,Average score of submissions,1068.839232


In [9]:
# Show the users with most comments
all_comments["author"].value_counts()

author
OXIOXIOXI            1789
Theknyt              1224
SvenViking           1179
JorgTheElder         1032
ILoveRegenHealth      753
                     ... 
Ok_Pipe9153             1
NotDominusGhaul         1
iamnotshook             1
BryanwithaYnotanI       1
Raj3d                   1
Name: count, Length: 120495, dtype: int64

In [10]:
# Show the users with most submissions
all_submissions["author"].value_counts()

author
AR_MR_XR            57
acetylan            36
AugmentedThinker    33
SpatialComputing    32
Malkmus1979         32
                    ..
pampas93             1
lenanena             1
fx_mania             1
jbroadway            1
Rollertoaster7       1
Name: count, Length: 3874, dtype: int64

In [11]:
# Show the most common comments
all_comments["body"].value_counts()

body
[Ð¸Ð·Ñ‚Ñ€Ð¸Ñ‚Ð¾]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [12]:
# Show the most common comments with more than 10 words and more than 1 occurrence add the comment ID too
all_repeated_comments = all_comments[all_comments["body"].apply(lambda x: len(str(x).split()) >= 10)]["body"].value_counts()[all_comments[all_comments["body"].apply(lambda x: len(str(x).split()) >= 10)]["body"].value_counts() > 1]
all_repeated_comments

body
Our automod detected strong language being used. Please consider rewriting your comment to something more polite. If this is an error, please don't hesitate to reach out to us.\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/oculus) if you have any questions or concerns.*                                                                                                                 89
#I no longer allow Reddit to profit from my content - Mass exodus 2023 -- mass edited with https://redact.dev/                                                                                                                                                                                                                                                                                                                                                                          81
Hey there!  \nI am so excited to tell you tha

## Data Cleaning

In [13]:
# Importing the necessary libraries


Comments Cleaning

In [14]:
# Define the function to clean the comments
def clean_comments(subreddits, min_word_count):
    subreddits_data = {}
    for subreddit_name in subreddits:
        # Load the submissions and comments dataframes
        submissions, comments = load_submissions_and_comments(subreddit_name)
        # Add the submissions and comments dataframes to the dictionary
        subreddits_data[subreddit_name] = {"submissions": submissions, "comments": comments}
        
    # Loop through the subreddits
    for subreddit_name, data in subreddits_data.items():
        initial_count = len(data["comments"])

        # Remove the comments that occur in all_repeated_comments and have more than 3 occurrences
        data["comments"] = data["comments"][~data["comments"]["body"].isin(all_repeated_comments[all_repeated_comments > 3].index)]
        print(f"{subreddit_name}: Removed {initial_count - len(data['comments'])} comments that occur in all_repeated_comments and have more than 3 occurrences")
        initial_count = len(data["comments"])

        # Clear the comments with less than min_word_count words
        data["comments"] = data["comments"][data["comments"]["body"].apply(lambda x: len(str(x).split()) >= min_word_count)]
        print(f"{subreddit_name}: Removed {initial_count - len(data['comments'])} comments with less than {min_word_count} words")
        initial_count = len(data["comments"])

        # Remove duplicated comments with same body and author
        data["comments"] = data["comments"].drop_duplicates(subset=["body", "author"])
        print(f"{subreddit_name}: Removed {initial_count - len(data['comments'])} duplicated comments")
        initial_count = len(data["comments"])

        # Remove comments with high percentage of special characters
        data["comments"] = data["comments"][data["comments"]["body"].apply(lambda x: len([c for c in str(x) if not c.isalnum()]) / len(str(x)) < 0.5)]
        print(f"{subreddit_name}: Removed {initial_count - len(data['comments'])} comments with high percentage of special characters")

        # Save the cleaned comments to a csv file
        data["comments"].to_csv(f'{data_directory}/{subreddit_name}/comments_cleaned_{min_word_count}.csv', index=False)

        # Print the number of comments after cleaning
        print(f"{subreddit_name}: Number of comments after cleaning: {len(data['comments'])}")
        print()

    # Print the total number of comments after cleaning
    total_comments = sum([len(data["comments"]) for data in subreddits_data.values()])
    print(f"Total number of comments after cleaning: {total_comments}")

    # Print the number of comments that were removed using
    print(f"Number of comments that were removed using all_comments: {len(all_comments) - total_comments}")

In [15]:
# Clean the comments for all subreddits

# Clean with min_word_count = 10
clean_comments(subreddits, 10)
# Clean with min_word_count = 50
clean_comments(subreddits, 50)

VisionPro: Removed 40 comments that occur in all_repeated_comments and have more than 3 occurrences
VisionPro: Removed 23764 comments with less than 10 words
VisionPro: Removed 111 duplicated comments
VisionPro: Removed 5 comments with high percentage of special characters
VisionPro: Number of comments after cleaning: 58786

virtualreality: Removed 156 comments that occur in all_repeated_comments and have more than 3 occurrences
virtualreality: Removed 44536 comments with less than 10 words
virtualreality: Removed 107 duplicated comments
virtualreality: Removed 11 comments with high percentage of special characters
virtualreality: Number of comments after cleaning: 104832

augmentedreality: Removed 8 comments that occur in all_repeated_comments and have more than 3 occurrences
augmentedreality: Removed 3819 comments with less than 10 words
augmentedreality: Removed 18 duplicated comments
augmentedreality: Removed 0 comments with high percentage of special characters
augmentedreality: N

In [19]:
# Combine the cleaned comments for all subreddits
cleaned_comments_list = []
min_word_count = 50
for subreddit_name in subreddits:
    # Load the cleaned comments
    cleaned_comments = pd.read_csv(f'{data_directory}/{subreddit_name}/comments_cleaned_{min_word_count}.csv')
    # Add the subreddit name to the cleaned comments
    cleaned_comments["subreddit"] = subreddit_name
    # Add the cleaned comments to the list
    cleaned_comments_list.append(cleaned_comments)

# Concatenate all the cleaned comments into a single DataFrame
all_cleaned_comments = pd.concat(cleaned_comments_list)

# Save the cleaned comments to a csv file
all_cleaned_comments.to_csv(f'{data_directory}/all/comments_cleaned_{min_word_count}.csv', index=False)

VisionPro: Number of cleaned comments: 14099
virtualreality: Number of cleaned comments: 25290
augmentedreality: Number of cleaned comments: 1382
MetaQuestVR: Number of cleaned comments: 1804
oculus: Number of cleaned comments: 28730
OculusQuest: Number of cleaned comments: 22855


In [20]:
# Load the cleaned comments for all subreddits
cleaned_comments = pd.read_csv(f'{data_directory}/all/comments_cleaned_{min_word_count}.csv')

# Print the number of comments 
len(cleaned_comments)

94160