# Sentiment Analysis Project - Part II (Data Preprocessing)

## Load Data

In [1]:
import json
import pandas as pd

# Read the scraped Reddit dump: a JSON list with one object per post
with open("./data/reddit_data.json", "r") as fh:
    data = json.load(fh)

# One row per post; the nested 'comments' list is kept as a single column
posts_df = pd.json_normalize(data)

# One row per comment: flatten every post's 'comments' list into a single
# list of dicts (each comment dict already carries its own 'post_id').
comments_data = [comment for post in data for comment in post['comments']]
comments_df = pd.DataFrame(comments_data)

# Preview the posts table
posts_df.head()

Unnamed: 0,title,id,text,score,url,created,num_comments,comments
0,"Without question, Chloe was KGB! It was not su...",1i98h6m,"Chloe was “in bed”, pun intended, with the KGB...",41,https://www.reddit.com/r/queensgambit/comments...,2025-01-24 22:54:08,10,"[{'comment_id': 'm909dm2', 'post_id': '1i98h6m..."
1,No russian villanization,1i91w7h,I really love how this show didn't villanize r...,63,https://www.reddit.com/r/queensgambit/comments...,2025-01-24 18:11:37,1,"[{'comment_id': 'm9ctg8j', 'post_id': '1i91w7h..."
2,Matt and Mike’s acting in the Queens Gambit,1i6uc7f,Did anyone else think that the acting of the a...,0,https://www.reddit.com/r/queensgambit/comments...,2025-01-21 21:48:40,8,"[{'comment_id': 'm8fe51z', 'post_id': '1i6uc7f..."
3,Why did Mrs. Deardorff say “you should be in t...,1i6lye5,Why did Mrs. Deardorff say “you should be in t...,40,https://www.reddit.com/r/queensgambit/comments...,2025-01-21 16:04:34,13,"[{'comment_id': 'm8d9wxl', 'post_id': '1i6lye5..."
4,What makes Beth a loveable person?,1i45hpc,"She makes friends, well wishers everywhere she...",33,https://www.reddit.com/r/queensgambit/comments...,2025-01-18 11:18:07,6,"[{'comment_id': 'm7tg8u6', 'post_id': '1i45hpc..."


In [2]:
comments_df.head()

Unnamed: 0,comment_id,post_id,comment_text,comment_score,comment_created,comment_author,comment_author_karma
0,m909dm2,1i98h6m,I really don't think Alma was poisoned by the ...,79,2025-01-25 00:03:37,ungainlygay,29417.0
1,m940hnv,1i98h6m,I thought her name was Cleo ?,12,2025-01-25 16:26:11,Smthcool1,102.0
2,m90a0pp,1i98h6m,Interesting take. Alma would have been suscept...,9,2025-01-25 00:07:07,Fuertebrazos,2860.0
3,m8zz2do,1i98h6m,Your Alma theory is interesting. And sad. Mayb...,5,2025-01-24 23:08:41,YellowDaisySpider,2676.0
4,m91ewjy,1i98h6m,That’s a pretty neat theory,2,2025-01-25 04:08:15,SirZacharia,50258.0


In [3]:
print(f"Entries in posts_df: {posts_df.shape[0]}")
print(f"Entries in comments_df: {comments_df.shape[0]}")

Entries in posts_df: 711
Entries in comments_df: 17391


## Combine posts and comments df

In [4]:
posts_df.columns

Index(['title', 'id', 'text', 'score', 'url', 'created', 'num_comments',
       'comments'],
      dtype='object')

In [5]:
comments_df.columns

Index(['comment_id', 'post_id', 'comment_text', 'comment_score',
       'comment_created', 'comment_author', 'comment_author_karma'],
      dtype='object')

In [6]:
# Rename posts_df 'title' column to 'post_title', 'id' column to 'post_id'.
# Reassign instead of using inplace=True so the cell produces a new frame
# rather than silently mutating one that earlier cells already displayed.
posts_df = posts_df.rename(columns={'title': 'post_title', 'id': 'post_id'})
posts_df.columns

Index(['post_title', 'post_id', 'text', 'score', 'url', 'created',
       'num_comments', 'comments'],
      dtype='object')

In [7]:
# Rename comments_df 'comment_text' column to 'text', 'comment_score' column to 'score', 'comment_created' to 'created'
# so that comments share column names with posts_df.
# Reassign instead of using inplace=True so the cell produces a new frame
# rather than silently mutating one that earlier cells already displayed.
comments_df = comments_df.rename(
    columns={'comment_text': 'text', 'comment_score': 'score', 'comment_created': 'created'}
)
comments_df.columns

Index(['comment_id', 'post_id', 'text', 'score', 'created', 'comment_author',
       'comment_author_karma'],
      dtype='object')

In [8]:
# Remove 'comments' column from posts_df.
# errors='ignore' plus reassignment makes the cell safe to re-run: the
# in-place drop raised KeyError on a second execution because the column
# was already gone.
posts_df = posts_df.drop(columns='comments', errors='ignore')

In [9]:
# Concatenate the 'posts_df' and 'comments_df' DataFrames
# The two frames are stacked row-wise; columns that exist in only one of them
# (e.g. 'post_title'/'url'/'num_comments' for posts, 'comment_id'/'comment_author'
# for comments) are filled with NaN on the rows coming from the other frame.
# ignore_index=True discards both original indexes and renumbers rows from 0.
combined_df = pd.concat([posts_df, comments_df], ignore_index=True)
combined_df.columns

Index(['post_title', 'post_id', 'text', 'score', 'url', 'created',
       'num_comments', 'comment_id', 'comment_author', 'comment_author_karma'],
      dtype='object')

## Remove Empty Entries

In [10]:
# Remove rows where 'text' column is missing, empty, or only contains whitespace.
# Strip once and reuse the result; missing values stay NaN after .str.strip(),
# so they are rejected by the notna() half of the mask.
stripped_text = combined_df['text'].str.strip()
# .copy() gives an independent frame, so later cells that add columns to
# combined_df do not trigger pandas' SettingWithCopyWarning.
combined_df = combined_df[stripped_text.notna() & (stripped_text != '')].copy()

In [11]:
print(f"Entries: {combined_df.shape[0]}")

Entries: 18101


## Remove Entries That Are Too Short

In [12]:
def clean_data(df: pd.DataFrame, column: str, word_threshold: int = 15) -> pd.DataFrame:
    """Keep only the rows whose text in ``column`` has more than ``word_threshold`` words.

    Words are counted by splitting the text on whitespace. Rows for which no
    count can be computed (e.g. missing text) compare as False and are dropped.

    Args:
        df: Frame to filter; it is not modified.
        column: Name of the text column to measure.
        word_threshold: Rows are kept only when their word count is strictly
            greater than this value. Defaults to 15, the cut-off this notebook
            has always used.

    Returns:
        A new DataFrame containing the surviving rows with their original index.
    """
    word_counts = df[column].str.split().str.len()
    # .copy() returns an independent frame so callers can add columns to the
    # result without triggering pandas' SettingWithCopyWarning.
    return df[word_counts > word_threshold].copy()

combined_df = clean_data(combined_df, 'text')

In [13]:
print(f"Entries: {combined_df.shape[0]}")

Entries: 10074


## Remove Duplicates

In [14]:
# Check if there are any duplicates in the 'text' column
combined_df[combined_df.duplicated(subset='text')]

Unnamed: 0,post_title,post_id,text,score,url,created,num_comments,comment_id,comment_author,comment_author_karma


In [15]:
# Remove duplicates in the 'text' column, keeping the first occurrence.
# The duplicate check in the previous cell currently returns no rows, so this
# is a no-op on today's data, but it keeps the section correct if the dump is
# refreshed. Reassigning (rather than inplace=True) keeps the cell re-runnable
# and avoids mutating a filtered frame in place.
combined_df = combined_df.drop_duplicates(subset='text')

In [16]:
print(f"Entries: {combined_df.shape[0]}")

Entries: 10074


## Preprocess Text Data

In [17]:
# Remove unnecessary characters from the 'text' column, tokenize the text, and remove stopwords
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# word_tokenize needs the Punkt sentence-tokenizer models in addition to the
# stop-word list; without them a fresh environment fails with LookupError.
# Recent NLTK releases look the models up as 'punkt_tab', older ones as
# 'punkt', so both are requested (already-installed packages are skipped).
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Lazily-built cache so the stop-word corpus is read once instead of once per row.
_ENGLISH_STOP_WORDS = None


def _english_stop_words() -> frozenset:
    """Return NLTK's English stop-word list as a frozenset, loading it on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_and_remove_stopwords(text: str) -> list:
    """Strip noise from a raw post/comment and return its non-stop-word tokens.

    Cleaning steps, in order: normalise typographic apostrophes, remove URLs,
    remove @mentions and #hashtags, remove every character that is not a word
    character, whitespace or one of ``, ' . ? !``, remove remaining characters
    outside the Basic Multilingual Plane, and collapse runs of whitespace.
    The cleaned text is tokenized with ``word_tokenize`` and tokens whose
    lowercase form is an English stop word are discarded. Token casing and the
    retained punctuation tokens are left as they are.

    Args:
        text: Raw text of one post or comment.

    Returns:
        The remaining tokens, in their original order.
    """
    # Typographic apostrophes would be deleted by the special-character filter
    # below, turning "That’s" into "Thats" and hiding contractions from the
    # stop-word list; map them to the ASCII apostrophe first.
    text = text.replace('\u2019', "'").replace('\u2018', "'")
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs (http\S+ already covers https)
    text = re.sub(r'@\w+|#\w+', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^\w\s,\'\.\?!]', '', text)  # Remove special characters (this also removes emojis)
    # Characters above U+FFFF that count as \w (e.g. stylised letters) survive
    # the previous filter, so they are removed explicitly here.
    text = re.sub(r'[\U00010000-\U0010FFFF]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse extra whitespace

    # Tokenize the text after cleaning
    tokens = word_tokenize(text)

    # Drop stop words (case-insensitive match against the cached set)
    stop_words = _english_stop_words()
    return [token for token in tokens if token.lower() not in stop_words]

combined_df['tokens'] = combined_df['text'].apply(clean_and_remove_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mstoe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
print(f"Entries: {combined_df.shape[0]}")

Entries: 10074


## Lemmatization

In [19]:
from nltk.stem import WordNetLemmatizer

# Download WordNet
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens: list) -> list:
    """Lemmatize each token with the notebook's WordNet lemmatizer.

    The result has the same length and order as ``tokens``.

    NOTE(review): no part-of-speech argument is passed to ``lemmatize``, so the
    lemmatizer's default is applied to every token — confirm that is intended.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

combined_df['lemmatized_tokens'] = combined_df['tokens'].apply(lemmatize_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mstoe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
