# Sentiment Analysis Project - Part II (Data Preprocessing)

## Load Data

In [1]:
import json
import pandas as pd

# Load the scraped posts. The file is decoded explicitly as UTF-8: the text
# contains non-ASCII characters (e.g. curly quotes), and the platform default
# encoding is not UTF-8 everywhere.
with open("./oop_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Flatten the post records into one row per post
posts_df = pd.json_normalize(data)

# Gather the comments of all posts into one flat list of records
comments_data = [comment for post in data for comment in post['comments']]

comments_df = pd.DataFrame(comments_data)

# Display DataFrames
posts_df.head()

Unnamed: 0,post_id,title,text,score,created_time,author,author_karma,url,num_comments,subreddit,comments
0,1i98h6m,"Without question, Chloe was KGB! It was not su...","Chloe was “in bed”, pun intended, with the KGB...",43,2025-01-24 22:54:08,Reasonable-Buy9281,4393.0,https://www.reddit.com/r/queensgambit/comments...,10,queensgambit,"[{'post_id': '1i98h6m', 'comment_id': 'm909dm2..."
1,1i91w7h,No russian villanization,I really love how this show didn't villanize r...,66,2025-01-24 18:11:37,Emergency-Tie327,0.0,https://www.reddit.com/r/queensgambit/comments...,1,queensgambit,"[{'post_id': '1i91w7h', 'comment_id': 'm9ctg8j..."
2,1i6uc7f,Matt and Mike’s acting in the Queens Gambit,Did anyone else think that the acting of the a...,0,2025-01-21 21:48:40,moviequests,2.0,https://www.reddit.com/r/queensgambit/comments...,8,queensgambit,"[{'post_id': '1i6uc7f', 'comment_id': 'm8fe51z..."
3,1i6lye5,Why did Mrs. Deardorff say “you should be in t...,Why did Mrs. Deardorff say “you should be in t...,42,2025-01-21 16:04:34,moviequests,2.0,https://www.reddit.com/r/queensgambit/comments...,13,queensgambit,"[{'post_id': '1i6lye5', 'comment_id': 'm8d9wxl..."
4,1i45hpc,What makes Beth a loveable person?,"She makes friends, well wishers everywhere she...",32,2025-01-18 11:18:07,purelibran,8952.0,https://www.reddit.com/r/queensgambit/comments...,6,queensgambit,"[{'post_id': '1i45hpc', 'comment_id': 'm7tg8u6..."


In [2]:
# Preview the first five comment rows
comments_df.head(n=5)

Unnamed: 0,post_id,comment_id,text,score,created_time,author,author_karma,subreddit
0,1i98h6m,m909dm2,I really don't think Alma was poisoned by the ...,81,2025-01-25 00:03:37,ungainlygay,29417.0,queensgambit
1,1i98h6m,m940hnv,I thought her name was Cleo ?,11,2025-01-25 16:26:11,Smthcool1,102.0,queensgambit
2,1i98h6m,m90a0pp,Interesting take. Alma would have been suscept...,7,2025-01-25 00:07:07,Fuertebrazos,2861.0,queensgambit
3,1i98h6m,m8zz2do,Your Alma theory is interesting. And sad. Mayb...,6,2025-01-24 23:08:41,YellowDaisySpider,2677.0,queensgambit
4,1i98h6m,m91ewjy,That’s a pretty neat theory,2,2025-01-25 04:08:15,SirZacharia,50261.0,queensgambit


In [3]:
# Report how many rows each frame holds
print(f"Entries in posts_df: {len(posts_df)}")
print(f"Entries in comments_df: {len(comments_df)}")

Entries in posts_df: 525
Entries in comments_df: 6656


## Combine Posts and Comments DataFrames

In [4]:
# Show the column names of both frames side by side before combining them
print(list(posts_df.columns))
print(list(comments_df.columns))

['post_id', 'title', 'text', 'score', 'created_time', 'author', 'author_karma', 'url', 'num_comments', 'subreddit', 'comments']
['post_id', 'comment_id', 'text', 'score', 'created_time', 'author', 'author_karma', 'subreddit']


In [5]:
# Remove the nested 'comments' column from posts_df.
# The frame is rebound instead of mutated in place, and errors='ignore' lets
# this cell be re-run without a KeyError once the column is already gone.
posts_df = posts_df.drop(columns='comments', errors='ignore')

In [6]:
# Stack posts and comments row-wise into a single frame with a fresh 0..n-1 index
combined_df = pd.concat(objs=[posts_df, comments_df], axis=0, ignore_index=True)
combined_df.columns

Index(['post_id', 'title', 'text', 'score', 'created_time', 'author',
       'author_karma', 'url', 'num_comments', 'subreddit', 'comment_id'],
      dtype='object')

In [7]:
# Reorder columns
column_order = [
    'post_id',
    'comment_id',
    'title',
    'text',
    'created_time',
    'subreddit',
    'score',
    'num_comments',
    'author',
    'author_karma',
    'url',
]
combined_df = combined_df[column_order]

In [8]:
# Row count after combining posts and comments
print(f"Entries: {len(combined_df)}")

Entries: 7181


## Remove Empty Entries

In [9]:
# Keep only rows whose 'text' is present and not blank after trimming whitespace
stripped_text = combined_df['text'].str.strip()
combined_df = combined_df[stripped_text.notna() & stripped_text.ne('')]

In [10]:
# Row count after dropping empty texts
print(f"Entries: {len(combined_df)}")

Entries: 7180


## Remove Entries That Are Too Short

In [11]:
def clean_data(df: pd.DataFrame, column: str, word_threshold: int = 12) -> pd.DataFrame:
    """Drop rows whose text in `column` is too short.

    Args:
        df: DataFrame containing the text to filter.
        column: Name of the string column whose word count is measured.
        word_threshold: A row is kept only if its text has strictly more than
            this many whitespace-separated words. Defaults to 12, the value
            that was previously hard-coded.

    Returns:
        The filtered DataFrame; the index labels of the kept rows are unchanged.
    """
    # Whitespace split, then count the resulting words per row
    word_counts = df[column].str.split().str.len()
    return df[word_counts > word_threshold]

# Apply the short-text filter to the 'text' column
combined_df = combined_df.pipe(clean_data, 'text')

In [12]:
# Row count after dropping short texts
print(f"Entries: {len(combined_df)}")

Entries: 4923


## Remove Duplicates

In [13]:
# Show rows whose 'text' repeats an earlier row; an empty frame means no duplicates
duplicate_mask = combined_df.duplicated(subset='text')
combined_df[duplicate_mask]

Unnamed: 0,post_id,comment_id,title,text,created_time,subreddit,score,num_comments,author,author_karma,url


In [48]:
# Remove duplicates in the 'text' column
# NOTE(review): the removal below is commented out, so this cell does nothing.
# Presumably disabled because the duplicate check above returned no rows -- confirm.
#combined_df.drop_duplicates(subset='text', inplace=True)

In [None]:
# Row count after the duplicate check
print(f"Entries: {len(combined_df)}")

## Preprocess Text Data

In [None]:
# Remove unnecessary characters from the 'text' column, tokenize the text, and remove stopwords
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
# word_tokenize relies on the Punkt tokenizer models; without them it raises
# LookupError on a fresh environment. Older NLTK releases look for 'punkt',
# newer ones for 'punkt_tab', so both are fetched.
nltk.download('punkt')
nltk.download('punkt_tab')

# English stop-word set, filled on first use so it is built once instead of
# once per processed text.
_ENGLISH_STOP_WORDS = None


def _english_stop_words() -> set:
    """Return the English stop-word set, building it from NLTK on the first call."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = set(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_and_remove_stopwords(text: str) -> list:
    """Clean a raw text and return its tokens with English stop words removed.

    Args:
        text: The raw text to process.

    Returns:
        The tokens produced by `word_tokenize` on the cleaned text, excluding
        every token whose lowercase form is an English stop word. Tokens keep
        their original casing.
    """
    # Map curly apostrophes to the ASCII apostrophe first. Otherwise the
    # special-character filter below deletes them ("That’s" -> "Thats"),
    # although it is written to keep apostrophes.
    text = re.sub(r'[\u2018\u2019]', "'", text)

    # Clean the text (remove URLs, mentions, special characters, etc.)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+|#\w+', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^\w\s,\'\.\?!]', '', text)  # Remove special characters
    text = re.sub(r'[\U00010000-\U0010FFFF]', '', text)  # Remove emojis (Unicode range)
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    # Tokenize the text after cleaning
    tokens = word_tokenize(text)

    # Remove stop words; the comparison is case-insensitive
    stop_words = _english_stop_words()
    return [word for word in tokens if word.lower() not in stop_words]

# Tokenize every text and store the resulting token list in a new 'tokens' column
combined_df['tokens'] = combined_df['text'].map(clean_and_remove_stopwords)


In [None]:
# Row count after tokenization
print(f"Entries: {len(combined_df)}")

## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# Download the WordNet data through NLTK's downloader
nltk.download('wordnet')

# Single module-level lemmatizer instance; lemmatize_tokens below uses it for every token
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens: list) -> list:
    """Lemmatize each token with the module-level `lemmatizer`.

    Args:
        tokens: The tokens to lemmatize.

    Returns:
        A new list with one lemmatized entry per input token, in the same order.
    """
    return list(map(lemmatizer.lemmatize, tokens))

# Lemmatize each token list and store the result in a new 'lemmatized_tokens' column
combined_df['lemmatized_tokens'] = combined_df['tokens'].map(lemmatize_tokens)