# Laptop Subreddit Scraper

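This notebook performs exploratory data analysis and visualization on comments scraped from the laptop subreddit between 2022 and 2024. It loads the processed comments, plots the distributions of comment length and sentiment polarity, and saves the dataset with those two derived columns added.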

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the processed comments CSV (path is relative to the notebook's working directory)
data_path = '../data/processed/laptop_comments.csv'
comments_df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows as the cell output
comments_df.head(n=5)

In [None]:
# Visualize the distribution of comment lengths
# read_csv loads empty comment cells as NaN (a float), and len(NaN) raises
# TypeError. na_action='ignore' skips those rows, leaving their length as NaN
# instead of crashing; seaborn drops NaN values when plotting.
comments_df['comment_length'] = comments_df['comment'].map(len, na_action='ignore')

# Explicit figure/axes so the plot does not depend on pyplot's implicit state.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(comments_df['comment_length'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Comment Lengths')
ax.set_xlabel('Comment Length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Analyze the sentiment of comments
from textblob import TextBlob

# TextBlob only accepts strings and raises TypeError on NaN, which is what
# read_csv produces for empty comment cells. na_action='ignore' leaves those
# rows as NaN rather than inventing a neutral score for a missing comment.
comments_df['sentiment'] = comments_df['comment'].map(
    lambda text: TextBlob(text).sentiment.polarity,
    na_action='ignore',
)

# Explicit figure/axes so the plot does not depend on pyplot's implicit state.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(comments_df['sentiment'], bins=30, kde=True, ax=ax)
ax.set_title('Sentiment Analysis of Comments')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Persist the dataframe (including any columns added above) as CSV, without the index column
cleaned_data_path = '../data/processed/cleaned_laptop_comments.csv'
comments_df.to_csv(path_or_buf=cleaned_data_path, index=False)

# Report where the file was written
print('Cleaned data saved to:', cleaned_data_path)