In [None]:
# 1. Importing necessary libraries
import nltk
nltk.download('punkt')
from collections import Counter
from nltk.internals import Counter
from wordcloud import WordCloud
from textblob import TextBlob
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.preprocessing import clean_text, vectorize_text

# 2. Data Loading

# Read the two Reddit CSV exports; both paths are relative to the directory
# the notebook is executed from.
comments_data = pd.read_csv('../data/pse_isr_reddit_comments.csv')
opinion_data = pd.read_csv('../data/reddit_opinion_PSE_ISR.csv')

# Preview the first rows of each frame under its own heading.
for heading, frame in (
    ("Comments Data:", comments_data),
    ("\nOpinion Data:", opinion_data),
):
    print(heading)
    print(frame.head())

# 3. Initial Exploration

# Structural overview (columns, dtypes, non-null counts) of both datasets.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line.
print("Comments Data Info:")
comments_data.info()
print("Opinion Data Info:")
opinion_data.info()

# Display basic statistics for both datasets
print("Comments Data Description:")
print(comments_data.describe())
print("Opinion Data Description:")
print(opinion_data.describe())

# 4. Visualizations

## Example: Distribution of Post Lengths in the comments dataset
# Normalise 'self_text' in one pass: missing values become '' and every entry
# is cast to str, so len() is well-defined for each row below.
comments_data['self_text'] = comments_data['self_text'].fillna('').astype(str)

# Character count of every comment.
comments_data['post_length'] = comments_data['self_text'].map(len)

# Histogram of the comment lengths, drawn through the Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(comments_data['post_length'], bins=50)
ax.set_title('Distribution of Post Lengths in Comments')
ax.set_xlabel('Length of Post')
ax.set_ylabel('Frequency')
plt.show()

# Word Cloud Visualization for comments data
# Concatenate every comment into a single space-separated corpus string.
comments_text = ' '.join(comments_data['self_text'])

# Configure the cloud first, then build it from the corpus.
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(comments_text)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Comments Text')
plt.show()

# Sentiment Analysis for comments data
def _polarity(text):
    """Return TextBlob's sentiment polarity score for *text*."""
    return TextBlob(text).sentiment.polarity


# One polarity score per comment.
comments_data['sentiment'] = comments_data['self_text'].apply(_polarity)

# Histogram of the scores with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(comments_data['sentiment'], bins=50, kde=True, ax=ax)
ax.set_title('Sentiment Polarity Distribution')
ax.set_xlabel('Sentiment Polarity')
ax.set_ylabel('Frequency')
plt.show()

# Topic Modeling for comments data
# Bag-of-words counts over the comments. max_df=0.9 / min_df=2 drop terms that
# appear in more than 90% of the documents or in fewer than 2 documents, and
# English stop words are removed.
count_vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
count_data = count_vectorizer.fit_transform(comments_data['self_text'])

# Fit a 5-topic LDA model; random_state is fixed so reruns are reproducible.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(count_data)

# Resolve the vocabulary once. Calling get_feature_names_out() inside the
# comprehension rebuilt the whole feature-name array for every printed word.
feature_names = count_vectorizer.get_feature_names_out()

for i, topic in enumerate(lda.components_):
    print(f"Top 10 words for topic #{i}:")
    # argsort() is ascending, so the last 10 indices are the highest-weighted
    # terms, listed from the lowest to the highest weight among them.
    print([feature_names[index] for index in topic.argsort()[-10:]])

# 5. Preprocessing Integration
# `Counter` must be collections.Counter in this section. The import block at
# the top of the file imports nltk.internals.Counter under the same name right
# after collections.Counter, which shadows it with an auto-incrementing counter
# class that cannot be indexed by word (word_freq[word] raises TypeError).
# Re-importing here rebinds the name to the intended class.
from collections import Counter

# Cleaning the text in both datasets
comments_data['cleaned_text'] = comments_data['self_text'].apply(clean_text)
# Give the opinion text the same missing-value handling the comments text
# received earlier (NaN -> '', everything cast to str) so clean_text is never
# handed a float NaN.
# NOTE(review): assumes clean_text expects str input -- confirm against
# src.preprocessing.
opinion_data['cleaned_text'] = (
    opinion_data['self_text'].fillna('').astype(str).apply(clean_text)
)

# Get word frequency across the entire comments dataset
# NOTE(review): frequencies come from the comments corpus only, so any opinion
# word that is rare or absent in the comments is dropped below as well --
# confirm this shared-vocabulary behaviour is intended.
word_freq = Counter(" ".join(comments_data['cleaned_text']).split())

# Remove rare words that appear less than a threshold number of times
min_occurrences = 5  # You can adjust this threshold


def _drop_rare_words(text):
    """Return *text* keeping only words counted at least `min_occurrences` times."""
    return " ".join(
        word for word in text.split() if word_freq[word] >= min_occurrences
    )


comments_data['cleaned_text'] = comments_data['cleaned_text'].apply(_drop_rare_words)
opinion_data['cleaned_text'] = opinion_data['cleaned_text'].apply(_drop_rare_words)


# Vectorize the cleaned text using TF-IDF in both datasets
# NOTE(review): vectorize_text is called once per dataset; if it fits a new
# vectorizer on each call the two matrices will not share a feature space --
# verify in src.preprocessing before comparing them.
X_comments = vectorize_text(comments_data['cleaned_text'])
X_opinion = vectorize_text(opinion_data['cleaned_text'])

# Display the shape of the resulting vectorized text data
print("Shape of Vectorized Comments Data:", X_comments.shape)
print("Shape of Vectorized Opinion Data:", X_opinion.shape)


[nltk_data] Downloading package punkt to /Users/ayl-
[nltk_data]     ecoplant/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
