In [None]:
# 1. Importing necessary libraries
import nltk
from collections import Counter
from wordcloud import WordCloud
from textblob import TextBlob
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.preprocessing import clean_text, vectorize_text

# Download the NLTK resources this notebook asks for (a no-op when they are
# already present locally).
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# 2. Data Loading

# Read both CSV files into DataFrames
comments_path = '../data/pse_isr_reddit_comments.csv'
opinion_path = '../data/reddit_opinion_PSE_ISR.csv'
comments_data = pd.read_csv(comments_path)
opinion_data = pd.read_csv(opinion_path)

# Preview the first few rows of each dataset under its own heading
for heading, frame in (
    ("Comments Data:", comments_data),
    ("\nOpinion Data:", opinion_data),
):
    print(heading)
    print(frame.head())

# 3. Initial Exploration

# Column dtypes and non-null counts for both datasets.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line
# after each report.
print("Comments Data Info:")
comments_data.info()
print("Opinion Data Info:")
opinion_data.info()

# Summary statistics for both datasets
print("Comments Data Description:")
print(comments_data.describe())
print("Opinion Data Description:")
print(opinion_data.describe())

# 4. Preprocessing Integration

# Make every 'self_text' entry a string before cleaning. Missing values are
# replaced with '' first: a bare astype(str) would turn NaN into the literal
# text "nan", which would then be treated as a real word by everything that
# reads this column afterwards.
comments_data['self_text'] = comments_data['self_text'].fillna('').astype(str)
opinion_data['self_text'] = opinion_data['self_text'].fillna('').astype(str)

# Clean the text in both datasets
comments_data['cleaned_text'] = comments_data['self_text'].apply(clean_text)
opinion_data['cleaned_text'] = opinion_data['self_text'].apply(clean_text)

# Word frequencies over the cleaned comments corpus.
# NOTE(review): only comments_data contributes to these counts, yet they are
# also used to filter opinion_data below, and they are computed before the
# train/test split - confirm both are intended.
word_freq = Counter(" ".join(comments_data['cleaned_text']).split())

# Words seen fewer than this many times are dropped (adjust as needed)
min_occurrences = 5


def _drop_rare_words(text):
    """Return *text* keeping only words that occur at least min_occurrences times."""
    return " ".join(
        word for word in text.split() if word_freq[word] >= min_occurrences
    )


comments_data['cleaned_text'] = comments_data['cleaned_text'].apply(_drop_rare_words)
opinion_data['cleaned_text'] = opinion_data['cleaned_text'].apply(_drop_rare_words)

# 5. Split the data into training and testing sets

# Hold out 30% of each dataset; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.3, 'random_state': 42}
train_comments, test_comments = train_test_split(comments_data, **split_options)
train_opinions, test_opinions = train_test_split(opinion_data, **split_options)

# 6. Visualizations and Analysis (on training data)

# Histogram of post lengths (character counts of 'self_text') in the
# training comments
train_comments['post_length'] = train_comments['self_text'].map(len)
length_fig, length_ax = plt.subplots(figsize=(10, 6))
length_ax.hist(train_comments['post_length'], bins=50)
length_ax.set_title('Distribution of Post Lengths in Training Comments')
length_ax.set_xlabel('Length of Post')
length_ax.set_ylabel('Frequency')
plt.show()

# Word cloud built from all training comments joined into one string
comments_text = ' '.join(train_comments['self_text'])
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(comments_text)

cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('Word Cloud of Training Comments Text')
plt.show()


# Sentiment analysis: TextBlob polarity score for each training comment
def _polarity(text):
    """Return the TextBlob sentiment polarity of *text*."""
    return TextBlob(text).sentiment.polarity


train_comments['sentiment'] = train_comments['self_text'].apply(_polarity)

sentiment_fig, sentiment_ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_comments['sentiment'], bins=50, kde=True, ax=sentiment_ax)
sentiment_ax.set_title('Sentiment Polarity Distribution (Training Data)')
sentiment_ax.set_xlabel('Sentiment Polarity')
sentiment_ax.set_ylabel('Frequency')
plt.show()

# 7. Topic Modeling (on training data)

# Bag-of-words counts over the training comments with a reduced vocabulary:
# English stop words removed, terms kept only if they appear in at least 5
# documents and in at most 90% of them, capped at 5000 features.
count_vectorizer = CountVectorizer(max_df=0.9, min_df=5, max_features=5000, stop_words='english')
count_data = count_vectorizer.fit_transform(train_comments['self_text'])

# Fit a small 3-topic LDA model with the online learning method
lda = LatentDirichletAllocation(n_components=3, max_iter=10, random_state=42, learning_method='online')
lda.fit(count_data)

# Look the vocabulary up once instead of rebuilding it for every printed word
feature_names = count_vectorizer.get_feature_names_out()

# Display the top 10 words for each topic
for i, topic in enumerate(lda.components_):
    print(f"Top 10 words for topic #{i}:")
    # argsort() is ascending, so the last 10 indices are the heaviest words;
    # reverse them so the highest-weighted word is listed first.
    top_indices = topic.argsort()[-10:][::-1]
    print([feature_names[index] for index in top_indices])

# 8. Vectorization (on the train/test splits)

# Turn the cleaned text of every split into a feature matrix with
# vectorize_text (presumably TF-IDF - confirm against src.preprocessing).
# NOTE(review): each split is vectorized by a separate call; if vectorize_text
# fits a fresh vectorizer per call, the train and test matrices will not share
# a feature space - verify before using them together.
X_train_comments = vectorize_text(train_comments['cleaned_text'])
X_test_comments = vectorize_text(test_comments['cleaned_text'])
X_train_opinions = vectorize_text(train_opinions['cleaned_text'])
X_test_opinions = vectorize_text(test_opinions['cleaned_text'])

# Report the shape of each vectorized matrix
for message, matrix in (
    ("Shape of Vectorized Training Comments Data:", X_train_comments),
    ("Shape of Vectorized Testing Comments Data:", X_test_comments),
    ("Shape of Vectorized Training Opinion Data:", X_train_opinions),
    ("Shape of Vectorized Testing Opinion Data:", X_test_opinions),
):
    print(message, matrix.shape)

[nltk_data] Downloading package punkt to /Users/ayl-
[nltk_data]     ecoplant/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ayl-
[nltk_data]     ecoplant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ayl-
[nltk_data]     ecoplant/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
