In [None]:
import json
import re
import os
from bs4 import BeautifulSoup
import string
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus so that stopwords.words() below can read it.
nltk.download('stopwords')
# English stopwords kept as a set; in this cell they are only referenced by the
# optional stopword filtering in preprocess_text.
stop_words = set(stopwords.words('english'))

# Step 1: Load the JSON data from a file
def load_post_data(file_path):
    """Read one post from a JSON file and return the decoded object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # breaks non-ASCII posts on systems whose locale is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Step 2: Extract relevant fields (title and content)
def extract_post_content(post_data):
    """Pull the title and body text out of one decoded post.

    Args:
        post_data: Mapping for a single post; it is read with ``.get`` using
            the keys ``'title'`` and ``'content'``.

    Returns:
        A ``(title, content)`` tuple. A missing key or an explicit ``None``
        value (JSON ``null``) yields ``""`` for that field, so the result can
        be handed straight to text processing.
    """
    title = post_data.get('title')
    content = post_data.get('content')
    # Only None is replaced; any other stored value is returned untouched.
    return ("" if title is None else title,
            "" if content is None else content)

# Step 3: Preprocess the text
def preprocess_text(text, remove_stopwords=False):
    """Strip HTML, lowercase, and remove punctuation from ``text``.

    Args:
        text: Raw text, possibly containing HTML markup. ``None`` is treated
            as empty text.
        remove_stopwords: When true, additionally drop every whitespace-
            separated word found in the module-level ``stop_words`` set and
            re-join the remaining words with single spaces. Defaults to
            ``False``.

    Returns:
        The cleaned text as a single string.
    """
    if text is None:
        return ""

    # Remove HTML tags. The separator keeps text from adjacent elements apart;
    # without it "<p>one</p><p>two</p>" collapses into the single token "onetwo".
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation (characters are deleted, not replaced)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Optional stopword removal
    if remove_stopwords:
        text = " ".join(word for word in text.split() if word not in stop_words)

    return text

# Step 4: Process all JSON files in the directory and save the output to a new JSON file
def process_all_posts(posts_dir, output_file):
    """Preprocess every ``*.json`` post in a directory and write one JSON file.

    Each post file is loaded with ``load_post_data``, its title and content
    are extracted with ``extract_post_content`` and cleaned with
    ``preprocess_text``. The results are written to ``output_file`` as a JSON
    list of objects with the keys ``'Post ID'`` (the file name), ``'Title'``
    and ``'Content'``.

    Args:
        posts_dir: Directory that is scanned (non-recursively) for post files.
        output_file: Path of the JSON file to write.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be
            opened.
        json.JSONDecodeError: If a post file does not contain valid JSON.
    """
    all_posts = []  # List to store all preprocessed posts
    output_path = os.path.abspath(output_file)

    # os.listdir returns entries in arbitrary order; sorting makes the output
    # reproducible from run to run.
    for file_name in sorted(os.listdir(posts_dir)):
        if not file_name.endswith(".json"):
            continue

        file_path = os.path.join(posts_dir, file_name)

        # Skip directories that merely look like posts, and skip the output
        # file itself in case it lives inside posts_dir from an earlier run.
        if not os.path.isfile(file_path):
            continue
        if os.path.abspath(file_path) == output_path:
            continue

        # Load and preprocess the post data
        post_data = load_post_data(file_path)
        title, content = extract_post_content(post_data)

        all_posts.append({
            'Post ID': file_name,
            'Title': preprocess_text(title),
            'Content': preprocess_text(content),
        })

    # Save all the preprocessed posts into a JSON file
    with open(output_file, 'w', encoding='utf-8') as output:
        json.dump(all_posts, output, indent=4)

    print(f"Preprocessing complete. The results are saved in {output_file}")

# Example usage: preprocess every post file and write one combined JSON file.
posts_dir = 'posts/'  # Directory that is scanned for the *.json post files
output_file = 'preprocessed_posts.json'  # Destination of the combined, preprocessed posts

# Process all posts and save to a JSON file
process_all_posts(posts_dir, output_file)

In [None]:
!pip install wordcloud matplotlib

In [None]:
import json
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Download the NLTK 'stopwords' corpus so that stopwords.words() below can read it.
nltk.download('stopwords')
# English stopwords kept as a set; count_word_frequencies tests each token against it.
stop_words = set(stopwords.words('english'))

# Step 1: Load the JSON file with preprocessed posts
def load_preprocessed_json(file_path):
    """Load the preprocessed posts from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of depending on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Step 2: Extract text from the preprocessed posts
def extract_text_from_posts(posts):
    """Concatenate the title and content of every post into one string.

    Args:
        posts: Iterable of mappings read with ``.get`` using the keys
            ``"Title"`` and ``"Content"``; a missing key counts as ``""``.

    Returns:
        A single string in which each post contributes its title, a space,
        its content and a trailing space.
    """
    per_post = [
        entry.get("Title", "") + " " + entry.get("Content", "") + " "
        for entry in posts
    ]
    return "".join(per_post)

# Step 3: Tokenize, remove stopwords, and count word frequencies
def count_word_frequencies(text):
    """Count how often each non-stopword token occurs in ``text``.

    The text is lowercased and split into ``\\w+`` tokens; tokens present in
    the module-level ``stop_words`` set are ignored.

    Args:
        text: The text to analyse.

    Returns:
        A ``collections.Counter`` mapping each remaining token to its count.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    return Counter(token for token in tokens if token not in stop_words)

# Step 4: Build a word cloud based on the top 100 word frequencies
def build_word_cloud_from_top_words(word_frequencies, top_n=100, output_file='wordcloud.png'):
    """Render, save and display a word cloud of the most frequent words.

    Args:
        word_frequencies: Object providing ``most_common(n)`` (for example a
            ``collections.Counter``) that maps words to their counts.
        top_n: Number of most frequent words to include.
        output_file: Path the word-cloud image is written to.
    """
    # Keep only the top_n (word, count) pairs.
    top_words = {word: count for word, count in word_frequencies.most_common(top_n)}

    # Build the cloud from the frequencies and write it to disk.
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud = cloud.generate_from_frequencies(top_words)
    cloud.to_file(output_file)

    # Show the image with matplotlib, without axes.
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()


# Example usage
# Path to the preprocessed JSON file. This is the file the preprocessing cell
# writes through process_all_posts; an empty path would make open() fail.
preprocessed_json_file = 'preprocessed_posts.json'

# Load the preprocessed posts
preprocessed_posts = load_preprocessed_json(preprocessed_json_file)

# Extract all text from titles and content
combined_text = extract_text_from_posts(preprocessed_posts)

# Count the word frequencies (after removing stopwords)
word_frequencies = count_word_frequencies(combined_text)

# Build and display the word cloud based on the top 100 word frequencies
build_word_cloud_from_top_words(word_frequencies, top_n=100, output_file='wordcloud.png')

In [None]:
import json
import re
from collections import Counter

# Load the preprocessed posts (the JSON file written by process_all_posts).
# An empty path would make open() fail, and the encoding is stated explicitly
# instead of depending on the platform default.
with open("preprocessed_posts.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Combine all titles and contents into one lowercase string
all_text = []
for post in data:
    all_text.append(post.get("Title", ""))
    all_text.append(post.get("Content", ""))

text = " ".join(all_text).lower()

# Tokenize: keep tokens made of a-z and 0-9 only
words = re.findall(r'\b[a-z0-9]+\b', text)

# (Optional) Remove very common English stopwords.
# Named custom_stopwords so that it does not rebind `stopwords`, the name the
# other cells import from nltk.corpus.
custom_stopwords = {
    'the', 'and', 'for', 'you', 'with', 'that', 'this', 'are', 'was', 'not', 'but', 'have', 'can', 'all',
    'your', 'use', 'has', 'from', 'get', 'any', 'will', 'just', 'what', 'how', 'out', 'they', 'one', 'had',
    'when', 'then', 'which', 'who', 'where', 'why', 'about', 'would', 'should', 'could', 'does',
    'did', 'been', 'also', 'more', 'now', 'into', 'some', 'their', 'other', 'than', 'our', 'over', 'such',
    'each', 'new', 'per', 'may', 'doing', 'done', 'like', 'if', 'as', 'so', 'on', 'in', 'to', 'of',
    'is', 'it', 'at', 'by', 'an', 'be', 'or', 'a', 'i', 'we', 'my', 'me', 'he', 'she', 'him', 'her', 'them',
    'his', 'hers', 'its', 'were', 'because', 'very', 'there', 'too', 'no', 'yes', 'up', 'down',
}

filtered_words = [w for w in words if w not in custom_stopwords]

# Count
counter = Counter(filtered_words)
top_300 = counter.most_common(300)

# Print top 300 as tab-separated "word<TAB>count" lines
for word, count in top_300:
    print(f"{word}\t{count}")


In [None]:
import json
import re
import nltk
import gensim
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import Phrases
from gensim.models.ldamodel import LdaModel
from gensim.models.phrases import Phraser

# Download the NLTK 'stopwords' corpus so that stopwords.words() below can read it.
nltk.download('stopwords')
# English stopwords kept as a set; preprocess_text in this cell drops tokens found in it.
stop_words = set(stopwords.words('english'))

# Step 1: Load the JSON file with preprocessed posts
def load_preprocessed_json(file_path):
    """Load the preprocessed posts from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of depending on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Step 2: Preprocess the text, including tokenization and stopword removal
def preprocess_text(text):
    """Tokenize ``text`` into lowercase words with stopwords removed.

    Note: this reuses the name of the HTML-cleaning ``preprocess_text``
    defined in the first cell, but returns a list of tokens rather than a
    string.

    Args:
        text: The text to tokenize.

    Returns:
        The ``\\w+`` tokens of the lowercased text, in order, excluding those
        found in the module-level ``stop_words`` set.
    """
    kept_tokens = []
    for token in re.findall(r'\b\w+\b', text.lower()):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Step 3: Create unigrams and bigrams
def generate_ngrams(texts, min_count=5, threshold=100):
    """Train a gensim bigram model on ``texts`` and apply it to each document.

    Args:
        texts: Iterable of tokenized documents (each a sequence of tokens).
        min_count: Passed to ``Phrases`` as ``min_count``. Defaults to 5.
        threshold: Passed to ``Phrases`` as ``threshold``. Defaults to 100.

    Returns:
        A list with one entry per input document: the result of applying the
        trained bigram model to that document.
    """
    # The documents are traversed twice (training, then application), so a
    # one-shot iterator must be materialised first or the second pass is empty.
    texts = list(texts)

    # Create bigrams using gensim's Phrases
    bigram = Phrases(texts, min_count=min_count, threshold=threshold)
    bigram_mod = Phraser(bigram)

    # Apply the bigram model to the tokenized texts
    return [bigram_mod[text] for text in texts]

# Step 4: Prepare LDA inputs (dictionary and corpus)
def prepare_corpus(texts, no_below=5, no_above=0.5):
    """Build the gensim dictionary and bag-of-words corpus used for LDA.

    Args:
        texts: Iterable of tokenized documents (each a sequence of tokens).
        no_below: Passed to ``Dictionary.filter_extremes`` as ``no_below``
            (filters rare tokens). Defaults to 5.
        no_above: Passed to ``Dictionary.filter_extremes`` as ``no_above``
            (filters very common tokens). Defaults to 0.5.

    Returns:
        ``(dictionary, corpus)`` where ``corpus`` holds one
        ``dictionary.doc2bow`` result per document.
    """
    # The documents are traversed twice (dictionary, then corpus), so a
    # one-shot iterator must be materialised first or the corpus comes out empty.
    texts = list(texts)

    # Create a dictionary representation of the documents
    dictionary = corpora.Dictionary(texts)

    # Filter out rare and common tokens
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    # Create the Bag-of-Words (BoW) corpus
    corpus = [dictionary.doc2bow(text) for text in texts]

    return dictionary, corpus

# Step 5: Build the LDA model
def build_lda_model(corpus, dictionary, num_topics=5):
    """Train a gensim ``LdaModel`` on the given corpus.

    Args:
        corpus: Bag-of-words corpus handed to ``LdaModel`` as ``corpus``.
        dictionary: Dictionary handed to ``LdaModel`` as ``id2word``.
        num_topics: Number of topics to fit.

    Returns:
        The trained ``LdaModel``.
    """
    # All remaining settings are fixed; random_state is pinned to 42.
    lda_settings = {
        'corpus': corpus,
        'id2word': dictionary,
        'num_topics': num_topics,
        'random_state': 42,
        'update_every': 1,
        'passes': 10,
        'alpha': 'auto',
        'per_word_topics': True,
    }
    return LdaModel(**lda_settings)

# Example usage
# Path to the preprocessed JSON file. This is the file the preprocessing cell
# writes through process_all_posts; an empty path would make open() fail.
preprocessed_json_file = 'preprocessed_posts.json'

# Step 1: Load the preprocessed posts
preprocessed_posts = load_preprocessed_json(preprocessed_json_file)

# Step 2: Extract all content (titles + content)
all_texts = [post.get('Title', '') + " " + post.get('Content', '') for post in preprocessed_posts]

# Step 3: Preprocess the text data
tokenized_texts = [preprocess_text(text) for text in all_texts]

# Step 4: Generate unigrams and bigrams
bigram_texts = generate_ngrams(tokenized_texts)

# Step 5: Prepare the corpus for LDA
dictionary, corpus = prepare_corpus(bigram_texts)

# Step 6: Build the LDA model
lda_model = build_lda_model(corpus, dictionary, num_topics=20)

# Step 7: Display the topics discovered by the LDA model
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")