In [None]:
import pandas as pd
import re
from unidecode import unidecode
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from gensim import corpora
from gensim.models import LdaModel
from nltk.util import ngrams
from collections import Counter
import seaborn as sns
import numpy as np 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from gensim.models import TfidfModel

In [None]:
# Load the dataset from FIRE_cleaned_data.json, read as line-delimited JSON (lines=True).
# Later cells read the `lemmatized_tokens`, `date`, `post_title` and `comment` columns.
df = pd.read_json("FIRE_cleaned_data.json", lines=True)

# Sentiment Analysis

In [None]:
# TextBlob
def calculate_sentiment(tokens):
    """Score one tokenized comment with TextBlob.

    Args:
        tokens: Iterable of string tokens; they are joined with single spaces
            into one text before scoring.

    Returns:
        Tuple ``(polarity, subjectivity, sentiment)``. ``sentiment`` is
        ``"positive"`` when polarity > 0, ``"negative"`` when polarity < 0 and
        ``"neutral"`` otherwise.
    """
    scores = TextBlob(' '.join(tokens)).sentiment
    polarity, subjectivity = scores.polarity, scores.subjectivity

    # The sign of the polarity alone decides the label.
    if polarity > 0:
        return polarity, subjectivity, "positive"
    if polarity < 0:
        return polarity, subjectivity, "negative"
    return polarity, subjectivity, "neutral"

# Score every comment and expand the (polarity, subjectivity, label) tuples into
# three columns. The frame is built once from the list of tuples with an explicit
# column layout, instead of constructing one pd.Series per row via
# `.apply(pd.Series)`, which is slow and infers the layout from the rows.
textblob_columns = ['textblob_polarity', 'textblob_subjectivity', 'textblob_sentiment']
df[textblob_columns] = pd.DataFrame(
    df['lemmatized_tokens'].map(calculate_sentiment).tolist(),
    index=df.index,  # keep row alignment with df
    columns=textblob_columns,
)


In [None]:
# VADER
# Single analyzer instance; get_sentiment_scores() reads it as a module-level global.
analyzer = SentimentIntensityAnalyzer()

# VADER scoring helper
def get_sentiment_scores(tokens):
    """Score one tokenized comment with the module-level VADER ``analyzer``.

    Args:
        tokens: Iterable of string tokens; they are joined with single spaces
            into one text before scoring.

    Returns:
        Tuple ``(compound_score, sentiment_label)``. The label is ``'Positive'``
        for a compound score >= 0.05, ``'Negative'`` for <= -0.05 and
        ``'Neutral'`` in between.
    """
    compound_score = analyzer.polarity_scores(' '.join(tokens))['compound']

    # Both thresholds are inclusive.
    if compound_score >= 0.05:
        return compound_score, 'Positive'
    if compound_score <= -0.05:
        return compound_score, 'Negative'
    return compound_score, 'Neutral'

# Score every comment and expand the (compound_score, label) tuples into two
# columns. The frame is built once from the list of tuples with an explicit
# column layout, instead of constructing one pd.Series per row via
# `.apply(pd.Series)`, which is slow and infers the layout from the rows.
vader_columns = ['vader_compound_score', 'vader_sentiment']
df[vader_columns] = pd.DataFrame(
    df['lemmatized_tokens'].map(get_sentiment_scores).tolist(),
    index=df.index,  # keep row alignment with df
    columns=vader_columns,
)


In [None]:
# Number of comments per sentiment label, for each analyzer
sentiment_counts = df.groupby('textblob_sentiment').size()
vader_sentiment_counts = df.groupby('vader_sentiment').size()

# TextBlob labels are lowercase ('positive') while VADER labels are capitalized
# ('Positive'). Plotting the two series separately therefore puts them on six
# unrelated categories (and identical labels would simply overdraw each other).
# Normalize the label case and align both series on a shared index so each
# sentiment gets one pair of side-by-side bars.
comparison = pd.DataFrame({
    'TextBlob Sentiment': sentiment_counts.rename(index=str.capitalize),
    'Vader Sentiment': vader_sentiment_counts.rename(index=str.capitalize),
}).fillna(0).astype(int)  # a label missing for one analyzer counts as 0

# Grouped horizontal bars: TextBlob vs VADER frequency per sentiment
ax = comparison.plot(kind='barh', figsize=(8, 5), color=['grey', 'olive'])
ax.set_xlabel('Count')
ax.set_ylabel('Sentiment')
ax.set_title('TextBlob Sentiment vs Vader Sentiment Frequencies')
ax.grid(True)
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Pool the tokens of all comments that share a TextBlob sentiment label
grouped = df.groupby('textblob_sentiment')['lemmatized_tokens'].apply(
    lambda token_lists: [token for tokens in token_lists for token in tokens]
)

# Word -> frequency table for each sentiment label
word_counts = {sentiment: Counter(tokens) for sentiment, tokens in grouped.items()}

# One panel per sentiment label (calculate_sentiment emits three labels)
fig, axes = plt.subplots(1, 3, figsize=(10, 10))

# Hide frame and ticks of every panel up front, so a panel that receives no
# word cloud (label absent from the data) stays blank instead of showing an
# empty plot.
for ax in axes:
    ax.axis('off')

# Draw one word cloud per sentiment label, sized by word frequency
for ax, (sentiment, counts) in zip(axes, word_counts.items()):
    ax.set_title(f'{sentiment.capitalize()} Sentiment')
    if not counts:
        # WordCloud raises on an empty frequency table; leave the panel blank.
        continue
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(counts)
    ax.imshow(wordcloud, interpolation='bilinear')

# Adjust layout spacing
plt.tight_layout()
plt.show()

In [None]:
# Pool the tokens of all comments that share a VADER sentiment label
grouped = df.groupby('vader_sentiment')['lemmatized_tokens'].apply(
    lambda token_lists: [token for tokens in token_lists for token in tokens]
)

# Word -> frequency table for each sentiment label
word_counts = {sentiment: Counter(tokens) for sentiment, tokens in grouped.items()}

# One panel per sentiment label (get_sentiment_scores emits three labels)
fig, axes = plt.subplots(1, 3, figsize=(10, 10))

# Hide frame and ticks of every panel up front, so a panel that receives no
# word cloud (label absent from the data) stays blank instead of showing an
# empty plot.
for ax in axes:
    ax.axis('off')

# Draw one word cloud per sentiment label, sized by word frequency
for ax, (sentiment, counts) in zip(axes, word_counts.items()):
    ax.set_title(f'{sentiment.capitalize()} Sentiment')
    if not counts:
        # WordCloud raises on an empty frequency table; leave the panel blank.
        continue
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(counts)
    ax.imshow(wordcloud, interpolation='bilinear')

# Adjust layout spacing
plt.tight_layout()
plt.show()

In [None]:
# Comments per (date, VADER label), pivoted so that each label becomes a column;
# date/label combinations without any comment are filled with 0.
counts_by_date_and_label = df.groupby(['date', 'vader_sentiment']).size()
grouped = counts_by_date_and_label.unstack(fill_value=0)

# One cluster of bars per date, one bar per sentiment label
ax = grouped.plot.bar(figsize=(10, 6))
ax.set_xlabel('Date')
ax.set_ylabel('Count')
ax.set_title('Sentiment Counts by Date')
ax.legend(title='Sentiment')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Rank all comments by their VADER compound score (highest first)
score_column = 'vader_compound_score'
shown_columns = ['post_title', 'comment', score_column]
df_sorted = df.sort_values(by=score_column, ascending=False)

# The five highest- and five lowest-scoring comments, with title, text and score
top_comments = df_sorted.nlargest(5, score_column)[shown_columns]
bottom_comments = df_sorted.nsmallest(5, score_column)[shown_columns]

# Print both tables, each under its own heading
print("Top 5 Comments with Highest Compound Scores:", top_comments, sep="\n")
print("\nBottom 5 Comments with Lowest Compound Scores:", bottom_comments, sep="\n")

# Topic modelling

Bag-of-words (BOW) method

In [None]:
# Vocabulary: maps every distinct lemmatized token to an integer id
dictionary_bow = corpora.Dictionary(df['lemmatized_tokens'])

# Create a corpus (bag of words representation) from the unigrams
corpus = [dictionary_bow.doc2bow(unigram) for unigram in df['lemmatized_tokens']]

# Build the LDA model with 4 topics and 15 passes over the corpus.
# LDA training is randomized; pinning random_state keeps topic ids and their
# top terms stable across "Restart & Run All", which the per-topic keyword
# analysis further down relies on.
lda_model_bow = LdaModel(corpus, num_topics=4, id2word=dictionary_bow, passes=15, random_state=42)

In [None]:
# Fetch the 50 highest-weighted terms of each BOW topic as (topic_id, terms string)
# pairs and echo them one per line; `topics_bow` is parsed again further down.
topics_bow = lda_model_bow.print_topics(num_words=50)
for topic_summary in topics_bow:
    print(topic_summary)

In [None]:
# Top terms of every topic, parsed from the quoted words in the printed topic strings
topics__bow_list = [re.findall(r'"([^"]+)"', terms_line) for _, terms_line in topics_bow]

# Create one word cloud per topic.
# The cloud is built from the model's own (term, probability) pairs rather than
# from the space-joined term list: in the joined text every term occurs exactly
# once, so word size could not reflect its weight in the topic, and
# WordCloud.generate() would also run its own text processing over the terms.
for (topic_id, _), terms in zip(topics_bow, topics__bow_list):
    # Same number of terms as were printed for this topic
    term_weights = dict(lda_model_bow.show_topic(topic_id, topn=len(terms)))

    # Word size is proportional to the term's probability within the topic
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(term_weights)

    # Display the WordCloud
    plt.figure(figsize=(5, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f"Topic {topic_id} Word Cloud")
    plt.axis("off")
    plt.show()

TF-IDF method

In [None]:
# Vocabulary: maps every distinct lemmatized token to an integer id
dictionary_tf = corpora.Dictionary(df['lemmatized_tokens'])

# Create a corpus (bag of words representation) from the unigrams
corpus_bow = [dictionary_tf.doc2bow(unigram) for unigram in df['lemmatized_tokens']]

# Build the TF-IDF model from the bag-of-words corpus
tfidf_model = TfidfModel(corpus_bow)

# Apply TF-IDF transformation to the bag of words corpus
corpus_tfidf = [tfidf_model[doc] for doc in corpus_bow]

# Build the LDA model using the TF-IDF corpus (4 topics, 15 passes).
# LDA training is randomized; pinning random_state keeps topic ids and their
# top terms stable across "Restart & Run All".
# NOTE(review): LDA is usually fitted on raw term counts; training it on TF-IDF
# weights is a heuristic — confirm this is intended.
lda_model_tf = LdaModel(corpus_tfidf, num_topics=4, id2word=dictionary_tf, passes=15, random_state=42)

In [None]:
# Fetch the 50 highest-weighted terms of each TF-IDF topic as (topic_id, terms string)
# pairs and echo them one per line; `topics_tf` is parsed again further down.
topics_tf = lda_model_tf.print_topics(num_words=50)
for topic_summary in topics_tf:
    print(topic_summary)

In [None]:
# Top terms of every topic, parsed from the quoted words in the printed topic strings
topics_tf_list = [re.findall(r'"([^"]+)"', terms_line) for _, terms_line in topics_tf]

# Create one word cloud per topic.
# The cloud is built from the model's own (term, probability) pairs rather than
# from the space-joined term list: in the joined text every term occurs exactly
# once, so word size could not reflect its weight in the topic, and
# WordCloud.generate() would also run its own text processing over the terms.
for (topic_id, _), terms in zip(topics_tf, topics_tf_list):
    # Same number of terms as were printed for this topic
    term_weights = dict(lda_model_tf.show_topic(topic_id, topn=len(terms)))

    # Word size is proportional to the term's probability within the topic
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(term_weights)

    # Display the WordCloud
    plt.figure(figsize=(5, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f"Topic {topic_id} Word Cloud")
    plt.axis("off")
    plt.show()

Analysis of the LDA model trained on the bag-of-words (BOW) corpus

Topic-0

In [None]:
# Keywords that stand in for Topic-0.
# NOTE(review): presumably picked from the Topic-0 terms of one LDA run — re-check after retraining.
desired_words = ["comment", "please", "need", "contact", "moderators"]

# A comment matches when at least one keyword occurs among its lemmatized tokens
matches_topic = df['lemmatized_tokens'].apply(
    lambda tokens: any(keyword in tokens for keyword in desired_words)
)
filtered_rows = df[matches_topic]

# Keep only the columns the following cells display
selected_columns = ['post_title', 'comment', 'vader_compound_score', 'vader_sentiment']
topic_0_df = filtered_rows[selected_columns]

In [None]:
# Rank the topic's comments by their VADER compound score (highest first)
score_column = 'vader_compound_score'
shown_columns = ['post_title', 'comment', score_column, 'vader_sentiment']
df_sorted = topic_0_df.sort_values(by=score_column, ascending=False)

# The five highest- and five lowest-scoring comments, with title, text, score and label
top_comments = df_sorted.nlargest(5, score_column)[shown_columns]
bottom_comments = df_sorted.nsmallest(5, score_column)[shown_columns]

# Print both tables, each under its own heading
print("Top 5 Comments with Highest Compound Scores:", top_comments, sep="\n")
print("\nBottom 5 Comments with Lowest Compound Scores:", bottom_comments, sep="\n")

In [None]:
# Bar chart of how many of the topic's comments carry each VADER label.
# The seaborn style is applied after the figure exists but before the axes are drawn.
plt.figure(figsize=(5, 5))
sns.set(style="whitegrid")
ax = sns.countplot(data=topic_0_df, x='vader_sentiment')

# Axis labels and title
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
ax.set_title('Distribution of Sentiments in Comments')

# Render the figure
plt.show()

Topic-1

In [None]:
# Keywords that stand in for Topic-1.
# NOTE(review): presumably picked from the Topic-1 terms of one LDA run — re-check after retraining.
desired_words = ["car", "look", "drive", "love", "great"]

# A comment matches when at least one keyword occurs among its lemmatized tokens
matches_topic = df['lemmatized_tokens'].apply(
    lambda tokens: any(keyword in tokens for keyword in desired_words)
)
filtered_rows = df[matches_topic]

# Keep only the columns the following cells display.
# The name `topic_0_df` is reused here; it now holds the rows matching the keywords above.
selected_columns = ['post_title', 'comment', 'vader_compound_score', 'vader_sentiment']
topic_0_df = filtered_rows[selected_columns]

In [None]:
# Rank the topic's comments by their VADER compound score (highest first)
score_column = 'vader_compound_score'
shown_columns = ['post_title', 'comment', score_column, 'vader_sentiment']
df_sorted = topic_0_df.sort_values(by=score_column, ascending=False)

# The five highest- and five lowest-scoring comments, with title, text, score and label
top_comments = df_sorted.nlargest(5, score_column)[shown_columns]
bottom_comments = df_sorted.nsmallest(5, score_column)[shown_columns]

# Print both tables, each under its own heading
print("Top 5 Comments with Highest Compound Scores:", top_comments, sep="\n")
print("\nBottom 5 Comments with Lowest Compound Scores:", bottom_comments, sep="\n")

In [None]:
# Bar chart of how many of the topic's comments carry each VADER label.
# The seaborn style is applied after the figure exists but before the axes are drawn.
plt.figure(figsize=(5, 5))
sns.set(style="whitegrid")
ax = sns.countplot(data=topic_0_df, x='vader_sentiment')

# Axis labels and title
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
ax.set_title('Distribution of Sentiments in Comments')

# Render the figure
plt.show()

Topic-2

In [None]:
# Keywords that stand in for Topic-2.
# NOTE(review): presumably picked from the Topic-2 terms of one LDA run — re-check after retraining.
desired_words = ["tire", "work", "part", "issue", "engine"]

# A comment matches when at least one keyword occurs among its lemmatized tokens
matches_topic = df['lemmatized_tokens'].apply(
    lambda tokens: any(keyword in tokens for keyword in desired_words)
)
filtered_rows = df[matches_topic]

# Keep only the columns the following cells display.
# The name `topic_0_df` is reused here; it now holds the rows matching the keywords above.
selected_columns = ['post_title', 'comment', 'vader_compound_score', 'vader_sentiment']
topic_0_df = filtered_rows[selected_columns]

In [None]:
# Rank the topic's comments by their VADER compound score (highest first)
score_column = 'vader_compound_score'
shown_columns = ['post_title', 'comment', score_column, 'vader_sentiment']
df_sorted = topic_0_df.sort_values(by=score_column, ascending=False)

# The five highest- and five lowest-scoring comments, with title, text, score and label
top_comments = df_sorted.nlargest(5, score_column)[shown_columns]
bottom_comments = df_sorted.nsmallest(5, score_column)[shown_columns]

# Print both tables, each under its own heading
print("Top 5 Comments with Highest Compound Scores:", top_comments, sep="\n")
print("\nBottom 5 Comments with Lowest Compound Scores:", bottom_comments, sep="\n")

In [None]:
# Bar chart of how many of the topic's comments carry each VADER label.
# The seaborn style is applied after the figure exists but before the axes are drawn.
plt.figure(figsize=(5, 5))
sns.set(style="whitegrid")
ax = sns.countplot(data=topic_0_df, x='vader_sentiment')

# Axis labels and title
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
ax.set_title('Distribution of Sentiments in Comments')

# Render the figure
plt.show()

Topic-3

In [None]:
# Keywords that stand in for Topic-3.
# NOTE(review): presumably picked from the Topic-3 terms of one LDA run — re-check after retraining.
desired_words = ["better", "sell", "buy", "old", "money"]

# A comment matches when at least one keyword occurs among its lemmatized tokens
matches_topic = df['lemmatized_tokens'].apply(
    lambda tokens: any(keyword in tokens for keyword in desired_words)
)
filtered_rows = df[matches_topic]

# Keep only the columns the following cells display.
# The name `topic_0_df` is reused here; it now holds the rows matching the keywords above.
selected_columns = ['post_title', 'comment', 'vader_compound_score', 'vader_sentiment']
topic_0_df = filtered_rows[selected_columns]

In [None]:
# Rank the topic's comments by their VADER compound score (highest first)
score_column = 'vader_compound_score'
shown_columns = ['post_title', 'comment', score_column, 'vader_sentiment']
df_sorted = topic_0_df.sort_values(by=score_column, ascending=False)

# The five highest- and five lowest-scoring comments, with title, text, score and label
top_comments = df_sorted.nlargest(5, score_column)[shown_columns]
bottom_comments = df_sorted.nsmallest(5, score_column)[shown_columns]

# Print both tables, each under its own heading
print("Top 5 Comments with Highest Compound Scores:", top_comments, sep="\n")
print("\nBottom 5 Comments with Lowest Compound Scores:", bottom_comments, sep="\n")

In [None]:
# Bar chart of how many of the topic's comments carry each VADER label.
# The seaborn style is applied after the figure exists but before the axes are drawn.
plt.figure(figsize=(5, 5))
sns.set(style="whitegrid")
ax = sns.countplot(data=topic_0_df, x='vader_sentiment')

# Axis labels and title
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
ax.set_title('Distribution of Sentiments in Comments')

# Render the figure
plt.show()