In [1]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [None]:
# Fetch the NLTK resources used below: the 'punkt' tokenizer models and the
# English stop-word list.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' when tokenizing — confirm against the installed version.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Read the comment dataset from disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
csv_path = '/Users/zwl77/newdata/csv/BBC clean.csv'
data = pd.read_csv(csv_path)

# English stop words held in a set; tokenize_and_filter looks tokens up in it
stop_words = set(stopwords.words('english'))
def tokenize_and_filter(text):
    """Tokenize ``text`` and return its lowercase, non-stop-word tokens.

    Missing values (anything ``pd.isna`` reports as NA) produce an empty
    list. Otherwise the text is split with ``word_tokenize``; tokens that are
    not purely alphabetic are dropped, as are tokens whose lowercase form is
    in the module-level ``stop_words`` set.

    Returns:
        list[str]: the surviving tokens, lowercased, in their original order.
    """
    if pd.isna(text):
        return []

    kept_tokens = []
    for raw_token in word_tokenize(text):
        # The alphabetic check runs on the token exactly as tokenized
        if not raw_token.isalpha():
            continue
        lowered = raw_token.lower()
        if lowered not in stop_words:
            kept_tokens.append(lowered)
    return kept_tokens

# Tokenize every entry of the 'Comment' column; each row gets its own token list
data['tokens'] = data['Comment'].apply(tokenize_and_filter)

# Concatenate the per-row token lists, in row order, into one flat list
all_tokens = []
for row_tokens in data['tokens']:
    all_tokens.extend(row_tokens)

# Define a function to filter out unwanted bigrams
def is_unwanted_bigram(bigram):
    """Return True when ``bigram`` equals one of the excluded word pairs.

    The excluded pairs are ('looks', 'like') and ('look', 'like'); any other
    value yields False.
    """
    excluded_pairs = (('looks', 'like'), ('look', 'like'))
    for pair in excluded_pairs:
        if bigram == pair:
            return True
    return False

# Build bigrams one comment at a time. Running ngrams() over the flattened
# token list would also pair the last word of one comment with the first word
# of the next, counting word pairs that never occurred in any comment.
# Comments with fewer than two tokens simply contribute no bigrams.
filtered_bigrams = [
    bigram
    for comment_tokens in data['tokens']
    for bigram in ngrams(comment_tokens, 2)
    if not is_unwanted_bigram(bigram)
]

# Frequency of each remaining bigram across all comments
bigrams = Counter(filtered_bigrams)

# Count how often each token occurs; the cloud and the top-20 list share this
word_frequencies = Counter(all_tokens)

# Render the single-word cloud (800x400, white background) from those counts
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud_words = cloud_builder.generate_from_frequencies(word_frequencies)

# Show the cloud as an image with a title and no axes
plt.figure(figsize=(15, 7))
plt.imshow(wordcloud_words, interpolation='bilinear')
plt.title('Word Frequency Word Cloud')
plt.axis('off')
plt.show()

# Print the 20 most frequent words with their counts
print("Top 20 Words:")
print(word_frequencies.most_common(20))

# Count trigrams one comment at a time. Running ngrams() over the flattened
# token list would also create trigrams that straddle two unrelated comments.
# Comments with fewer than three tokens contribute no trigrams.
trigrams = Counter(
    trigram
    for comment_tokens in data['tokens']
    for trigram in ngrams(comment_tokens, 3)
)

# WordCloud needs string keys, so join each n-gram's words with underscores
bigrams_for_wordcloud = {'_'.join(bigram): freq for bigram, freq in bigrams.items()}
trigrams_for_wordcloud = {'_'.join(trigram): freq for trigram, freq in trigrams.items()}

# Merge both frequency tables into one mapping; where a key appears in both,
# the trigram frequency wins (same result as copy() followed by update())
combined_grams_for_wordcloud = {**bigrams_for_wordcloud, **trigrams_for_wordcloud}

# Render one cloud (800x400, white background) for bigrams and trigrams together
wordcloud_grams = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(combined_grams_for_wordcloud)
plt.figure(figsize=(15, 7))
plt.imshow(wordcloud_grams, interpolation='bilinear')
plt.title('Bigrams and Trigrams Frequency Word Cloud')
plt.axis('off')
plt.show()

# Print the 20 most frequent bigrams and trigrams with their counts
print("Top 20 Bigrams:")
print(bigrams.most_common(20))
print("Top 20 Trigrams:")
print(trigrams.most_common(20))

# Theme label -> list of n-grams (word tuples of length 2 or 3) assigned to
# that theme. Within each theme the bigrams are listed first, then the trigrams.
themes = {
    'conflict_and_regional_politics': [
        # bigrams
        ('war', 'crimes'),
        ('war', 'crime'),
        ('controlled', 'demolition'),
        ('middle', 'east'),
        ('west', 'bank'),
        ('israel', 'palestine'),
        ('palestinian', 'people'),
        # trigrams
        ('like', 'israel', 'palestine'),
        ('like', 'controlled', 'demolition'),
    ],
    'human_rights_and_social_issues': [
        # bigrams
        ('innocent', 'people'),
        ('human', 'rights'),
        ('people', 'gaza'),
        ('human', 'shields'),
        ('refugee', 'camp'),
        # trigrams
        ('two', 'state', 'solution'),
        ('free', 'free', 'palestine'),
    ],
    'historical_and_cultural_references': [
        # bigrams
        ('years', 'ago'),
        ('year', 'old'),
        # trigrams
        ('sunan', 'abi', 'dawud'),
        ('abi', 'dawud', 'sahih'),
        ('dawud', 'sahih', 'narrated'),
        ('sahih', 'narrated', 'sunan'),
        ('narrated', 'sunan', 'abi'),
    ],
    'blessings_and_support': [
        # bigrams
        ('god', 'bless'),
        ('bless', 'israel'),
        ('support', 'israel'),
        ('free', 'palestine'),
        ('palestine', 'free'),
        # trigrams
        ('god', 'bless', 'israel'),
        ('free', 'palestine', 'free'),
        ('palestine', 'free', 'palestine'),
        ('israel', 'god', 'bless'),
    ],
}

# Function for categorizing and counting bigrams and trigrams
def categorize_and_count(bigrams_counts, trigrams_counts, themes):
    """Sum, per theme, the frequencies of the n-grams assigned to that theme.

    Args:
        bigrams_counts: mapping (e.g. a ``Counter``) from 2-word tuples to
            their frequency.
        trigrams_counts: mapping (e.g. a ``Counter``) from 3-word tuples to
            their frequency.
        themes: mapping from theme name to an iterable of n-gram tuples.

    Returns:
        dict: theme name -> total frequency of that theme's n-grams. A theme
        whose n-grams never occur (or that lists none) maps to 0.
    """
    # Start every theme at zero so themes without matches are still reported
    theme_counts = {theme: 0 for theme in themes}

    # Choose the frequency table by n-gram length
    counts_by_length = {2: bigrams_counts, 3: trigrams_counts}

    for theme, keywords in themes.items():
        for gram in keywords:
            counts = counts_by_length.get(len(gram))
            if counts is None:
                # Only bigrams and trigrams have a frequency table; skip the rest
                continue
            # .get keeps plain dicts usable and treats unseen n-grams as 0
            theme_counts[theme] += counts.get(gram, 0)

    return theme_counts

# Tally theme frequencies from the bigram and trigram counters
theme_counts = categorize_and_count(bigrams, trigrams, themes)

# Print one "theme: count" line per theme
for theme_name in theme_counts:
    print(f"{theme_name}: {theme_counts[theme_name]}")

# Horizontal bar chart of the theme frequencies, styled with seaborn
import seaborn as sns

# Apply seaborn's "whitegrid" theme to the figures that follow
sns.set_theme(style="whitegrid")

# Split theme_counts into parallel lists of values and labels for plotting.
# NOTE(review): this rebinds `themes` from the keyword dictionary to a plain
# list of labels, so the dictionary is no longer reachable under that name.
frequencies = list(theme_counts.values())
themes = list(theme_counts)

# Open a 10x6 figure for the chart
plt.figure(figsize=(10, 6))

# Frequencies on the x-axis, one bar per theme label, pastel palette
bars = sns.barplot(x=frequencies, y=themes, palette="pastel")

# Write each bar's integer length just past its end, vertically centred on it
for bar in bars.patches:
    bar_length = bar.get_width()
    label_y = bar.get_y() + bar.get_height() / 2
    # The label sits 5 x-axis units beyond the end of the bar
    plt.text(bar_length + 5, label_y, str(int(bar_length)), va='center')

# Bold 16-point title with extra padding above the axes
plt.title('Frequency of Themes in Text Data', fontsize=16, weight='bold', pad=20)


