In [None]:
# Import required libraries
import ast
import html
import json
from datetime import datetime, timedelta
from pathlib import Path

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from folium import plugins
from textblob import TextBlob
from wordcloud import WordCloud

# Set plotting style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# newer releases no longer accept the bare 'seaborn' name, so use whichever
# name this installation actually provides.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")

# Configure pandas display options: show every column, cap printed rows at 50
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

# Locations of the input data, relative to the notebook's working directory
data_dir = Path('../data')
analysis_dir = data_dir / 'analysis'

def get_latest_file(directory, pattern):
    """Return the most recently modified path in `directory` matching `pattern`.

    Args:
        directory: A `pathlib.Path` whose `glob` method is used for the search.
        pattern: Glob pattern, e.g. ``'tweets_*.csv'``.

    Returns:
        The matching path with the largest modification time (`st_mtime`),
        or ``None`` when nothing matches.
    """
    return max(
        directory.glob(pattern),
        key=lambda candidate: candidate.stat().st_mtime,
        default=None,
    )

# Pick the newest tweet export and the newest sentiment-analysis export
tweets_file = get_latest_file(data_dir, 'tweets_*.csv')
sentiment_file = get_latest_file(analysis_dir, 'sentiment_analysis_*.csv')

# Every later cell uses tweets_df / sentiment_df unconditionally, so stop here
# with a clear error instead of letting those cells fail with a NameError.
missing_inputs = [
    description
    for description, found in (
        (f"tweets_*.csv in {data_dir}", tweets_file),
        (f"sentiment_analysis_*.csv in {analysis_dir}", sentiment_file),
    )
    if found is None
]
if missing_inputs:
    raise FileNotFoundError(
        "No data files found. Please run the data collection script first. "
        f"Missing: {'; '.join(missing_inputs)}"
    )

tweets_df = pd.read_csv(tweets_file)
sentiment_df = pd.read_csv(sentiment_file)
print(f"Loaded {len(tweets_df)} tweets and sentiment data")


In [None]:
def plot_daily_series(series, title, ylabel):
    """Draw a per-day series as a line chart with point markers and show it.

    Args:
        series: pandas Series indexed by date; plotted with `Series.plot`.
        title: Text used as the chart title.
        ylabel: Text used as the y-axis label (the x-axis is always 'Date').
    """
    fig, ax = plt.subplots(figsize=(15, 6))
    series.plot(kind='line', marker='o', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel(ylabel)
    ax.grid(True)
    # Date tick labels are long; rotate them so they do not overlap.
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()


# Convert created_at to datetime (kept on the frame because later cells read it)
tweets_df['created_at'] = pd.to_datetime(tweets_df['created_at'])

# Group by calendar date and count tweets
daily_activity = tweets_df.groupby(tweets_df['created_at'].dt.date).size()

# Plot daily activity
plot_daily_series(daily_activity, 'Daily Protest Tweet Activity', 'Number of Tweets')

# Analyze sentiment over time: mean TextBlob polarity per calendar date
sentiment_df['created_at'] = pd.to_datetime(sentiment_df['created_at'])
daily_sentiment = sentiment_df.groupby(sentiment_df['created_at'].dt.date)['textblob_polarity'].mean()

plot_daily_series(daily_sentiment, 'Daily Average Sentiment', 'Average Sentiment Polarity')


In [None]:
# Create word cloud of tweet content
# Empty text cells come back from read_csv as NaN (a float), which str.join
# rejects with a TypeError; drop them and force the rest to str before joining.
text = ' '.join(tweets_df['text'].dropna().astype(str))
wordcloud = WordCloud(
    width=1600,
    height=800,
    background_color='white',
    max_words=100
).generate(text)

plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the axes carry no meaning for a word cloud image
plt.title('Most Common Words in Protest Tweets')
plt.show()

# Analyze hashtag frequency
# The hashtags column is read from CSV as text (non-string cells such as NaN are
# skipped). Parse each cell with ast.literal_eval, which accepts only Python
# literals, rather than eval, which would execute whatever code the file contains.
hashtags = []
for tags in tweets_df['hashtags']:
    if not isinstance(tags, str):
        continue
    try:
        parsed_tags = ast.literal_eval(tags)
    except (ValueError, SyntaxError):
        # Malformed cell: skip it rather than abort the whole analysis.
        continue
    if isinstance(parsed_tags, (list, tuple, set)):
        hashtags.extend(parsed_tags)

hashtag_freq = pd.Series(hashtags, dtype='object').value_counts()

if hashtag_freq.empty:
    # Nothing to draw; a bar plot of an empty Series would raise.
    print("No hashtags found in the loaded tweets.")
else:
    plt.figure(figsize=(15, 6))
    hashtag_freq.head(15).plot(kind='bar')
    plt.title('Most Common Hashtags')
    plt.xlabel('Hashtag')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
# Total engagement per tweet is retweets plus favorites; the column is stored on
# tweets_df so that later cells can read it.
tweets_df['total_engagement'] = tweets_df['retweet_count'].add(tweets_df['favorite_count'])

# Ten tweets with the highest total engagement, limited to the columns of interest
engagement_columns = ['text', 'retweet_count', 'favorite_count', 'total_engagement']
most_engaging = tweets_df.nlargest(10, 'total_engagement').loc[:, engagement_columns]

print("Most Engaging Tweets:")
print(most_engaging.to_string())

# Histogram of engagement across all tweets
fig, ax = plt.subplots(figsize=(15, 6))
ax.hist(tweets_df['total_engagement'], bins=50)
ax.set_title('Distribution of Tweet Engagement')
ax.set_xlabel('Total Engagement (Retweets + Favorites)')
ax.set_ylabel('Number of Tweets')
ax.set_yscale('log')  # log-scaled counts keep the sparse high-engagement bins visible
ax.grid(True)
plt.show()

# Engagement broken down by sentiment label
# NOTE(review): the merge passes no `on=`, so pandas joins on every column name
# the two frames share — confirm that is the intended key.
fig, ax = plt.subplots(figsize=(15, 6))
sns.boxplot(
    data=pd.merge(tweets_df, sentiment_df),
    x='huggingface_label',
    y='total_engagement',
    ax=ax,
)
ax.set_title('Tweet Engagement by Sentiment')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Total Engagement')
plt.show()


In [None]:
def parse_coordinates(raw):
    """Turn one `coordinates` cell into a pair of finite floats, or None.

    Cells read back from CSV arrive as text (e.g. ``"[-1.29, 36.82]"``) or as
    NaN when empty. NaN is truthy, so a plain truthiness check lets both
    through. Text is parsed with `ast.literal_eval`; anything that is not a
    two-element list/tuple of finite numbers yields None.

    Args:
        raw: The cell value: a string, a list/tuple, or a missing value.

    Returns:
        ``(first, second)`` as floats in the order stored, or ``None``.
    """
    if isinstance(raw, str):
        try:
            raw = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            return None
    if not isinstance(raw, (list, tuple)) or len(raw) != 2:
        return None
    try:
        first, second = float(raw[0]), float(raw[1])
    except (TypeError, ValueError):
        return None
    if not (np.isfinite(first) and np.isfinite(second)):
        return None
    return first, second


# Marker colour per sentiment label; any other label falls back to gray
SENTIMENT_COLORS = {'POS': 'green', 'NEG': 'red'}

# Create base map centered on Kenya
m = folium.Map(location=[-1.2921, 36.8219], zoom_start=7)

# Add marker cluster
marker_cluster = plugins.MarkerCluster().add_to(m)

# Merge tweet and sentiment data
# NOTE(review): no `on=` is given, so pandas joins on every column name the two
# frames share — confirm that is the intended key.
merged_df = pd.merge(tweets_df, sentiment_df)

# Single pass over the tweets: each geolocated tweet contributes one marker
# and one heatmap point.
heat_data = []
for _, row in merged_df.iterrows():
    # NOTE(review): values are passed to folium in stored order, i.e. assumed
    # to be [lat, lon] — confirm against the collection script.
    location = parse_coordinates(row['coordinates'])
    if location is None:
        continue

    # Determine color based on sentiment
    color = SENTIMENT_COLORS.get(row['huggingface_label'], 'gray')

    # Create popup content. Tweet text is untrusted input, so escape every
    # interpolated value before it is embedded in the popup's HTML.
    popup_content = f"""
    <b>Tweet:</b> {html.escape(str(row['text']))}<br>
    <b>Time:</b> {html.escape(str(row['created_at']))}<br>
    <b>Sentiment:</b> {html.escape(str(row['huggingface_label']))}<br>
    <b>Engagement:</b> {html.escape(str(row['total_engagement']))}
    """

    # Add marker
    folium.Marker(
        location=location,
        popup=folium.Popup(popup_content, max_width=300),
        icon=folium.Icon(color=color)
    ).add_to(marker_cluster)

    # Heatmap point weighted by engagement; a missing weight counts as zero so
    # no NaN reaches the heatmap data.
    weight = row['total_engagement']
    heat_data.append([
        location[0],
        location[1],
        0.0 if pd.isna(weight) else float(weight),
    ])

# Add heatmap layer (skipped when no tweet carries usable coordinates)
if heat_data:
    plugins.HeatMap(heat_data).add_to(m)

# Add layer control
folium.LayerControl().add_to(m)

# Display map
m
