In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import re
import json

# Set style for visualizations
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so pick whichever one this
# installation actually provides instead of hard-coding 'seaborn'.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['font.size'] = 12

# Download required NLTK data (stopword lists and the punkt tokenizer models)
nltk.download('stopwords')
nltk.download('punkt')

# Custom color palette for Kenya-themed visualizations
# (red, black, green, white)
kenya_colors = ['#BE0027', '#000000', '#169B62', '#FFFFFF']

def load_data():
    """Load and combine the newest data file from each collection source.

    Searches ``../data`` (relative to the current working directory) for
    Twitter, LinkedIn and manual-collection CSV files. For each source that
    has at least one matching file, the file whose name sorts last is read.
    Twitter and LinkedIn rows get their ``platform`` column set to the
    source name; manual-collection rows are appended as they are.

    Returns:
        A single DataFrame with all loaded rows (index reset), or ``None``
        when no matching file exists for any source.
    """
    # Imported here because the file's import block does not provide `glob`;
    # without it every call failed with NameError.
    import glob

    # (filename pattern, platform label). A label of None means the file's
    # own columns are kept untouched.
    sources = [
        ('../data/twitter_data_*.csv', 'Twitter'),
        ('../data/linkedin_posts_*.csv', 'LinkedIn'),
        ('../data/manual_collection_*.csv', None),
    ]

    dfs = []
    for pattern, platform in sources:
        matches = glob.glob(pattern)
        if not matches:
            continue
        # max() compares paths as strings, so "latest" means the name that
        # sorts last (presumably a sortable timestamp suffix - confirm
        # against the collectors that write these files).
        frame = pd.read_csv(max(matches))
        if platform is not None:
            frame['platform'] = platform
        dfs.append(frame)

    if not dfs:
        print("No data files found")
        return None

    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"Loaded {len(combined_df)} total posts")
    return combined_df

# Load the data into the module-level `df` that the analysis functions below
# read; load_data() returns None when it finds no matching CSV files.
df = load_data()


In [None]:
# Analyze engagement patterns across platforms
def analyze_engagement():
    """Compare engagement metrics between platforms.

    Reads the module-level ``df`` and aggregates its ``likes``, ``comments``
    and ``shares`` columns per ``platform`` (mean and sum for each, plus a
    row count taken from ``likes``), rounded to two decimals.

    Returns:
        A ``(platform_stats, fig)`` tuple: the aggregated DataFrame and a
        grouped Plotly bar chart of the per-platform means. Returns
        ``(None, None)`` when ``df`` is ``None``.
    """
    if df is None:
        print("No data available for analysis")
        return None, None

    # Calculate engagement metrics
    platform_stats = df.groupby('platform').agg({
        'likes': ['mean', 'sum', 'count'],
        'comments': ['mean', 'sum'],
        'shares': ['mean', 'sum']
    }).round(2)

    # Take the x-axis labels from the aggregated index, not from
    # df['platform'].unique(): groupby sorts its keys and drops missing
    # platforms, while unique() keeps order of appearance (and NaN), so the
    # two can disagree and attach bars to the wrong platform.
    platforms = platform_stats.index.tolist()

    # Create engagement comparison visualization
    fig = go.Figure()

    metrics = ['likes', 'comments', 'shares']
    colors = kenya_colors[:3]

    for metric, color in zip(metrics, colors):
        fig.add_trace(go.Bar(
            name=metric.capitalize(),
            x=platforms,
            y=platform_stats[metric]['mean'],
            marker_color=color
        ))

    fig.update_layout(
        title='Average Engagement by Platform',
        xaxis_title='Platform',
        yaxis_title='Average Engagement',
        barmode='group',
        template='plotly_white'
    )

    return platform_stats, fig

# Generate engagement insights
# analyze_engagement() returns (None, None) when no data was loaded, so the
# statistics table and chart are only shown when statistics exist.
platform_stats, engagement_fig = analyze_engagement()
if platform_stats is not None:
    print("\nEngagement Statistics by Platform:")
    print(platform_stats)
    engagement_fig.show()
