In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import re

# Set style for visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# newer releases no longer accept the bare 'seaborn' name, so use whichever of
# the two this installation lists. If neither is listed the default style stays.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['font.size'] = 12

# Download required NLTK data.
# 'stopwords' backs stopwords.words(); 'punkt' / 'punkt_tab' back word_tokenize().
# 'punkt_tab' is the resource newer NLTK releases look up; on releases that do
# not know it, the call is expected to just report the package as unavailable.
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource)

# Custom color palette for Kenya-themed visualizations
# Index: 0 = red, 1 = black, 2 = green, 3 = white.
kenya_colors = ['#BE0027', '#000000', '#169B62', '#FFFFFF']  # Colors from Kenyan flag


In [None]:
# Load and preprocess the data
def load_latest_data(pattern='../data/linkedin_posts_*.csv'):
    """Load the most recently modified LinkedIn export and add derived columns.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern used to find candidate CSV files. The default is the
        location this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The CSV rows plus three derived columns: ``post_date`` (``timestamp``
        parsed with ``pd.to_datetime``), ``clean_content`` (``content`` passed
        through ``clean_text``) and ``company_size_cat`` (``company_size`` as
        an ordered categorical; values outside the three listed buckets
        become missing).

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern``.
    KeyError
        If the selected file lacks ``timestamp``, ``content`` or
        ``company_size``.
    """
    import glob
    import os

    # Find the latest LinkedIn data file
    candidates = glob.glob(pattern)
    if not candidates:
        raise FileNotFoundError("No LinkedIn data files found")

    # Modification time, not getctime: on Unix ctime is the last metadata
    # change (chmod, rename, ...), so it does not track when data was written.
    latest_file = max(candidates, key=os.path.getmtime)
    posts = pd.read_csv(latest_file)

    # Fail with the file name and the missing columns instead of a bare
    # KeyError raised from somewhere inside pandas.
    required = ('timestamp', 'content', 'company_size')
    missing = [column for column in required if column not in posts.columns]
    if missing:
        raise KeyError(
            f"{latest_file} is missing required column(s): {', '.join(missing)}"
        )

    # Convert date column
    posts['post_date'] = pd.to_datetime(posts['timestamp'])

    # Clean text data
    posts['clean_content'] = posts['content'].apply(clean_text)

    # Extract company size categories (ordered small -> large)
    posts['company_size_cat'] = pd.Categorical(
        posts['company_size'],
        categories=['Small (<50)', 'Medium (50-500)', 'Large (>500)'],
        ordered=True,
    )

    return posts

def clean_text(text):
    """Normalise one post body for keyword counting.

    Missing values (anything ``pd.isna`` flags) become an empty string.
    Everything else is stringified and lower-cased, then URLs, characters
    that are neither word characters nor whitespace, and digits are deleted,
    and runs of whitespace are collapsed to single spaces.
    """
    if pd.isna(text):
        return ""

    cleaned = str(text).lower()

    # Applied in this order: URLs first (while their punctuation is still
    # there to delimit them), then punctuation/symbols, then digits.
    removal_patterns = (
        r'http\S+|www\S+|https\S+',
        r'[^\w\s]',
        r'\d+',
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    # split() with no argument drops leading/trailing and repeated whitespace
    return ' '.join(cleaned.split())

# Load the data
try:
    df = load_latest_data()
except FileNotFoundError:
    print("Warning: Data files not found. Please run the scraper first.")
    # Stop here: every later cell reads `df`, so carrying on would only fail
    # further down with an unrelated NameError.
    raise
else:
    print(f"Loaded {len(df)} posts")
    print("\nData Overview:")
    # DataFrame.info() prints its own report and returns None, so wrapping it
    # in print() added a stray "None" line to the output.
    df.info()


In [None]:
# Analyze organization engagement
def analyze_top_organizations(data=None):
    """Rank companies by weighted engagement.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Posts with ``company``, ``likes``, ``comments`` and ``shares``
        columns. When omitted, the notebook-level ``df`` is used, which is
        what a call with no arguments has always done.

    Returns
    -------
    pandas.DataFrame
        One row per company, sorted by ``total_engagement`` descending, with
        columns ``total_engagement``, ``avg_engagement``, ``post_count``,
        ``total_likes``, ``total_comments`` and ``total_shares`` (rounded to
        two decimals).

    Notes
    -----
    An ``engagement_score`` column (likes + 2 * comments + 3 * shares) is
    written onto the input frame. That side effect is kept on purpose:
    ``analyze_seniority`` and ``plot_seniority_insights`` read the column.
    """
    if data is None:
        data = df

    # Calculate engagement metrics
    data['engagement_score'] = data['likes'] + data['comments'] * 2 + data['shares'] * 3

    # Named aggregation ties each output column to its source explicitly,
    # instead of renaming the result positionally afterwards.
    company_stats = (
        data.groupby('company')
        .agg(
            total_engagement=('engagement_score', 'sum'),
            avg_engagement=('engagement_score', 'mean'),
            # 'size' counts every post; counting non-null post_date values
            # dropped posts whose timestamp was missing.
            post_count=('engagement_score', 'size'),
            total_likes=('likes', 'sum'),
            total_comments=('comments', 'sum'),
            total_shares=('shares', 'sum'),
        )
        .round(2)
    )

    # Sort by total engagement
    return company_stats.sort_values('total_engagement', ascending=False)

# Create visualization of top organizations
def plot_top_organizations(stats, top_n=10):
    """Build a stacked bar chart of likes, comments and shares per company.

    Parameters
    ----------
    stats : pandas.DataFrame
        Per-company table with ``total_likes``, ``total_comments`` and
        ``total_shares`` columns; its index supplies the x-axis labels.
    top_n : int, optional
        Number of leading rows of ``stats`` to plot.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    leaders = stats.head(top_n)

    # (column in stats, legend label, position in kenya_colors)
    bar_specs = (
        ('total_likes', 'Likes', 0),
        ('total_comments', 'Comments', 2),
        ('total_shares', 'Shares', 1),
    )

    chart = go.Figure()
    for column, label, color_position in bar_specs:
        chart.add_trace(go.Bar(
            x=leaders.index,
            y=leaders[column],
            name=label,
            marker_color=kenya_colors[color_position],
        ))

    layout_options = {
        'title': 'Top Organizations Leading AI Conversations in Kenya',
        'xaxis_title': 'Organization',
        'yaxis_title': 'Engagement Metrics',
        'barmode': 'stack',
        'showlegend': True,
        'template': 'plotly_white',
    }
    chart.update_layout(**layout_options)

    return chart

# Generate insights
# `org_stats` stays at notebook level: generate_linkedin_insights reads it.
org_stats = analyze_top_organizations()
print("Top 5 Organizations by Engagement:")
# head() with no argument keeps the first five rows, matching the heading.
print(org_stats[['total_engagement', 'post_count', 'avg_engagement']].head().to_string())

# Create and display visualization
# plot_top_organizations uses its default top_n (10) here.
fig = plot_top_organizations(org_stats)
fig.show()


In [None]:
# Analyze topics and create trend visualization
def analyze_topics():
    """Count the most frequent keywords across all cleaned posts.

    Reads ``clean_content`` from the notebook-level ``df``, tokenises the
    combined text, drops English stopwords, a handful of domain terms and
    tokens of two characters or fewer, and returns the 50 most frequent
    remaining tokens as ``{token: count}`` in descending count order.
    """
    # Terms present in almost every post of this corpus, so not informative
    domain_terms = ['ai', 'artificial', 'intelligence', 'kenya', 'kenyan']
    ignored = set(stopwords.words('english') + domain_terms)

    # One big string: every post separated by a single space
    corpus = ' '.join(df['clean_content'].dropna())

    counts = Counter(
        token
        for token in word_tokenize(corpus)
        if len(token) > 2 and token.lower() not in ignored
    )

    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:50])

# Generate word cloud visualization
def create_topic_wordcloud(word_freq):
    """Draw a word cloud of ``word_freq`` on a new matplotlib figure.

    Parameters
    ----------
    word_freq : mapping
        Token -> frequency, passed to ``WordCloud.generate_from_frequencies``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module itself (not a figure), so the caller
        can follow up with ``plt.show()``.
    """
    cloud_options = {
        'width': 1200,
        'height': 800,
        'background_color': 'white',
        'colormap': 'viridis',
        'max_words': 100,
    }
    rendered = WordCloud(**cloud_options).generate_from_frequencies(word_freq)

    plt.figure(figsize=(15, 10))
    plt.imshow(rendered, interpolation='bilinear')
    plt.axis('off')  # the cloud is an image; axes ticks carry no meaning
    plt.title('Most Discussed AI Topics in Kenya', pad=20, size=16)
    plt.tight_layout(pad=0)

    return plt

# Plot monthly trends
def plot_topic_trends():
    """Plot the number of posts per calendar month as a line chart.

    Reads ``post_date`` from the notebook-level ``df`` and, as a side effect,
    stores the monthly period of each post in a new ``df['month']`` column.
    Only months that contain at least one post appear on the x-axis.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    df['month'] = df['post_date'].dt.to_period('M')
    posts_per_month = df.groupby('month').size()

    trend_line = go.Scatter(
        # Periods are rendered as 'YYYY-MM' strings for the axis labels
        x=posts_per_month.index.astype(str),
        y=posts_per_month.values,
        mode='lines+markers',
        name='Posts',
        line={'color': kenya_colors[2], 'width': 3},
        marker={'size': 8},
    )

    chart = go.Figure()
    chart.add_trace(trend_line)

    layout_options = {
        'title': 'AI Discussion Trends Over Time',
        'xaxis_title': 'Month',
        'yaxis_title': 'Number of Posts',
        'template': 'plotly_white',
        'showlegend': False,
    }
    chart.update_layout(**layout_options)

    return chart

# Generate insights and visualizations
# `word_freq` stays at notebook level: generate_linkedin_insights reads it.
word_freq = analyze_topics()
print("Top 10 Most Discussed Topics:")
# analyze_topics returns the dict already ordered by descending count, so the
# first ten items are the ten most frequent.
for word, freq in list(word_freq.items())[:10]:
    print(f"{word}: {freq}")

# Display visualizations
# The return value (the pyplot module) is not needed; plt.show() renders the
# figure the call just drew.
create_topic_wordcloud(word_freq)
plt.show()

trend_fig = plot_topic_trends()
trend_fig.show()


In [None]:
# Analyze seniority distribution and impact
def analyze_seniority():
    """Summarise engagement per seniority level.

    Reads the notebook-level ``df`` and expects it to already carry an
    ``engagement_score`` column. Returns a frame indexed by
    ``seniority_level`` with ``avg_engagement``, ``total_engagement`` and
    ``post_count`` (the number of non-null ``post_date`` values), rounded to
    two decimals.
    """
    by_level = df.groupby('seniority_level')
    summary = by_level.agg(
        avg_engagement=('engagement_score', 'mean'),
        total_engagement=('engagement_score', 'sum'),
        post_count=('post_date', 'count'),
    )
    return summary.round(2)

def plot_seniority_insights():
    """Build a sunburst of total engagement by seniority and company size.

    Reads the notebook-level ``df`` and expects ``seniority_level``,
    ``company_size_cat`` and ``engagement_score`` columns to be present.

    Returns
    -------
    plotly.graph_objects.Figure
        Inner ring: seniority level; outer ring: company-size bucket; wedge
        size: summed ``engagement_score``.
    """
    # observed=True: company_size_cat is categorical, and without it pandas
    # emits a zero-sum row for every (seniority, size) pair that never occurs
    # and, on recent releases, warns that the implicit default is changing.
    # Zero-sized wedges are not visible anyway.
    sunburst_data = (
        df.groupby(['seniority_level', 'company_size_cat'], observed=True)['engagement_score']
        .sum()
        .reset_index()
    )

    fig = px.sunburst(
        sunburst_data,
        path=['seniority_level', 'company_size_cat'],
        values='engagement_score',
        color='seniority_level',
        color_discrete_sequence=px.colors.qualitative.Set3,
        title='AI Discussion Distribution by Seniority and Company Size'
    )

    fig.update_layout(
        width=800,
        height=800,
        template='plotly_white'
    )

    return fig

def plot_engagement_by_seniority():
    """Build a grouped bar chart of mean likes/comments/shares per seniority level.

    Reads ``seniority_level``, ``likes``, ``comments`` and ``shares`` from the
    notebook-level ``df``; means are rounded to two decimals.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    # Calculate average engagement metrics by seniority
    engagement_metrics = df.groupby('seniority_level').agg({
        'likes': 'mean',
        'comments': 'mean',
        'shares': 'mean'
    }).round(2)

    # Same metric -> colour pairing as plot_top_organizations (likes red,
    # comments green, shares black). Slicing kenya_colors[:3] gave comments
    # and shares each other's colour, so the two charts disagreed.
    metric_colors = {
        'likes': kenya_colors[0],
        'comments': kenya_colors[2],
        'shares': kenya_colors[1],
    }

    # Create grouped bar chart
    fig = go.Figure()
    for metric, color in metric_colors.items():
        fig.add_trace(go.Bar(
            name=metric.capitalize(),
            x=engagement_metrics.index,
            y=engagement_metrics[metric],
            marker_color=color
        ))

    fig.update_layout(
        title='Average Engagement by Seniority Level',
        xaxis_title='Seniority Level',
        yaxis_title='Average Engagement',
        barmode='group',
        template='plotly_white'
    )

    return fig

# Generate insights
# `seniority_stats` stays at notebook level: generate_linkedin_insights reads it.
# analyze_seniority needs df['engagement_score'], which analyze_top_organizations
# adds, so the organization cell has to run before this one.
seniority_stats = analyze_seniority()
print("Engagement by Seniority Level:")
print(seniority_stats)

# Create and display visualizations
sunburst_fig = plot_seniority_insights()
sunburst_fig.show()

engagement_fig = plot_engagement_by_seniority()
engagement_fig.show()


In [None]:
# Generate key insights for LinkedIn post
def generate_linkedin_insights():
    """Collect the headline numbers used by the LinkedIn post template.

    Reads the notebook-level ``df``, ``org_stats``, ``seniority_stats`` and
    ``word_freq``. The returned dict holds post and company counts, the first
    row of ``org_stats`` (name and total engagement), the seniority levels
    with the highest post count and highest average engagement, and the first
    five ``word_freq`` items. ``mom_growth`` (percent change between the last
    two months that have posts, one decimal) is added only when at least two
    such months exist.
    """
    summary = dict(
        total_posts=len(df),
        total_companies=df['company'].nunique(),
        # org_stats is sorted by total engagement, so row 0 is the leader
        top_company=org_stats.index[0],
        top_company_engagement=org_stats['total_engagement'].iloc[0],
        most_active_level=seniority_stats['post_count'].idxmax(),
        highest_engaging_level=seniority_stats['avg_engagement'].idxmax(),
        top_topics=list(word_freq.items())[:5],
    )

    # Month-over-month growth between the two most recent months with posts
    posts_per_month = df.groupby(df['post_date'].dt.to_period('M')).size()
    if len(posts_per_month) >= 2:
        latest = posts_per_month.iloc[-1]
        previous = posts_per_month.iloc[-2]
        summary['mom_growth'] = ((latest - previous) / previous * 100).round(1)

    return summary

# Format insights for LinkedIn post
def format_linkedin_post(insights):
    """Render the insights dict as the text of a LinkedIn post.

    Parameters
    ----------
    insights : dict
        Must provide ``total_posts``, ``total_companies``, ``top_company``,
        ``most_active_level``, ``highest_engaging_level`` and ``top_topics``
        (an iterable of ``(topic, count)`` pairs). ``mom_growth`` (percent
        change in posts versus the previous month) is optional.

    Returns
    -------
    str
        The post text. The month-over-month line is worded according to the
        sign of ``mom_growth`` and omitted when the key is absent.
    """
    post = f"""🔍 Who's Really Talking About AI in Kenya? Here's What I Found:

After analyzing {insights['total_posts']} posts from {insights['total_companies']} leading organizations in Kenya, here are the key insights about AI and digital transformation discussions:

🏢 Top Organizations:
• {insights['top_company']} leads the conversation with highest engagement
• {insights['most_active_level']}s are the most active in sharing AI insights
• {insights['highest_engaging_level']}s generate the most engagement per post

🔥 Hot Topics:
"""

    # Add top topics (only the word is shown; the count is not printed)
    post += ''.join(f"• {topic}\n" for topic, _count in insights['top_topics'])

    # A drop used to be announced as "growing! -20.0% increase"; word the line
    # according to the sign instead.
    growth = insights.get('mom_growth')
    if growth is not None:
        if growth > 0:
            post += f"\n📈 The conversation is growing! {growth}% increase in AI-related posts month-over-month.\n"
        elif growth < 0:
            post += f"\n📉 The conversation cooled off: {abs(growth)}% decrease in AI-related posts month-over-month.\n"
        elif growth == 0:
            post += "\n📊 The conversation held steady: no change in AI-related posts month-over-month.\n"

    post += """
💡 What This Means:
The data shows Kenya's business landscape is actively embracing AI and digital transformation, with leaders across different organizational levels contributing to the conversation.

#AIinKenya #DigitalTransformation #AfricanTech #Innovation #DataAnalysis"""

    return post

# Generate and display LinkedIn post
# generate_linkedin_insights reads df, org_stats, seniority_stats and word_freq
# from the earlier cells, so those cells must have run first.
insights = generate_linkedin_insights()
linkedin_post = format_linkedin_post(insights)
print("Suggested LinkedIn Post:")
# 80-character rules frame the post so it is easy to copy out of the output.
print("-" * 80)
print(linkedin_post)
print("-" * 80)
