In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK data.
# Recent NLTK releases load the 'punkt_tab' resource in word_tokenize instead of
# the pickled 'punkt' models; fetch both so the notebook runs on old and new versions.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Set style for visualizations.
# matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# removed the old name, so pick whichever one this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")


In [None]:
# Read the Twitter and LinkedIn CSV exports into DataFrames
twitter_df, linkedin_df = map(
    pd.read_csv,
    ('../data/twitter_data.csv', '../data/linkedin_data.csv'),
)

# Report (rows, columns) for each dataset, one line per platform
print(
    *(
        f"{label} {frame.shape}"
        for label, frame in (
            ("Twitter data shape:", twitter_df),
            ("LinkedIn data shape:", linkedin_df),
        )
    ),
    sep="\n",
)


In [None]:
# Analyze Twitter user influence
def analyze_twitter_influence(df=None, output_dir='../visuals'):
    """Compute per-tweet engagement rate and save two follower-related charts.

    Adds an ``engagement_rate`` column to the frame in place, defined as
    ``(retweet_count + favorite_count) / user_followers``, then writes
    ``twitter_followers_dist.png`` and ``twitter_engagement.png`` to
    ``output_dir``.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with ``retweet_count``, ``favorite_count`` and ``user_followers``
        columns. Defaults to the notebook-level ``twitter_df``.
    output_dir : str or path-like, optional
        Directory that receives the PNG files; created if it does not exist.
    """
    from pathlib import Path

    if df is None:
        df = twitter_df

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    followers = df['user_followers']
    has_followers = followers > 0

    # Accounts with zero followers would divide by zero and yield inf/NaN;
    # mask them so their engagement rate is NaN (undefined) instead.
    df['engagement_rate'] = (
        (df['retweet_count'] + df['favorite_count']) / followers.where(has_followers)
    )

    # Distribution of follower counts. log_scale=True makes seaborn compute the
    # bins in log space; linear bins drawn on a log axis give distorted bar
    # widths. Non-positive counts cannot be placed on a log axis, so they are
    # excluded from this chart.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=df[has_followers], x='user_followers', bins=30, log_scale=True, ax=ax)
    ax.set_title('Distribution of Twitter Follower Counts')
    ax.set_xlabel('Number of Followers')
    ax.set_ylabel('Count')
    ax.set_yscale('log')
    fig.savefig(out_dir / 'twitter_followers_dist.png')
    plt.close(fig)

    # Engagement rate vs follower count. Both axes are logarithmic, so rows with
    # a zero or NaN engagement rate are not drawn.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df['user_followers'], df['engagement_rate'])
    ax.set_title('Engagement Rate vs Follower Count')
    ax.set_xlabel('Number of Followers')
    ax.set_ylabel('Engagement Rate')
    ax.set_xscale('log')
    ax.set_yscale('log')
    fig.savefig(out_dir / 'twitter_engagement.png')
    plt.close(fig)

analyze_twitter_influence()


In [None]:
# Analyze LinkedIn seniority and company size distribution
def analyze_linkedin_demographics():
    """Chart LinkedIn post counts by seniority and company size, then print mean engagement.

    Reads the notebook-level ``linkedin_df``. Saves one horizontal count plot
    per category column, with categories ordered by frequency, and prints the
    mean of ``likes``, ``comments`` and ``shares`` for each seniority level.
    """
    # One entry per chart: (column, title, y-axis label, output file)
    chart_specs = (
        (
            'seniority_level',
            'Distribution of Posts by Seniority Level',
            'Seniority Level',
            '../visuals/linkedin_seniority.png',
        ),
        (
            'company_size',
            'Distribution of Posts by Company Size',
            'Company Size',
            '../visuals/linkedin_company_size.png',
        ),
    )

    for column, chart_title, y_label, out_path in chart_specs:
        plt.figure(figsize=(10, 6))
        # value_counts() sorts descending, so the most common category comes first
        category_order = linkedin_df[column].value_counts().index
        sns.countplot(data=linkedin_df, y=column, order=category_order)
        plt.title(chart_title)
        plt.xlabel('Number of Posts')
        plt.ylabel(y_label)
        plt.savefig(out_path)
        plt.close()

    # Mean likes/comments/shares within each seniority level
    engagement_columns = ['likes', 'comments', 'shares']
    per_seniority = linkedin_df.groupby('seniority_level')[engagement_columns]
    print("\nAverage engagement by seniority level:", per_seniority.mean(), sep="\n")

analyze_linkedin_demographics()


In [None]:
# Content analysis functions
# Common Twitter/LinkedIn terms removed in addition to the English stopwords
_EXTRA_STOP_WORDS = ('https', 'co', 'rt', 'amp', 'via')
_stop_words_cache = None


def _get_stop_words():
    """Return the combined stopword set, building it only on first use."""
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = frozenset(stopwords.words('english')).union(_EXTRA_STOP_WORDS)
    return _stop_words_cache


def preprocess_text(text):
    """Lowercase and tokenize ``text``, keeping alphanumeric non-stopword tokens.

    Parameters
    ----------
    text : object
        Raw post text. Non-string values are converted with ``str``.
        Missing values (``None``, NaN, ``pd.NA``) are treated as empty text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` for missing input.
    """
    # str(nan) is 'nan', which would otherwise enter the corpus as a real word
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    # Tokenize
    tokens = word_tokenize(str(text).lower())

    # The stopword set is cached: rebuilding it for every row reloads the
    # NLTK corpus each time this function is applied to a column.
    stop_words = _get_stop_words()

    return ' '.join(word for word in tokens if word.isalnum() and word not in stop_words)

def generate_wordcloud(text, title, filename, random_state=42):
    """Render a word cloud for ``text`` and save it as ``../visuals/<filename>.png``.

    Parameters
    ----------
    text : str
        Space-separated words to plot.
    title : str
        Title drawn above the image.
    filename : str
        Output file name without the ``.png`` extension.
    random_state : int or None, optional
        Seed passed to ``WordCloud`` so the layout is reproducible between
        runs. Pass ``None` for a different layout on every call.

    Returns
    -------
    None
        Nothing is written when ``text`` is empty or whitespace-only.
    """
    import os

    # WordCloud.generate raises ValueError when there are no words to plot;
    # skip explicitly so one empty corpus does not abort the whole notebook.
    if not text or not text.strip():
        print(f"Skipping word cloud '{title}': no words to plot")
        return

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        random_state=random_state,
    ).generate(text)

    # savefig does not create missing directories
    os.makedirs('../visuals', exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)
    fig.savefig(f'../visuals/{filename}.png')
    plt.close(fig)

# Twitter: clean every tweet, merge into one corpus, render the cloud
twitter_text = ' '.join(preprocess_text(tweet) for tweet in twitter_df['text'])
generate_wordcloud(
    twitter_text,
    'Common Terms in Twitter Posts',
    'twitter_wordcloud',
)

# LinkedIn: same pipeline applied to the post_text column
linkedin_text = ' '.join(preprocess_text(post) for post in linkedin_df['post_text'])
generate_wordcloud(
    linkedin_text,
    'Common Terms in LinkedIn Posts',
    'linkedin_wordcloud',
)
