# Exploratory Data Analysis

This notebook performs exploratory data analysis on cleaned social media posts to identify engagement patterns.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress warnings of every category globally (no category/module filter is given).
# NOTE(review): this also hides deprecation notices from pandas/seaborn - confirm that is intended.
warnings.filterwarnings('ignore')

# Plot appearance: apply the matplotlib style sheet first, then set the seaborn palette.
PLOT_STYLE = 'seaborn-v0_8'
COLOR_PALETTE = "husl"
plt.style.use(PLOT_STYLE)
sns.set_palette(COLOR_PALETTE)

In [None]:
# Load cleaned data
# Read the CSV and convert the 'timestamp' and 'day' columns with pd.to_datetime
# in a single chained expression, so `df` is bound exactly once.
df = pd.read_csv('../data/cleaned_posts.csv').assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp']),
    day=lambda frame: pd.to_datetime(frame['day']),
)
print(f"Loaded {len(df)} rows of cleaned data")
# Last expression of the cell: rich preview of the first rows.
df.head()

In [None]:
# 1. Engagement distribution
# Histogram of the engagement_rate column in 20 bins, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['engagement_rate'], bins=20, edgecolor='black')
ax.set_title('Distribution of Engagement Rates')
ax.set_xlabel('Engagement Rate')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# 2. Engagement by hour
# Mean engagement_rate for each distinct 'hour' value, flattened back into a
# two-column frame (hour, engagement_rate). The summary cell reads this frame.
hourly_means = df.groupby('hour')['engagement_rate'].mean()
hourly_engagement = hourly_means.reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=hourly_engagement, x='hour', y='engagement_rate', ax=ax)
ax.set_title('Average Engagement Rate by Hour of Day')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Average Engagement Rate')
plt.show()

In [None]:
# 3. Engagement by weekday
# Display names indexed by weekday code; the list order makes code 0 'Monday' and 6 'Sunday'.
# NOTE(review): assumes the 'weekday' column uses that 0-6 coding - confirm against the cleaning step.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_engagement = df.groupby('weekday')['engagement_rate'].mean().reset_index()

# Translate codes to names through an explicit int cast and a lookup table.
# Indexing the list directly breaks when pandas loaded the column as float64
# (which happens as soon as the CSV column contains a missing value), and it
# silently wraps negative codes around to the wrong day.
weekday_lookup = dict(enumerate(weekday_names))
weekday_engagement['weekday_name'] = weekday_engagement['weekday'].astype(int).map(weekday_lookup)

# Fail loudly on codes outside 0-6 instead of plotting an unnamed bar.
unknown_codes = weekday_engagement.loc[weekday_engagement['weekday_name'].isna(), 'weekday'].tolist()
if unknown_codes:
    raise ValueError(f"Unexpected weekday codes (expected 0-6): {unknown_codes}")

# groupby sorts by weekday code, so the bars appear in Monday -> Sunday order.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=weekday_engagement, x='weekday_name', y='engagement_rate', ax=ax)
ax.set_title('Average Engagement Rate by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Average Engagement Rate')
plt.xticks(rotation=45)
plt.show()

In [None]:
# 4. Engagement by platform
# One row per platform with its mean engagement_rate. The summary cell reads this frame.
platform_engagement = df.groupby('platform', as_index=False)['engagement_rate'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=platform_engagement, x='platform', y='engagement_rate', ax=ax)
ax.set_title('Average Engagement Rate by Platform')
ax.set_xlabel('Platform')
ax.set_ylabel('Average Engagement Rate')
plt.show()

In [None]:
# 5. Engagement by post type
# One row per post_type with its mean engagement_rate. The summary cell reads this frame.
post_type_engagement = df.groupby('post_type', as_index=False)['engagement_rate'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=post_type_engagement, x='post_type', y='engagement_rate', ax=ax)
ax.set_title('Average Engagement Rate by Post Type')
ax.set_xlabel('Post Type')
ax.set_ylabel('Average Engagement Rate')
plt.show()

In [None]:
# 6. Caption length vs engagement
# Scatter of caption_length (x) against engagement_rate (y); alpha keeps
# overlapping points distinguishable.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['caption_length'], df['engagement_rate'], alpha=0.7)
ax.set_title('Caption Length vs Engagement Rate')
ax.set_xlabel('Caption Length (characters)')
ax.set_ylabel('Engagement Rate')
plt.show()

In [None]:
# 7. Correlation matrix
# Columns that take part in the pairwise correlation.
numeric_cols = [
    'likes', 'comments', 'shares', 'reach',
    'interactions', 'engagement_rate', 'hour', 'caption_length',
]
corr_matrix = df.loc[:, numeric_cols].corr()

# Annotated heatmap with the diverging colormap centred on zero correlation.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Numeric Variables')
plt.show()

In [None]:
# 8. Time series of engagement
# Mean engagement_rate for each distinct 'day' value, as a two-column frame.
daily_means = df.groupby('day')['engagement_rate'].mean()
daily_engagement = daily_means.reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(daily_engagement['day'], daily_engagement['engagement_rate'], marker='o')
ax.set_title('Average Daily Engagement Rate Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Average Engagement Rate')
# Rotate the tick labels, then tighten the layout so they are not clipped.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Key metrics summary
# Reads the per-dimension aggregate frames built by the earlier cells
# (hourly_engagement, weekday_engagement, platform_engagement, post_type_engagement)
# and reports the group with the highest mean engagement_rate in each.

# Row label of the best-performing group in each aggregate frame.
best_hour_row = hourly_engagement['engagement_rate'].idxmax()
best_weekday_row = weekday_engagement['engagement_rate'].idxmax()
best_platform_row = platform_engagement['engagement_rate'].idxmax()
best_post_type_row = post_type_engagement['engagement_rate'].idxmax()

# Cast to int before formatting/indexing: if pandas loaded 'hour' or 'weekday'
# as float64 (e.g. the CSV column had a missing value), the hour would print
# as "14.0:00" and the weekday list lookup would raise TypeError.
best_hour = int(hourly_engagement.loc[best_hour_row, 'hour'])
best_weekday = weekday_names[int(weekday_engagement.loc[best_weekday_row, 'weekday'])]
best_platform = platform_engagement.loc[best_platform_row, 'platform']
best_post_type = post_type_engagement.loc[best_post_type_row, 'post_type']

print("Key Metrics Summary:")
print(f"- Overall average engagement rate: {df['engagement_rate'].mean():.4f}")
print(f"- Best performing hour: {best_hour}:00")
print(f"- Best performing weekday: {best_weekday}")
print(f"- Best performing platform: {best_platform}")
print(f"- Best performing post type: {best_post_type}")