In [None]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Plot appearance for the whole notebook. The matplotlib style sheet is
# applied first and the seaborn palette second; both calls adjust the
# global plotting configuration, so the order is kept as-is.
plt.style.use("seaborn-v0_8")
sns.set_palette(palette="husl")

print(" Libraries imported successfully")

✅ Libraries imported successfully


In [None]:
# Load datasets
videos = pd.read_csv("shortform_videos.csv")
creators = pd.read_csv("shortform_creators.csv")
platforms = pd.read_csv("shortform_platforms.csv")

print(f" Loaded {len(videos)} videos, {len(creators)} creators, {len(platforms)} platforms")

# Data preprocessing and feature engineering
# Left join: every video row is kept, creator columns are attached by creator_id.
merged = videos.merge(creators, on='creator_id', how='left')

# Calculate additional metrics
# All of the metrics below are "per view" ratios. A video with zero views
# would otherwise produce +/-inf (or NaN for 0/0), and a single inf is enough
# to distort every group mean and to make the later scaling step fail.
# Mapping 0 -> NaN keeps such rows but marks their ratios as missing.
views_nonzero = merged['views'].replace(0, np.nan)

merged['retention_rate'] = merged['full_views'] / views_nonzero
merged['engagement_rate'] = (
    (merged['likes'] + merged['comments'] + merged['shares']) / views_nonzero
) * 100  # expressed as a percentage of views
merged['avg_watch_time_per_view'] = merged['watch_time'] / views_nonzero
merged['like_to_view_ratio'] = merged['likes'] / views_nonzero
merged['share_to_view_ratio'] = merged['shares'] / views_nonzero

# Create duration bins
# Bins are right-inclusive: (0, 15], (15, 30], (30, 45], (45, 60], (60, inf).
# The last edge is open-ended so that the '60s+' label really covers every
# video longer than 60 seconds (a fixed upper edge of 100 would silently turn
# longer videos into NaN). include_lowest=True also places a duration of
# exactly 0 in the first bin instead of dropping it.
merged['duration_bin'] = pd.cut(
    merged['duration_sec'],
    bins=[0, 15, 30, 45, 60, np.inf],
    labels=['0-15s', '15-30s', '30-45s', '45-60s', '60s+'],
    include_lowest=True,
)

print(" Data preprocessing completed")
merged.head()

📊 Loaded 1000 videos, 50 creators, 5 platforms
✅ Data preprocessing completed


Unnamed: 0,video_id,creator_id,format_type,duration_sec,views,likes,comments,shares,watch_time,full_views,...,platform,creator_name,niche,followers,retention_rate,engagement_rate,avg_watch_time_per_view,like_to_view_ratio,share_to_view_ratio,duration_bin
0,v1,c39,meme,10,7693,4588,104,499,105186,3458,...,YouTube Shorts,Creator 39,Tech,183058,0.4495,67.476927,13.672949,0.596386,0.064864,0-15s
1,v2,c29,meme,28,2613,4399,692,92,166707,5011,...,Facebook Reels,Creator 29,Beauty,368029,1.917719,198.354382,63.799082,1.683506,0.035209,15-30s
2,v3,c15,reaction,40,16780,570,555,684,270609,6897,...,Facebook Reels,Creator 15,Tech,393573,0.411025,10.780691,16.126877,0.033969,0.040763,30-45s
3,v4,c43,reaction,25,14065,1026,281,78,57754,15975,...,YouTube Shorts,Creator 43,Tech,49825,1.135798,9.847138,4.106221,0.072947,0.005546,15-30s
4,v5,c8,duet,23,10683,3772,686,621,230047,11239,...,Facebook Reels,Creator 8,Tech,394086,1.052045,47.542825,21.533932,0.353084,0.05813,15-30s


In [None]:
# Format performance analysis
# For every video format, report the mean and the standard deviation of each
# listed metric. The aggregation spec is built from a single tuple so that
# all metrics are guaranteed to receive the same pair of statistics.
format_stats = (
    merged
    .groupby('format_type')
    .agg({
        metric: ['mean', 'std']
        for metric in (
            'views',
            'likes',
            'comments',
            'shares',
            'retention_rate',
            'engagement_rate',
            'hook_watch_rate',
        )
    })
    .round(3)
)

print(" Format Performance Summary:")
format_stats

📊 Format Performance Summary:


Unnamed: 0_level_0,views,views,likes,likes,comments,comments,shares,shares,retention_rate,retention_rate,engagement_rate,engagement_rate,hook_watch_rate,hook_watch_rate
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
format_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
duet,10375.948,5473.994,2527.938,1478.998,516.255,294.511,495.641,303.929,1.497,2.209,57.444,68.789,0.712,0.17
meme,10540.789,5809.654,2763.76,1350.493,517.833,287.899,482.299,280.933,1.668,2.708,66.908,88.983,0.693,0.17
reaction,10544.645,5465.332,2501.729,1375.689,538.98,294.214,465.754,285.295,1.652,2.623,61.846,100.868,0.684,0.182
story,9770.779,5561.246,2516.789,1460.428,509.08,295.654,505.668,275.263,1.793,2.746,79.51,126.342,0.695,0.173
tutorial,10422.535,5570.043,2495.762,1393.486,506.292,285.706,496.054,278.093,1.613,2.326,62.665,98.7,0.707,0.179


In [None]:
# Creator performance ranking
# Average each metric per (creator, niche) pair.
#
# reset_index() moves creator_name and niche out of the index and into
# ordinary columns *before* the merge below. Merging on only one level of a
# two-level index drops the other level, which is why the niche was missing
# from the ranking table; with plain columns it is preserved.
creator_stats = (
    merged
    .groupby(['creator_name', 'niche'])
    .agg({
        'views': 'mean',
        'likes': 'mean',
        'shares': 'mean',
        'retention_rate': 'mean',
        'engagement_rate': 'mean',
        'hook_watch_rate': 'mean'
    })
    .round(3)
    .reset_index()
)

# Add follower count
# Left join so a creator without a row in `creators` is kept (followers = NaN).
creator_stats = creator_stats.merge(
    creators[['creator_name', 'followers']],
    on='creator_name',
    how='left'
)

# Rank by mean retention rate, highest first, and keep the first 15 rows.
top_creators = creator_stats.sort_values('retention_rate', ascending=False).head(15)
print(" Top 15 Creators by Retention Rate:")
top_creators

🏆 Top 15 Creators by Retention Rate:


Unnamed: 0,creator_name,views,likes,shares,retention_rate,engagement_rate,hook_watch_rate,followers
24,Creator 31,8601.923,2504.846,391.538,3.435,116.977,0.739,489783
19,Creator 27,6812.438,2840.438,584.188,3.391,112.483,0.698,373428
7,Creator 16,8082.476,2457.095,515.429,3.052,89.78,0.625,198472
20,Creator 28,11279.483,2605.172,453.931,2.66,121.19,0.712,118399
6,Creator 15,9996.688,2462.25,551.312,2.42,87.648,0.724,393573
38,Creator 44,8004.857,2753.381,444.667,2.254,86.435,0.744,14877
48,Creator 8,7077.727,2388.773,513.591,2.216,71.963,0.712,394086
12,Creator 20,11312.0,2724.0,408.867,2.178,81.926,0.71,45484
21,Creator 29,9388.947,3209.263,446.947,2.143,118.772,0.655,368029
30,Creator 37,9346.04,2516.6,618.56,2.098,79.12,0.646,100250


In [None]:
# Niche performance analysis
# Per niche: mean views plus the number of videos (count of the views
# column), and the mean of every other listed metric.
niche_stats = (
    merged
    .groupby('niche')
    .agg({
        'views': ['mean', 'count'],
        **{
            metric: 'mean'
            for metric in (
                'likes',
                'shares',
                'retention_rate',
                'engagement_rate',
                'hook_watch_rate',
            )
        },
    })
    .round(3)
)

print(" Niche Performance Summary:")
niche_stats

🎯 Niche Performance Summary:


Unnamed: 0_level_0,views,views,likes,shares,retention_rate,engagement_rate,hook_watch_rate
Unnamed: 0_level_1,mean,count,mean,mean,mean,mean,mean
niche,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Beauty,10043.704,216,2464.602,487.463,1.571,67.828,0.672
Comedy,10611.854,185,2537.897,497.968,1.68,65.549,0.702
Fitness,11318.622,90,2505.611,474.433,1.842,76.901,0.713
Storytelling,10230.843,172,2656.488,471.953,1.683,65.821,0.706
Tech,10153.677,337,2604.513,497.448,1.604,61.407,0.705


In [None]:
# Duration performance analysis
# duration_bin is a categorical column, so the groupby result depends on the
# `observed` setting. It is passed explicitly instead of relying on the
# library default (which is being changed and whose deprecation warning is
# hidden by the notebook-wide warning filter). observed=False keeps a row for
# every duration label, including labels with no videos (shown as NaN).
duration_stats = (
    merged
    .groupby('duration_bin', observed=False)
    .agg({
        'views': 'mean',
        'retention_rate': 'mean',
        'engagement_rate': 'mean',
        'hook_watch_rate': 'mean'
    })
    .round(3)
)

print(" Duration Performance Summary:")
duration_stats

⏱️ Duration Performance Summary:


Unnamed: 0_level_0,views,retention_rate,engagement_rate,hook_watch_rate
duration_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0-15s,11045.267,1.453,54.537,0.694
15-30s,10348.83,1.577,60.997,0.716
30-45s,10124.922,1.817,75.51,0.675
45-60s,10226.769,1.623,65.379,0.705
60s+,,,,


In [None]:
# Correlation analysis
numeric_cols = [
    'views', 'likes', 'comments', 'shares', 'watch_time',
    'full_views', 'retention_rate', 'hook_watch_rate',
    'engagement_rate', 'avg_watch_time_per_view',
    'like_to_view_ratio', 'share_to_view_ratio'
]

# Pairwise correlation between all numeric metrics (pandas default method).
correlation_matrix = merged[numeric_cols].corr()

# Show top correlations
# A raw unstack() of the matrix ranks the diagonal first (every metric
# correlates 1.0 with itself) and lists each pair twice, (a, b) and (b, a),
# so the "top 10" would contain no information. Keeping only the cells
# strictly above the diagonal leaves each distinct pair exactly once.
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
correlations = (
    correlation_matrix
    .where(upper_triangle)   # everything on/below the diagonal becomes NaN
    .unstack()
    .dropna()                # discard the masked cells
    .sort_values(ascending=False)
)
print(" Top 10 Metric Correlations:")
correlations.head(10)

🔗 Top 10 Metric Correlations:


views                    views                      1.0
likes                    likes                      1.0
like_to_view_ratio       like_to_view_ratio         1.0
retention_rate           retention_rate             1.0
watch_time               watch_time                 1.0
full_views               full_views                 1.0
share_to_view_ratio      share_to_view_ratio        1.0
avg_watch_time_per_view  avg_watch_time_per_view    1.0
hook_watch_rate          hook_watch_rate            1.0
engagement_rate          engagement_rate            1.0
dtype: float64

In [None]:
# Performance clustering
features = ['views', 'likes', 'shares', 'retention_rate', 'engagement_rate']

# Scale features
# The clustering runs on the standardized feature columns rather than on the
# raw values.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(merged[features])

# Perform clustering
# Four clusters with a fixed random_state; the fitted labels are stored as a
# new 'cluster' column on `merged`.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(scaled_features)
merged['cluster'] = kmeans.labels_

# Analyze clusters
# Mean of each (unscaled) feature per cluster, plus the cluster size.
cluster_analysis = (
    merged
    .groupby('cluster')[features]
    .mean()
    .round(3)
    .assign(count=merged.groupby('cluster').size())
)

print(" Performance Clusters:")
cluster_analysis

🎯 Performance Clusters:


Unnamed: 0_level_0,views,likes,shares,retention_rate,engagement_rate,count
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,13331.079,1396.676,372.785,0.744,18.732,340
1,1142.233,3067.163,451.93,10.846,407.855,43
2,4184.537,2276.933,582.176,2.62,104.374,255
3,12939.401,3797.323,536.729,0.714,41.966,362


In [None]:
# Generate key insights
print(" KEY INSIGHTS FROM ANALYSIS:")
print("=" * 50)

# Format insights
# Format whose videos have the highest mean retention rate.
best_format = merged.groupby('format_type')['retention_rate'].mean().idxmax()
print(f" Best Performing Format: {best_format} has the highest average retention rate")

# Creator insights
# creator_stats is sorted by mean retention rate; the first row is the leader.
top_creators_sorted = creator_stats.sort_values('retention_rate', ascending=False).reset_index()
if not top_creators_sorted.empty:
    top_creator = top_creators_sorted.iloc[0]['creator_name']
    # The leading glyph of this message was stored as two replacement
    # characters (mis-encoded). NOTE(review): restored as the trophy emoji,
    # matching the one shown in the creator-ranking output - confirm.
    print(f"🏆 Top Creator: {top_creator} leads in retention rate")
else:
    print(" Top Creator: Analysis available in detailed report")

# Niche insights
# Niche with the highest mean engagement rate.
best_niche = merged.groupby('niche')['engagement_rate'].mean().idxmax()
print(f" Most Engaging Niche: {best_niche} generates highest engagement rates")

# Duration insights
# duration_bin is categorical; `observed` is passed explicitly so the result
# does not depend on the library default. Empty bins yield NaN means, which
# idxmax() skips.
optimal_duration = (
    merged
    .groupby('duration_bin', observed=False)['retention_rate']
    .mean()
    .idxmax()
)
print(f"⏱ Optimal Duration: {optimal_duration} videos perform best")

# Hook insights
avg_hook = merged['hook_watch_rate'].mean()
print(f" Average Hook Watch Rate: {avg_hook:.3f}")

# Overall performance
avg_retention = merged['retention_rate'].mean()
avg_engagement = merged['engagement_rate'].mean()
print(f" Overall Average Retention: {avg_retention:.3f}")
print(f" Overall Average Engagement: {avg_engagement:.3f}%")

💡 KEY INSIGHTS FROM ANALYSIS:
📊 Best Performing Format: story has the highest average retention rate
�� Top Creator: Creator 31 leads in retention rate
🎯 Most Engaging Niche: Fitness generates highest engagement rates
⏱️ Optimal Duration: 30-45s videos perform best
�� Average Hook Watch Rate: 0.698
📈 Overall Average Retention: 1.646
📈 Overall Average Engagement: 65.714%
