# YouTube Sri Lankan Content Analysis

This notebook performs an exploratory data analysis (EDA) of YouTube videos from Sri Lankan channels, using the processed dataset stored under `../data/processed` (generated by `process_data.py`).

## Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# pandas display options: no cap on the number of columns shown, no fixed
# display width, and at most 50 characters rendered per cell
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = 50

# Plot styling: apply the matplotlib style sheet first, then seaborn's palette
# (the palette is set after the style sheet so it is the one left in effect)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
# Load processed data
import sys
import os

# Make the project's helper scripts importable. The membership check keeps
# re-runs of this cell from appending the same entry to sys.path again.
if '../scripts' not in sys.path:
    sys.path.append('../scripts')

from utils import load_from_csv

# Load the most recent processed data file.
# A missing directory is treated exactly like an empty one, so the notebook
# falls through to the demo data below instead of stopping with FileNotFoundError.
processed_dir = '../data/processed'
try:
    directory_entries = os.listdir(processed_dir)
except FileNotFoundError:
    directory_entries = []

processed_files = [
    f for f in directory_entries
    if f.startswith('processed_videos_') and f.endswith('.csv')
]

if processed_files:
    # max() compares file names as strings; this selects the newest file only if
    # the suffix after 'processed_videos_' sorts chronologically
    # (e.g. a zero-padded timestamp) -- TODO confirm the naming scheme.
    latest_file = max(processed_files)
    df = load_from_csv(f'{processed_dir}/{latest_file}')
    print(f"Loaded {len(df)} videos from {latest_file}")
    print(f"Dataset shape: {df.shape}")
else:
    print("No processed data files found. Please run process_data.py first.")
    # Create sample data for demonstration.
    # Only these five columns exist in the demo frame; cells that need other
    # columns (category, publish date, ...) check for them before running.
    df = pd.DataFrame({
        'video_id': ['sample_1', 'sample_2'],
        'title': ['Sample Video 1', 'Sample Video 2'],
        'view_count': [1000, 2000],
        'like_count': [50, 100],
        'comment_count': [10, 20]
    })

## Data Overview

In [None]:
# Headline facts about the loaded frame (reported before the date column is converted below)
print(
    "Dataset Info:",
    f"Total videos: {len(df)}",
    f"Total features: {len(df.columns)}",
    f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
    sep="\n",
)

# Parse publication timestamps in place so that later cells can use the .dt accessor
if 'published_at' in df.columns:
    df['published_at'] = pd.to_datetime(df['published_at'])
    print(f"Date range: {df['published_at'].min()} to {df['published_at'].max()}")

# Number of columns per dtype
print("\nColumn types:", df.dtypes.value_counts(), sep="\n")

In [None]:
# Preview the first five rows; as the cell's last expression it is shown with rich display
df.head(n=5)

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) restricted to numeric columns.
# numeric_cols is left as a notebook-level name holding the numeric column labels.
numeric_cols = df.select_dtypes(include=np.number).columns
df.loc[:, numeric_cols].describe()

## Channel Category Analysis

In [None]:
if 'channel_category' not in df.columns:
    print("Channel category information not available")
else:
    # Number of videos in each channel category
    category_counts = df['channel_category'].value_counts()

    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: absolute counts per category as bars
    category_counts.plot(kind='bar', ax=bar_ax, color='skyblue')
    bar_ax.set_title('Number of Videos by Category')
    bar_ax.set_xlabel('Category')
    bar_ax.set_ylabel('Number of Videos')
    bar_ax.tick_params(axis='x', rotation=45)

    # Right panel: the same counts as shares of the whole
    pie_ax.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
    pie_ax.set_title('Distribution of Videos by Category')

    plt.tight_layout()
    plt.show()

    # Text version of the figure: count and percentage of all videos per category
    print("Videos by category:")
    for category, count in category_counts.items():
        print(f"  {category}: {count} videos ({count/len(df)*100:.1f}%)")

## Performance Metrics Analysis

In [None]:
# Performance metrics by category
required_cols = ['view_count', 'like_count', 'comment_count']
if 'channel_category' in df.columns and all(col in df.columns for col in required_cols):
    # Build the aggregation spec step by step. groupby().agg() raises KeyError for
    # any dict key that is not a column of the frame, so 'engagement_ratio' is only
    # requested when the column exists (it may not have been computed yet at this
    # point in the notebook).
    agg_spec = {
        'view_count': ['mean', 'median', 'std'],
        'like_count': ['mean', 'median'],
        'comment_count': ['mean', 'median'],
    }
    if 'engagement_ratio' in df.columns:
        agg_spec['engagement_ratio'] = ['mean', 'median']

    performance_by_category = df.groupby('channel_category').agg(agg_spec).round(2)

    print("Performance metrics by category:")
    print(performance_by_category)

    # Visualize average views by category, highest first
    avg_views = df.groupby('channel_category')['view_count'].mean().sort_values(ascending=False)

    # Explicit figure/axes instead of the pyplot state machine
    fig, ax = plt.subplots(figsize=(12, 6))
    avg_views.plot(kind='bar', ax=ax, color='lightcoral')
    ax.set_title('Average Views by Channel Category')
    ax.set_xlabel('Category')
    ax.set_ylabel('Average Views')
    ax.tick_params(axis='x', rotation=45)
    plt.tight_layout()
    plt.show()

## Engagement Analysis

In [None]:
if all(col in df.columns for col in ['view_count', 'like_count', 'comment_count']):
    # Calculate engagement metrics if not already present.
    # The +1 in each denominator avoids division by zero for videos with 0 views.
    if 'engagement_ratio' not in df.columns:
        df['like_ratio'] = df['like_count'] / (df['view_count'] + 1)
        df['comment_ratio'] = df['comment_count'] / (df['view_count'] + 1)
        df['engagement_ratio'] = (df['like_count'] + df['comment_count']) / (df['view_count'] + 1)

    # Engagement distribution (2x2 grid). Counts are plotted as log10(x + 1) so
    # that zero counts are representable on the log scale.
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Views distribution
    axes[0,0].hist(np.log10(df['view_count'] + 1), bins=30, alpha=0.7, color='blue')
    axes[0,0].set_title('Distribution of Views (log scale)')
    axes[0,0].set_xlabel('Log10(Views + 1)')
    axes[0,0].set_ylabel('Frequency')

    # Likes vs Views scatter
    axes[0,1].scatter(np.log10(df['view_count'] + 1), np.log10(df['like_count'] + 1), alpha=0.5)
    axes[0,1].set_title('Likes vs Views (log scale)')
    axes[0,1].set_xlabel('Log10(Views + 1)')
    axes[0,1].set_ylabel('Log10(Likes + 1)')

    # Engagement ratio distribution
    axes[1,0].hist(df['engagement_ratio'], bins=30, alpha=0.7, color='green')
    axes[1,0].set_title('Distribution of Engagement Ratio')
    axes[1,0].set_xlabel('Engagement Ratio')
    axes[1,0].set_ylabel('Frequency')

    # Comments vs Views scatter
    axes[1,1].scatter(np.log10(df['view_count'] + 1), np.log10(df['comment_count'] + 1), alpha=0.5, color='red')
    axes[1,1].set_title('Comments vs Views (log scale)')
    axes[1,1].set_xlabel('Log10(Views + 1)')
    axes[1,1].set_ylabel('Log10(Comments + 1)')

    plt.tight_layout()
    plt.show()

    # Top performing videos
    print("\nTop 10 videos by views:")
    # Select only the descriptive columns that actually exist. Selecting a missing
    # column raises KeyError, and the demo frame created when no processed file is
    # found has no 'channel_title'.
    wanted_cols = ['title', 'channel_title', 'view_count', 'like_count', 'comment_count']
    present_cols = [col for col in wanted_cols if col in df.columns]
    top_videos = df.nlargest(10, 'view_count')[present_cols]
    for idx, row in top_videos.iterrows():
        # str() guards against missing titles (NaN is a float and cannot be sliced);
        # without a title column the row's index label is used as the identifier.
        video_label = str(row['title']) if 'title' in top_videos.columns else f"video {idx}"
        print(f"  {video_label[:50]}... - {row['view_count']:,} views")

## Temporal Analysis

In [None]:
if 'published_at' in df.columns:
    # The .dt accessor needs datetime values. Converting into a local Series here
    # keeps this cell independent of whether an earlier cell already converted
    # the column.
    # NOTE(review): hours/days are taken in whatever timezone 'published_at'
    # carries -- confirm whether they should be converted to Sri Lanka local time.
    published = pd.to_datetime(df['published_at'])

    # Publishing patterns (stored on df as new columns)
    df['publish_hour'] = published.dt.hour
    df['publish_day'] = published.dt.day_name()
    df['publish_month'] = published.dt.month_name()

    # The view-based panels and the "best time" summary need 'view_count';
    # without it only the publication-count row is drawn.
    has_views = 'view_count' in df.columns
    n_rows = 2 if has_views else 1
    # squeeze=False keeps `axes` two-dimensional even when there is a single row
    fig, axes = plt.subplots(n_rows, 2, figsize=(15, 5 * n_rows), squeeze=False)

    # Publishing by hour
    hourly_counts = df['publish_hour'].value_counts().sort_index()
    axes[0,0].bar(hourly_counts.index, hourly_counts.values, color='lightblue')
    axes[0,0].set_title('Videos Published by Hour of Day')
    axes[0,0].set_xlabel('Hour')
    axes[0,0].set_ylabel('Number of Videos')

    # Publishing by day of week; days without any video count as 0 rather than NaN
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    daily_counts = df['publish_day'].value_counts().reindex(day_order, fill_value=0)
    axes[0,1].bar(range(len(daily_counts)), daily_counts.values, color='lightgreen')
    axes[0,1].set_title('Videos Published by Day of Week')
    axes[0,1].set_xlabel('Day')
    axes[0,1].set_ylabel('Number of Videos')
    axes[0,1].set_xticks(range(len(day_order)))
    axes[0,1].set_xticklabels([d[:3] for d in day_order])

    if has_views:
        # Average views by hour
        hourly_views = df.groupby('publish_hour')['view_count'].mean()
        axes[1,0].plot(hourly_views.index, hourly_views.values, marker='o', color='orange')
        axes[1,0].set_title('Average Views by Publishing Hour')
        axes[1,0].set_xlabel('Hour')
        axes[1,0].set_ylabel('Average Views')

        # Average views by day (NaN for days without videos: there is no mean to show)
        daily_views = df.groupby('publish_day')['view_count'].mean().reindex(day_order)
        axes[1,1].bar(range(len(daily_views)), daily_views.values, color='coral')
        axes[1,1].set_title('Average Views by Day of Week')
        axes[1,1].set_xlabel('Day')
        axes[1,1].set_ylabel('Average Views')
        axes[1,1].set_xticks(range(len(day_order)))
        axes[1,1].set_xticklabels([d[:3] for d in day_order])

    plt.tight_layout()
    plt.show()

    if has_views:
        # Best times to publish
        print("\nBest publishing times (by average views):")
        print(f"  Best hour: {hourly_views.idxmax()}:00 ({hourly_views.max():.0f} avg views)")
        print(f"  Best day: {daily_views.idxmax()} ({daily_views.max():.0f} avg views)")

## Content Analysis

In [None]:
if 'title' in df.columns:
    # Title length analysis (character count, computed once and stored on df)
    if 'title_length' not in df.columns:
        df['title_length'] = df['title'].str.len()

    # The views panel and the per-bin averages need 'view_count'; without it
    # only the title-length histogram is drawn.
    has_views = 'view_count' in df.columns
    # squeeze=False keeps `axes` two-dimensional for either panel count
    fig, axes = plt.subplots(1, 2 if has_views else 1, figsize=(15, 5), squeeze=False)

    # Title length distribution
    axes[0,0].hist(df['title_length'], bins=30, alpha=0.7, color='purple')
    axes[0,0].set_title('Distribution of Title Lengths')
    axes[0,0].set_xlabel('Title Length (characters)')
    axes[0,0].set_ylabel('Frequency')

    if has_views:
        # Title length vs views (views as log10(x + 1) so zero views can be plotted)
        axes[0,1].scatter(df['title_length'], np.log10(df['view_count'] + 1), alpha=0.5)
        axes[0,1].set_title('Title Length vs Views')
        axes[0,1].set_xlabel('Title Length (characters)')
        axes[0,1].set_ylabel('Log10(Views + 1)')

    plt.tight_layout()
    plt.show()

    if has_views:
        # Optimal title length: mean views within 5 equal-width title-length bins.
        # observed=True is passed explicitly so the result does not depend on the
        # pandas version's default, and so bins containing no videos are left out
        # instead of being reported as "nan views".
        title_length_bins = pd.cut(df['title_length'], bins=5)
        avg_views_by_length = df.groupby(title_length_bins, observed=True)['view_count'].mean()

        print("\nAverage views by title length:")
        # The loop variable is deliberately not called `avg_views`, so it cannot
        # rebind the notebook-level Series of that name.
        for length_range, mean_views in avg_views_by_length.items():
            print(f"  {length_range}: {mean_views:.0f} views")

## Predictive Features Analysis

In [None]:
# Correlation analysis of key features
# Re-derive the numeric columns from the current frame: cells above add numeric
# columns (e.g. engagement_ratio, title_length) after numeric_cols was first
# computed, so the earlier value can be stale.
numeric_cols = df.select_dtypes(include=[np.number]).columns
if len(numeric_cols) > 5:
    # Select key features for correlation
    key_features = ['view_count', 'like_count', 'comment_count', 'duration_seconds']
    if 'engagement_ratio' in df.columns:
        key_features.append('engagement_ratio')
    if 'title_length' in df.columns:
        key_features.append('title_length')

    # Keep only features that exist AND are numeric, so .corr() is never handed
    # a text column.
    available_features = [f for f in key_features if f in numeric_cols]

    if len(available_features) > 2:
        correlation_matrix = df[available_features].corr()

        plt.figure(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
        plt.title('Correlation Matrix of Key Features')
        plt.tight_layout()
        plt.show()

        # The ranking is relative to view_count, so it is skipped when that
        # column did not make it into the matrix (indexing it would raise KeyError).
        if 'view_count' in correlation_matrix.columns:
            print("\nStrongest correlations with view_count:")
            # Ranked and printed by absolute value: the sign is dropped here,
            # see the heatmap for the direction of each relationship.
            view_correlations = correlation_matrix['view_count'].abs().sort_values(ascending=False)
            for feature, corr in view_correlations.items():
                if feature != 'view_count':
                    print(f"  {feature}: {corr:.3f}")

## Summary and Insights

In [None]:
# Headline: size of the analysed dataset
print(
    "=== ANALYSIS SUMMARY ===",
    f"Dataset contains {len(df)} videos with {len(df.columns)} features",
    sep="\n",
)

# View-count summary (skipped when the column is absent)
if 'view_count' in df.columns:
    print(
        f"\nViewership Statistics:",
        f"  Average views: {df['view_count'].mean():.0f}",
        f"  Median views: {df['view_count'].median():.0f}",
        f"  Most viewed video: {df['view_count'].max():,} views",
        sep="\n",
    )

# Category summary: most frequent category and, when views are available,
# the category with the highest mean view count
if 'channel_category' in df.columns:
    most_common_category = df['channel_category'].mode()[0]
    print(f"\nMost common category: {most_common_category}")

    if 'view_count' in df.columns:
        mean_views_per_category = df.groupby('channel_category')['view_count'].mean()
        best_category = mean_views_per_category.idxmax()
        print(f"Best performing category: {best_category}")

# Fixed list of follow-up recommendations (static text, not derived from the data)
print(
    "\n=== RECOMMENDATIONS ===",
    "1. Focus on high-performing categories for content strategy",
    "2. Optimize publishing times based on temporal analysis",
    "3. Use title length insights for content optimization",
    "4. Monitor engagement ratios for content quality assessment",
    "5. Use correlation insights for predictive modeling",
    sep="\n",
)