# Data Exploration for Cross-Modal Audience Intelligence

This notebook explores the multimodal dataset used for audience engagement prediction, including:
- Data loading and inspection
- Exploratory data analysis
- Visual content analysis
- Text content analysis
- Feature correlations and distributions
- Feature engineering
- Engagement metric analysis

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import torch
from pathlib import Path
import os
from tqdm.notebook import tqdm
import datetime
import re
import warnings

# Import platform components
from data.data_loader import DataLoader
from data.connectors.nielsen_connector import NielsenConnector
from data.connectors.streaming_api import StreamingPlatformConnector
from data.connectors.social_crawler import SocialMediaCrawler
from data.preprocessing.text_preprocessor import TextPreprocessor
from data.preprocessing.image_preprocessor import ImagePreprocessor
from data.preprocessing.feature_engineering import FeatureEngineer

# Set up plotting
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so use whichever spelling this install provides.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
else:
    # Neither matplotlib style sheet exists; fall back to seaborn's own whitegrid theme.
    sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)
plt.rcParams['font.size'] = 12
# Blanket suppression keeps the notebook output clean but also hides deprecation and
# pandas warnings; comment this out when debugging.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
torch.manual_seed(42)

## Data Loading

We'll load data from multiple sources, including Nielsen panel data, streaming platform data, and social media data.

In [None]:
# Initialize data connectors
# Credentials are read from the environment so real keys never get committed with the
# notebook; the "YOUR_API_KEY" placeholder stays as the fallback when a variable is unset.
CACHE_DIR = "./data/cache"
nielsen_api_key = os.environ.get("NIELSEN_API_KEY", "YOUR_API_KEY")
netflix_api_key = os.environ.get("NETFLIX_API_KEY", "YOUR_API_KEY")

nielsen_connector = NielsenConnector(api_key=nielsen_api_key, cache_dir=CACHE_DIR)
netflix_connector = StreamingPlatformConnector("netflix", api_key=netflix_api_key, cache_dir=CACHE_DIR)
social_crawler = SocialMediaCrawler(platforms=["twitter", "instagram"], cache_dir=CACHE_DIR)

# Initialize data loader: one entry point that fans out to every connector above
data_loader = DataLoader(
    nielsen_connector=nielsen_connector,
    streaming_connectors={"netflix": netflix_connector},
    social_crawler=social_crawler,
    cache_dir=CACHE_DIR
)

In [None]:
# Load audience data for sample content
# Content IDs and their display names are kept side by side so the pairing is explicit.
sample_content = {
    "SHOW123": "Sample Show 1",
    "SHOW456": "Sample Show 2",
    "SHOW789": "Sample Show 3",
}
requested_metrics = ["views", "engagement", "completion_rate"]

audience_data = data_loader.load_audience_data(
    content_ids=list(sample_content.keys()),
    content_names=list(sample_content.values()),
    start_date=datetime.date(2023, 1, 1),
    end_date=datetime.date(2023, 1, 31),
    metrics=requested_metrics,
    include_social=True,
    use_cache=True
)

# Display data sources
available_sources = list(audience_data.keys())
print(f"Data sources: {available_sources}")

In [None]:
# Pull the Nielsen panel frame out of the loaded sources and preview its first rows
nielsen_df = audience_data.get("nielsen")
nielsen_shape = nielsen_df.shape
print(f"Nielsen data shape: {nielsen_shape}")
nielsen_df.head(5)

In [None]:
# Pull the streaming platform frame out of the loaded sources and preview its first rows
streaming_df = audience_data.get("streaming")
streaming_shape = streaming_df.shape
print(f"Streaming data shape: {streaming_shape}")
streaming_df.head(5)

In [None]:
# Display social media data
social_df = audience_data.get("social")
if social_df is None:
    # Social data is optional (it is requested via include_social). Fall back to an
    # empty frame so the `in social_df.columns` guards in later cells skip cleanly
    # instead of failing with an AttributeError on None.
    print("No social media data returned; using an empty DataFrame.")
    social_df = pd.DataFrame()
print(f"Social media data shape: {social_df.shape}")
social_df.head()

## Data Summary Statistics

Let's examine the basic statistics of our datasets.

In [None]:
# Descriptive statistics for the Nielsen panel frame
print("Nielsen data summary:")
nielsen_summary = nielsen_df.describe()
nielsen_summary

In [None]:
# Descriptive statistics for the streaming platform frame
print("Streaming data summary:")
streaming_summary = streaming_df.describe()
streaming_summary

In [None]:
# Per-content summary of each engagement metric (same five statistics for every metric)
summary_stats = ('mean', 'median', 'min', 'max', 'std')
aggregation_spec = {
    metric: list(summary_stats)
    for metric in ('views', 'engagement', 'completion_rate')
}
engagement_by_content = streaming_df.groupby('content_id').agg(aggregation_spec)

engagement_by_content

## Data Visualization

Let's visualize the data to better understand distributions and relationships.

In [None]:
# Visualize engagement metrics across content: one box plot panel per metric
boxplot_panels = [
    ('views', 'Views by Content'),
    ('engagement', 'Engagement by Content'),
    ('completion_rate', 'Completion Rate by Content'),
]

plt.figure(figsize=(16, 6))
for panel_position, (metric, panel_title) in enumerate(boxplot_panels, start=1):
    plt.subplot(1, 3, panel_position)
    sns.boxplot(x='content_id', y=metric, data=streaming_df)
    plt.title(panel_title)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Heatmap of pairwise correlations between the three engagement metrics
plt.figure(figsize=(12, 10))
metric_columns = ['views', 'engagement', 'completion_rate']
correlation_matrix = streaming_df[metric_columns].corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Matrix of Engagement Metrics')
plt.show()

In [None]:
# Visualize social media sentiment over time
if 'post_date' in social_df.columns and 'sentiment' in social_df.columns:
    # Parse dates on a derived frame and group with Grouper(key=...) instead of
    # set_index(..., inplace=True): moving post_date into the index of social_df made
    # this cell a no-op on re-run and hid the column from later cells that check for it.
    social_by_date = social_df.assign(post_date=pd.to_datetime(social_df['post_date']))

    # Group by day and platform
    daily_sentiment = (
        social_by_date
        .groupby([pd.Grouper(key='post_date', freq='D'), 'platform'])['sentiment']
        .mean()
        .reset_index()
    )

    plt.figure(figsize=(14, 6))
    sns.lineplot(x='post_date', y='sentiment', hue='platform', data=daily_sentiment, marker='o')
    plt.title('Average Sentiment by Platform Over Time')
    plt.ylabel('Sentiment Score')
    plt.xlabel('Date')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## Visual Content Analysis

Let's load and analyze some sample images from our content.

In [None]:
# Function to load sample images
def load_sample_images(content_ids, image_dir="./data/images"):
    """Load one JPEG per content ID from ``image_dir``.

    Args:
        content_ids: Iterable of content identifiers; each one is looked up as
            ``<image_dir>/<content_id>.jpg``.
        image_dir: Directory that holds the image files.

    Returns:
        dict mapping content ID to a PIL image for every ID whose file exists.
        IDs without a matching file are skipped.
    """
    image_root = Path(image_dir)
    images = {}
    for content_id in content_ids:
        image_path = image_root / f"{content_id}.jpg"
        if not image_path.exists():
            continue
        # Image.open is lazy and keeps the file open; read the pixel data inside a
        # context manager so the handle is released while the image stays usable.
        with Image.open(image_path) as img:
            img.load()
        images[content_id] = img
    return images

# Load the poster image for each sample show
sample_images = load_sample_images(["SHOW123", "SHOW456", "SHOW789"])

# Show the images side by side, or report that none were found on disk
if not sample_images:
    print("No sample images found.")
else:
    plt.figure(figsize=(18, 6))
    image_count = len(sample_images)
    for panel_position, (content_id, img) in enumerate(sample_images.items(), start=1):
        plt.subplot(1, image_count, panel_position)
        plt.imshow(img)
        plt.title(f"Content ID: {content_id}")
        plt.axis('off')
    plt.tight_layout()
    plt.show()

In [None]:
# Initialize image preprocessor
image_preprocessor = ImagePreprocessor(target_size=(224, 224), normalize=True)

# Run every loaded sample image through the preprocessor, keyed by content ID
if sample_images:
    processed_images = {
        content_id: image_preprocessor.preprocess(img)
        for content_id, img in sample_images.items()
    }

    # Report the shape of the first processed image
    first_processed = next(iter(processed_images.values()))
    print(f"Processed image tensor shape: {first_processed.shape}")

## Text Content Analysis

Let's analyze the text content from descriptions, social media posts, etc.

In [None]:
# Collect the non-null post texts from the social media frame
if 'text' in social_df.columns:
    text_content = social_df['text'].dropna().tolist()
    print(f"Number of text samples: {len(text_content)}")

    # Preview the first five texts, truncating anything longer than 100 characters
    for sample_number, sample_text in enumerate(text_content[:5], start=1):
        if len(sample_text) > 100:
            preview_line = f"Sample {sample_number}: {sample_text[:100]}..."
        else:
            preview_line = f"Sample {sample_number}: {sample_text}"
        print(preview_line)

In [None]:
# Initialize text preprocessor
text_preprocessor = TextPreprocessor(remove_stopwords=True, lowercase=True)

# Clean and tokenize the first five texts, keeping each stage for comparison
if 'text' in social_df.columns:
    processed_texts = []
    for raw_text in text_content[:5]:
        cleaned = text_preprocessor.clean_text(raw_text)
        processed_texts.append({
            'original': raw_text,
            'cleaned': cleaned,
            'tokens': text_preprocessor.tokenize(cleaned),
        })

    # Print a truncated view of each stage (50 characters, first 10 tokens)
    for sample_number, record in enumerate(processed_texts, start=1):
        token_preview = ', '.join(record['tokens'][:10])
        print(f"Sample {sample_number}:")
        print(f"Original: {record['original'][:50]}...")
        print(f"Cleaned: {record['cleaned'][:50]}...")
        print(f"Tokens: {token_preview}...\n")

In [None]:
# Analyze word frequency in text content
if 'text' in social_df.columns:
    from collections import Counter
    import nltk
    from nltk.corpus import stopwords

    # Download NLTK resources if needed
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')

    # NOTE(review): text_preprocessor is built with remove_stopwords=True, so this
    # second stop-word filter may be redundant — confirm against TextPreprocessor.
    stop_words = set(stopwords.words('english'))

    # Count tokens incrementally instead of materializing every token in one list
    word_counts = Counter()
    for text in text_content:
        clean_text = text_preprocessor.clean_text(text)
        tokens = text_preprocessor.tokenize(clean_text)
        word_counts.update(token for token in tokens if token not in stop_words)

    top_words = word_counts.most_common(20)

    if not top_words:
        # zip(*[]) cannot be unpacked into (words, counts); skip the plot when there are
        # no texts or every token was filtered out.
        print("No words left after preprocessing; skipping the word-frequency plot.")
    else:
        # Plot word frequency
        plt.figure(figsize=(14, 6))
        words, counts = zip(*top_words)
        plt.bar(words, counts)
        plt.title('Top 20 Words in Social Media Content')
        plt.xticks(rotation=45, ha='right')
        plt.xlabel('Word')
        plt.ylabel('Frequency')
        plt.tight_layout()
        plt.show()

## Feature Engineering

Let's perform feature engineering to prepare data for model training.

In [None]:
# Initialize feature engineer
feature_engineer = FeatureEngineer(scaler="standard")

# Prepare features from streaming data
if not streaming_df.empty:
    # Select relevant columns for features
    feature_cols = ['views', 'completion_rate']
    if 'average_watch_time' in streaming_df.columns:
        feature_cols.append('average_watch_time')

    # Add temporal features if date column exists
    if 'date' in streaming_df.columns:
        streaming_df = feature_engineer.add_temporal_features(streaming_df, date_columns=['date'])
        # Add temporal features to our feature columns
        feature_cols.extend(['date_month', 'date_day', 'date_dayofweek'])

    # Fit the feature engineer
    # NOTE(review): the frame passed to fit() holds only feature_cols, so it has no
    # 'engagement' column even though target_col='engagement' — confirm that
    # FeatureEngineer.fit does not need the target present in its input.
    feature_engineer.fit(streaming_df[feature_cols], target_col='engagement')

    # Transform the features
    feature_matrix = feature_engineer.transform(streaming_df[feature_cols])

    print(f"Feature matrix shape: {feature_matrix.shape}")

    # Create a DataFrame with the transformed features
    feature_df = pd.DataFrame(
        feature_matrix,
        columns=feature_engineer.get_feature_names(feature_cols)
    )

    # Display the engineered features. A bare expression inside an `if` block is not
    # rendered as cell output, so the preview has to be displayed explicitly
    # (`display` is provided by the IPython kernel).
    display(feature_df.head())

## Engagement Analysis

Let's analyze patterns in audience engagement.

In [None]:
# Plot mean daily engagement per content item
if all(column in streaming_df.columns for column in ('date', 'engagement')):
    streaming_df['date'] = pd.to_datetime(streaming_df['date'])

    # One row per day, one column per content_id
    day_and_content = [pd.Grouper(key='date', freq='D'), 'content_id']
    engagement_means = streaming_df.groupby(day_and_content)['engagement'].mean()
    daily_engagement = engagement_means.unstack()

    # Draw one line per content item on a fresh figure
    plt.figure(figsize=(14, 6))
    current_axes = plt.gca()
    daily_engagement.plot(marker='o', ax=current_axes)
    plt.title('Daily Engagement by Content')
    plt.ylabel('Engagement Score')
    plt.xlabel('Date')
    plt.grid(True, alpha=0.3)
    plt.legend(title='Content ID')
    plt.tight_layout()
    plt.show()

In [None]:
# Analyze correlation between social sentiment and engagement
if 'sentiment' in social_df.columns and 'engagement' in streaming_df.columns:
    # The sentiment-over-time cell may have moved post_date into the index of social_df
    # (set_index(..., inplace=True)). Restore it as a column on a local frame so this
    # analysis is not silently skipped.
    social_posts = social_df
    if 'post_date' not in social_posts.columns and social_posts.index.name == 'post_date':
        social_posts = social_posts.reset_index()

    # Aggregate sentiment by content and date
    if 'content_name' in social_posts.columns and 'post_date' in social_posts.columns:
        # Parse dates on a derived frame rather than mutating social_df.
        social_posts = social_posts.assign(post_date=pd.to_datetime(social_posts['post_date']))
        daily_sentiment = (
            social_posts
            .groupby(['content_name', pd.Grouper(key='post_date', freq='D')])['sentiment']
            .mean()
            .reset_index()
        )

        # Map content_name to content_id so the two sources share a join key.
        # Names missing from the mapping become NaN and drop out of the inner merge.
        content_mapping = {
            'Sample Show 1': 'SHOW123',
            'Sample Show 2': 'SHOW456',
            'Sample Show 3': 'SHOW789'
        }
        daily_sentiment['content_id'] = daily_sentiment['content_name'].map(content_mapping)

        # Merge with streaming data
        if 'date' in streaming_df.columns:
            # Ensure date is datetime (no-op when it already is), again on a derived frame.
            streaming_daily = streaming_df.assign(date=pd.to_datetime(streaming_df['date']))

            # Aggregate engagement by content and date
            daily_engagement = (
                streaming_daily
                .groupby(['content_id', pd.Grouper(key='date', freq='D')])['engagement']
                .mean()
                .reset_index()
            )

            # Merge datasets
            # NOTE(review): assumes post_date and date are both tz-naive or share a
            # timezone; otherwise the join keys will not match — confirm.
            sentiment_engagement = pd.merge(
                daily_sentiment,
                daily_engagement,
                left_on=['content_id', 'post_date'],
                right_on=['content_id', 'date'],
                how='inner'
            )

            if sentiment_engagement.empty:
                print("No overlapping sentiment and engagement observations to plot.")
            else:
                # Plot correlation
                plt.figure(figsize=(10, 8))
                sns.scatterplot(x='sentiment', y='engagement', hue='content_id', data=sentiment_engagement)
                plt.title('Correlation between Social Media Sentiment and Engagement')
                plt.xlabel('Sentiment Score')
                plt.ylabel('Engagement Score')
                plt.grid(True, alpha=0.3)

                # Add correlation line. linregress raises when given fewer than two
                # points or a constant x, so only fit when the data supports it.
                fit_points = sentiment_engagement[['sentiment', 'engagement']].dropna()
                if len(fit_points) >= 2 and fit_points['sentiment'].nunique() > 1:
                    from scipy import stats
                    slope, intercept, r_value, p_value, std_err = stats.linregress(
                        fit_points['sentiment'], fit_points['engagement']
                    )
                    plt.plot(fit_points['sentiment'], intercept + slope * fit_points['sentiment'], 'r', alpha=0.7)
                    plt.annotate(f'r = {r_value:.2f}, p = {p_value:.4f}', xy=(0.05, 0.95), xycoords='axes fraction')
                else:
                    print("Not enough distinct data points to fit a regression line.")

                plt.tight_layout()
                plt.show()

## Conclusion

In this notebook, we explored multimodal data related to audience engagement, including:
- Basic data loading and inspection
- Distribution and correlation of engagement metrics
- Analysis of visual content
- Analysis of text content
- Feature engineering for model training
- Relationship between social sentiment and engagement

These insights will be used to inform model training and causal analysis in subsequent notebooks.