# Stream Analysis Flow
This notebook processes YouTube transcripts and live chat logs to analyze engagement, spikes, and keywords.

## 1. Workspace Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import find_peaks
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
import textwrap

# Custom module
import parsers
import youtube_client # Ensure this file exists in your directory

# Pre-download NLTK data
nltk.download("stopwords", quiet=True)
stops = set(stopwords.words("english"))

# Chart styling
plt.style.use('seaborn-v0_8-darkgrid')

In [None]:
# --- HELPER FUNCTIONS ---
def extract_transcript_by_minutes(transcript_df, minutes):
    """
    Retrieves subtitle text for specific minutes.
    
    Args:
        transcript_df (pd.DataFrame): The dataframe containing transcript data.
        minutes (list or int): A single minute (int) or list of minutes to retrieve.
        
    Returns:
        dict: A dictionary where key is the minute (int) and value is the text (str).
    """
    if isinstance(minutes, (int, float)):
        minutes = [int(minutes)]
        
    results = {}
    for m in minutes:
        start_sec = m * 60
        end_sec = (m + 1) * 60
        
        # Filter rows that start within this minute
        mask = (transcript_df['offset_start_seconds'] >= start_sec) & \
               (transcript_df['offset_start_seconds'] < end_sec)
               
        segment = transcript_df[mask]
        if not segment.empty:
            # Join text and clean up extra spaces
            text = " ".join(segment['text'].tolist())
            results[m] = text.strip()
        else:
            results[m] = "(No speech detected)"
            
    return results

In [None]:
# Configuration
YOUTUBE_URL = "https://www.youtube.com/watch?v=p8sR5q7OGBk"
YT_ID = YOUTUBE_URL.split("=")[-1].split("?")[0]

print(f"Targeting Video ID: {YT_ID}")

## 2. Download & Parse Data

In [None]:
# Download source materials
transcript_filepath = youtube_client.download_transcript(YT_ID)
chat_filepath = youtube_client.download_live_chat(YT_ID)

# Parse DataFrames
parsed_transcript_df = pd.DataFrame()
parsed_chat_df = pd.DataFrame()

if transcript_filepath:
    parsed_transcript_df = parsers.parse_transcript_vtt(transcript_filepath)
else:
    print(f"No transcript available for: {YT_ID}")

if chat_filepath:
    parsed_chat_df = parsers.parse_live_chat_json(chat_filepath)
else:
    print(f"No live chat available for: {YT_ID}")

# Display Info
print("\n--- Transcript Info ---")
parsed_transcript_df.info()
print("\n--- Chat Info ---")
parsed_chat_df.info()

In [None]:
replace_dict = [
    ("Faze", "Phase"),
    ("&gt;&gt; ", ""),
    ("Cleo", "Clio"),
]

for original, replace in replace_dict:
    parsed_transcript_df['text'] = parsed_transcript_df['text'].str.replace(original, replace)

In [None]:
start_time = (1, 14, 0) # in h, m
end_time = (3, 40, 0)
# change to seconds, and reshape the dfs to only preserve the rows within the minutes
start_time_seconds = start_time[0]*60*60 + start_time[1]*60 + end_time[2]
end_time_seconds = end_time[0]*60*60 + end_time[1]*60 + end_time[2]
start_time_minute = start_time[0]*60 + start_time[1]
end_time_minute = end_time[0]*60 + end_time[1]

# TODO: reshape dfs
transcript_mask = (parsed_transcript_df['offset_start_seconds'] >= start_time_seconds) & \
        (parsed_transcript_df['offset_start_seconds'] < end_time_seconds)
parsed_transcript_df = parsed_transcript_df[transcript_mask]

chat_mask = (parsed_chat_df['minute'] >= start_time_minute) & \
        (parsed_chat_df['minute'] < end_time_minute)
parsed_chat_df = parsed_chat_df[chat_mask]

## 3. General Activity Analysis
Identify moments where chat volume spikes significantly.

In [None]:
if not parsed_chat_df.empty:
    # Aggregate by minute
    messages_per_minute = parsed_chat_df.groupby("minute").size().rename("message_count").reset_index()
    
    # Peak Detection
    counts = messages_per_minute["message_count"].values
    # Peak threshold: Mean + 2 Standard Deviations
    threshold = counts.mean() + (2 * counts.std())
    peaks, _ = find_peaks(counts, height=threshold)
    peak_minutes = messages_per_minute.loc[peaks, "minute"].tolist()

    # Plotting
    plt.figure(figsize=(15, 6))
    bars = plt.bar(messages_per_minute['minute'], messages_per_minute['message_count'], 
                   color='skyblue', label='Messages/Min')

    # Highlight spikes
    for minute in peak_minutes:
        idx = messages_per_minute[messages_per_minute['minute'] == minute].index
        if not idx.empty:
            bars[idx[0]].set_color('salmon')

    plt.title(f'Chat Volume per Minute (Spikes > {int(threshold)} msgs)', fontsize=14)
    plt.xlabel('Minute Offset')
    plt.ylabel('Message Count')
    plt.legend()
    plt.tight_layout()
    plt.show()
    
    # Show context for peaks using the new helper function
    if not parsed_transcript_df.empty and peak_minutes:
        print(f"\n--- CONTEXT FOR HIGH TRAFFIC MOMENTS (>{int(threshold)} msgs) ---")
        transcript_context = extract_transcript_by_minutes(parsed_transcript_df, peak_minutes)
        
        for m in sorted(peak_minutes):
            print(f"\n[Minute {m}] {messages_per_minute.loc[messages_per_minute['minute'] == m, 'message_count'].values[0]} msgs")
            print(f"{textwrap.fill(transcript_context[m], width=80, initial_indent='  ', subsequent_indent='  ')}")
else:
    print("Skipping analysis: No chat data.")

## 4. User Analysis

In [None]:
if not parsed_chat_df.empty:
    # Calculate Counts
    user_counts = parsed_chat_df["author_name"].value_counts().reset_index()
    user_counts.columns = ["author_name", "msg_count"]

    # --- Gini & Lorenz Curve ---
    counts = user_counts["msg_count"].values
    counts_sorted = np.sort(counts)
    n = len(counts_sorted)
    
    # Lorenz calc
    cum_counts = np.cumsum(counts_sorted)
    normalized_cum_counts = cum_counts / cum_counts[-1]
    
    # Gini calc
    # Area under Lorenz curve is sum(cum_counts) / total_sum / n roughly, 
    # but standard formula is (2 * Area_between_line_equality_and_lorenz)
    gini = (n + 1 - 2 * np.sum(cum_counts) / cum_counts[-1]) / n

    x_axis = np.linspace(0, 1, len(normalized_cum_counts))

    plt.figure(figsize=(6, 6))
    plt.plot(x_axis, normalized_cum_counts, label=f'Gini: {gini:.3f}', linewidth=2)
    plt.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Equality')
    plt.title('Lorenz Curve (Chat Inequality)')
    plt.xlabel('Cumulative % of Users')
    plt.ylabel('Cumulative % of Messages')
    plt.legend()
    plt.show()

    print(f"Top 5 Chatters:\n{user_counts.head(5)}")

## 5. Keyword & Transcript Deep Dive (New Feature)
Define a list of keywords to see where they appear in chat, and cross-reference with what was being said in the video.

In [None]:
# --- USER CONFIG ---
TARGET_KEYWORDS = ["lmao", "lol", "wow", "gg", "kekw", "wtf", "fuck"] 
MIN_HIT_THRESHOLD = 5 # Only show subtitles if keywords appear > this many times in a minute
# ------------------

if not parsed_chat_df.empty and not parsed_transcript_df.empty:
    
    # 1. Filter chat for keywords
    keyword_hits = []
    
    print("Scanning chat for keywords...")
    for idx, row in parsed_chat_df.iterrows():
        msg_lower = str(row['message']).lower()
        minute = row['minute']
        
        for kw in TARGET_KEYWORDS:
            if kw in msg_lower:
                keyword_hits.append({
                    'minute': minute,
                    'keyword': kw
                })
    
    df_hits = pd.DataFrame(keyword_hits)
    
    if not df_hits.empty:
        # 2. Pivot for Stacked Bar Chart
        pivot_df = df_hits.groupby(['minute', 'keyword']).size().unstack(fill_value=0)
        
        all_minutes = range(int(parsed_chat_df['minute'].min()), int(parsed_chat_df['minute'].max()) + 1)
        pivot_df = pivot_df.reindex(all_minutes, fill_value=0)

        # 3. Plot
        ax = pivot_df.plot(kind='bar', stacked=True, figsize=(18, 8), width=1.0, colormap='viridis')
        
        ticks = ax.xaxis.get_ticklocs()
        ticklabels = [l.get_text() for l in ax.xaxis.get_ticklabels()]
        ax.xaxis.set_ticks(ticks[::10])
        ax.xaxis.set_ticklabels(ticklabels[::10], rotation=0)
        
        plt.title(f'Keyword Frequency per Minute: {TARGET_KEYWORDS}', fontsize=16)
        plt.xlabel('Minute Offset')
        plt.ylabel('Frequency')
        plt.legend(title='Keywords')
        plt.show()
        
        # 4. Context Extraction (Subtitles)
        minutes_meeting_threshold_mask = (pivot_df >= MIN_HIT_THRESHOLD).any(axis=1)
        
        
        # Sort the significant ones by activity
        top_minutes = pivot_df[minutes_meeting_threshold_mask].index.tolist()
        
        if top_minutes:
            print(f"\n--- TRANSCRIPT CONTEXT FOR TOP SPIKES (Threshold: {MIN_HIT_THRESHOLD}+ hits) ---")
            
            # --- USE NEW HELPER FUNCTION HERE ---
            transcript_context = extract_transcript_by_minutes(parsed_transcript_df, top_minutes)
            
            for m in sorted(top_minutes):
                breakdown = dict(pivot_df.loc[m][pivot_df.loc[m] > 0])
                print(f"\n[Minute {m}] Keywords: {breakdown}")
                print(f"{textwrap.fill(transcript_context[m], width=80, initial_indent='  ', subsequent_indent='  ')}")
        else:
            print(f"\nNo minutes found with more than {MIN_HIT_THRESHOLD} keyword hits. Try lowering the threshold.")
                
    else:
        print("No matches found for the provided keywords.")
else:
    print("Chat or Transcript data missing, cannot run Deep Dive.")

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
if not parsed_chat_df.empty:
    analyzer = SentimentIntensityAnalyzer()

    # 1. Calculate sentiment for each chat message
    parsed_chat_df['sentiment'] = parsed_chat_df['message'].apply(lambda msg: analyzer.polarity_scores(msg)['compound'])

    # 2. Aggregate metrics per minute
    messages_per_minute = parsed_chat_df.groupby('minute').agg(
        message_count=('message', 'size'),
        avg_sentiment=('sentiment', 'mean')
    ).reset_index()

    # 3. Calculate a highlight score
    # We normalize message count and sentiment to combine them.
    # A highlight is a combination of high activity and positive sentiment.
    msg_count_norm = (messages_per_minute['message_count'] - messages_per_minute['message_count'].min()) / (messages_per_minute['message_count'].max() - messages_per_minute['message_count'].min())
    sentiment_norm = (messages_per_minute['avg_sentiment'] - messages_per_minute['avg_sentiment'].min()) / (messages_per_minute['avg_sentiment'].max() - messages_per_minute['avg_sentiment'].min())

    # Combine metrics. Adjust weighting as needed.
    messages_per_minute['highlight_score'] = (0.7 * msg_count_norm) + (0.3 * sentiment_norm)

    # 4. Find peaks in the highlight score
    highlight_threshold = messages_per_minute['highlight_score'].mean() + 1.5 * messages_per_minute['highlight_score'].std()
    peaks, _ = find_peaks(messages_per_minute['highlight_score'], height=highlight_threshold)
    highlight_minutes = messages_per_minute.loc[peaks, 'minute'].tolist()

    # 5. Visualization
    plt.figure(figsize=(18, 8))
    plt.plot(messages_per_minute['minute'], messages_per_minute['highlight_score'], label='Highlight Score', color='blue', zorder=2)
    plt.scatter(messages_per_minute.loc[peaks, 'minute'], messages_per_minute.loc[peaks, 'highlight_score'], color='red', s=100, label='Detected Highlights', zorder=5, marker='*')
    plt.axhline(y=highlight_threshold, color='gray', linestyle='--', label='Highlight Threshold')
    plt.title('Automated Highlight Detection', fontsize=16)
    plt.xlabel('Minute of Stream')
    plt.ylabel('Calculated Highlight Score')
    plt.legend()
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.show()

    # 6. Print context for highlights
    if not parsed_transcript_df.empty and highlight_minutes:
        print(f'--- CONTEXT FOR DETECTED HIGHLIGHTS ---')
        for minute in sorted(highlight_minutes):
            start_sec = minute * 60
            end_sec = (minute + 1) * 60
            mask = (parsed_transcript_df['offset_start_seconds'] >= start_sec) & (parsed_transcript_df['offset_start_seconds'] < end_sec)
            segment = parsed_transcript_df[mask]
            transcript_text = " ".join(segment['text'].tolist()).strip() if not segment.empty else "(No speech detected)"
            print(f'[Minute {minute}]')
            print(f'  Streamer said: {textwrap.fill(transcript_text, width=80, initial_indent="    ", subsequent_indent="    ")}')
else:
    print('Chat data is not available, cannot perform highlight detection.')


## 6. Topic Detection (New Feature)
Automatically detect topic changes and generate timestamps without using LLMs. Uses multiple methods including keyword detection, TF-IDF clustering, and vocabulary change analysis.

In [None]:
# Import topic detection module
import topic_detector

# --- TOPIC DETECTION CONFIGURATION ---
# Choose which methods to use: 'keyword', 'tfidf', 'vocabulary', or combinations
TOPIC_METHODS = ['keyword', 'tfidf', 'vocabulary']  # Try different combinations
MIN_CONFIDENCE = 0.3  # Minimum confidence threshold
SHOW_TOPICS = 10  # Maximum number of topics to display
CONTEXT_MINUTES = 2  # Minutes of context to show around each topic

if not parsed_transcript_df.empty:
    print("=== TOPIC DETECTION ANALYSIS ===")
    print(f"Transcript duration: {parsed_transcript_df['offset_start_seconds'].max() / 60:.1f} minutes")
    print(f"Total transcript entries: {len(parsed_transcript_df)}")
    print()
    
    # Test different method combinations
    for method_combo in TOPIC_METHODS:
        if isinstance(method_combo, str):
            methods = [method_combo]
        else:
            methods = method_combo
        
        print(f"\n--- Using methods: {', '.join(methods)} ---")
        
        # Generate topic timestamps
        topics_df = topic_detector.generate_topic_timestamps(parsed_transcript_df, methods=methods)
        
        if topics_df.empty:
            print("No topics detected with current methods.")
            continue
        
        # Display results
        print(f"Detected {len(topics_df)} topic changes:")
        
        # Show top topics with context
        top_topics = topics_df.head(SHOW_TOPICS)
        if not top_topics.empty:
            topic_summaries = topic_detector.extract_topic_summary(parsed_transcript_df, top_topics)
            
            for i, summary in enumerate(topic_summaries, 1):
                print(f"\n{i+1}. {summary['timestamp_readable']} (confidence: {summary['confidence']:.2f}, method: {summary['method']})")
                print(f"   Keywords: {', '.join(summary['keywords'][:5])}")
                print(f"   Preview: {summary['text_preview'][:100]}...")
                print(f"   Context: {summary['full_context'][:200]}...")
        
        # Optional: Show all topics in a table format
        if len(topics_df) > SHOW_TOPICS:
            print(f"\n... and {len(topics_df) - SHOW_TOPICS} more topics detected")
            print("\nFull topic table:")
            display_cols = ['timestamp_readable', 'confidence', 'method', 'text']
            print(topics_df[display_cols].head(SHOW_TOPICS + 5).to_string(index=False))
else:
    print("No transcript data available for topic detection.")

print("\n=== Topic Detection Complete ===")

## 7. DaVinci Resolve XML Export
Generate a DaVinci Resolve compatible XML file for the detected highlights with 5-minute padding and overlap merging.

In [28]:
from datetime import datetime, timedelta

def generate_edl(highlight_minutes, output_filename="highlights.edl", padding_minutes=5, fps=30, offset_seconds=0):
    """
    Generate EDL (Edit Decision List) for highlights.
    
    Args:
        highlight_minutes (list): List of minute offsets for highlights
        output_filename (str): Output EDL filename
        padding_minutes (int): Minutes to pad before and after each highlight
        fps (int): Frames per second for timecode calculation
    """
    if not highlight_minutes:
        print("No highlights to export.")
        return
    
    # Sort highlights
    highlight_minutes = sorted(highlight_minutes)
    
    # Create time ranges with padding
    time_ranges = []
    for minute in highlight_minutes:
        start_time = max(0, minute - padding_minutes) * 60  # Convert to seconds
        end_time = (minute + padding_minutes) * 60
        time_ranges.append((start_time, end_time))
    
    # Merge overlapping ranges
    merged_ranges = []
    if time_ranges:
        time_ranges.sort()
        current_start, current_end = time_ranges[0]
        
        for start, end in time_ranges[1:]:
            if start <= current_end:  # Overlapping or adjacent
                current_end = max(current_end, end)  # Extend current range
            else:
                merged_ranges.append((current_start, current_end))
                current_start, current_end = start, end
        
        merged_ranges.append((current_start, current_end))
    
    # Generate EDL content
    edl_lines = []
    
    # EDL header
    edl_lines.append("TITLE: Highlights")
    edl_lines.append("FCM: NON-DROP FRAME")
    edl_lines.append("")
    
    # Add each clip as an EDL event
    current_timeline_time = 0  # Start at beginning of timeline
    
    for i, (start_sec, end_sec) in enumerate(merged_ranges, 1):
        # Calculate duration
        duration_sec = end_sec - start_sec
        
        # Convert to timecode (HH:MM:SS:FF)
        source_start_tc = seconds_to_timecode(max(0, start_sec - offset_seconds), fps)
        source_end_tc = seconds_to_timecode(max(0, end_sec - offset_seconds), fps)
        timeline_start_tc = seconds_to_timecode(current_timeline_time, fps)
        timeline_end_tc = seconds_to_timecode(current_timeline_time + duration_sec, fps)
        
        # EDL event number (padded to 3 digits)
        event_num = f"{i:03d}"
        
        # Clip name
        clip_name = f"HIGHLIGHT_{i}"
        
        # EDL event lines
        edl_lines.append(f"{event_num}  {clip_name}     V     C        {source_start_tc} {source_end_tc} {timeline_start_tc} {timeline_end_tc}")
        edl_lines.append(f"* FROM CLIP NAME: {clip_name}")
        edl_lines.append(f"* COMMENT: Highlight segment {i} - {seconds_to_readable(start_sec)} to {seconds_to_readable(end_sec)}")
        edl_lines.append("")
        
        # Update timeline position
        current_timeline_time += duration_sec
    
    # Write EDL file
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(edl_lines))
    
    print(f"Generated EDL: {output_filename}")
    print(f"Exported {len(merged_ranges)} highlight segments:")
    
    for i, (start_sec, end_sec) in enumerate(merged_ranges, 1):
        start_min = int(start_sec // 60)
        start_sec_rem = int(start_sec % 60)
        end_min = int(end_sec // 60)
        end_sec_rem = int(end_sec % 60)
        duration_min = int((end_sec - start_sec) // 60)
        duration_sec = int((end_sec - start_sec) % 60)
        
        print(f"  {i}. {start_min:02d}:{start_sec_rem:02d} - {end_min:02d}:{end_sec_rem:02d} (duration: {duration_min:02d}:{duration_sec:02d})")

def seconds_to_timecode(seconds, fps=30):
    """
    Convert seconds to EDL timecode format (HH:MM:SS:FF).
    """
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    frames = int((seconds % 1) * fps)
    
    return f"{hours:02d}:{minutes:02d}:{secs:02d}:{frames:02d}"

def seconds_to_readable(seconds):
    """
    Convert seconds to readable time format.
    """
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    
    if hours > 0:
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
    else:
        return f"{minutes:02d}:{secs:02d}"

# Generate EDL if highlights were detected
if 'highlight_minutes' in locals() and highlight_minutes:
    output_edl_file = f"highlights_{YT_ID}.edl"
    
    # Check if reshaping was applied and get offset
    offset_seconds = 0
    if 'start_time_seconds' in locals():
        offset_seconds = start_time_seconds
        print(f"Applying offset of {offset_seconds} seconds ({offset_seconds//60:.0f}:{offset_seconds%60:02d}) from reshaping")
    
    generate_edl(highlight_minutes, output_edl_file, padding_minutes=5, offset_seconds=offset_seconds)
    
    print(f"\nEDL file saved as: {output_edl_file}")
    print("Import this file into DaVinci Resolve using File > Import Timeline > EDL...")
    print("Or import into other NLEs like Premiere Pro, Final Cut Pro, etc.")
else:
    print("No highlights detected. Run the highlight detection cell first.")

Applying offset of 4440 seconds (74:00) from reshaping
Generated EDL: highlights_p8sR5q7OGBk.edl
Exported 6 highlight segments:
  1. 71:00 - 86:00 (duration: 15:00)
  2. 87:00 - 100:00 (duration: 13:00)
  3. 115:00 - 125:00 (duration: 10:00)
  4. 150:00 - 160:00 (duration: 10:00)
  5. 189:00 - 199:00 (duration: 10:00)
  6. 205:00 - 217:00 (duration: 12:00)

EDL file saved as: highlights_p8sR5q7OGBk.edl
Import this file into DaVinci Resolve using File > Import Timeline > EDL...
Or import into other NLEs like Premiere Pro, Final Cut Pro, etc.
