## TOPIC SEGMENTATION AND IMPLEMENTATION

**MOUNT DRIVE**

In [1]:
# Mount Google Drive at /content/drive; the project paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**INSTALL DEPENDENCIES**

In [2]:
!pip install -q sentence-transformers openai transformers python-dotenv nltk scikit-learn pandas tqdm torch

**IMPORTS**

In [3]:
import os
import json
import time   # Used for time-related tasks (like delays or timing code)
from tqdm.auto import tqdm    # Shows a progress bar when running loops
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords   # Contains common words like "the", "is", "and"
# Tokenizer data and the stop-word list used below; quiet=True on every call
# keeps the cell output free of download logs.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Converts text into numerical form using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# Used to measure similarity between texts
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

from openai import OpenAI

from dotenv import load_dotenv

from transformers import pipeline   # Used to run pre-trained NLP models easily

import torch

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


**CHECK IF CUDA IS AVAILABLE**

In [4]:
import torch

# Report the PyTorch build and whether a CUDA device is visible to this runtime.
print("PyTorch version:", torch.__version__)
_cuda_ok = torch.cuda.is_available()
print("CUDA available?:", _cuda_ok)
if _cuda_ok:
    print("GPU name:", torch.cuda.get_device_name(0))

PyTorch version: 2.9.0+cu126
CUDA available?: True
GPU name: Tesla T4


**LOAD OpenRouter API KEY FROM .env FILE**

In [5]:
# Read environment variables from the project's .env file on Drive.
load_dotenv("/content/drive/MyDrive/podcast-project/data/.env")

# Fail fast when the key is absent or empty instead of failing later on the first API call.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise ValueError("OPENROUTER_API_KEY not found in .env file")
print("OpenRouter API key loaded (hidden)")

# Create client: the OpenAI client class is pointed at OpenRouter's base URL.
from openai import OpenAI

llm_client = OpenAI(
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1",
)

OpenRouter API key loaded (hidden)


**PATHS**

In [6]:
# Project layout on Google Drive
BASE_PATH = "/content/drive/MyDrive/podcast-project"
PROCESSED_DIR = f"{BASE_PATH}/data/transcripts_processed"   # input transcript JSON files
OUTPUT_DIR = f"{BASE_PATH}/data/segmented_outputs"          # one segment file per episode
COMPARE_DIR = f"{BASE_PATH}/data/segmented_outputs/algorithm_comparison"   # comparison table

# Create both output folders up front; the comparison file is written into
# COMPARE_DIR at the end of the notebook and the write fails if it is missing.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(COMPARE_DIR, exist_ok=True)

**CONFIGURATION**

In [7]:
# Select which episodes to process (range end is exclusive: episodes 181..200)
EPISODE_NUMBERS = list(range(181, 201))

BATCH_SIZE = 20   # Number of episodes to process at one time
# Upper bound on how many episodes are also sent to the LLM segmenter for comparison
COMPARE_EPISODES = min(5, len(EPISODE_NUMBERS))

# Minimum number of sentences required in an episode; shorter episodes are skipped
# by the main loop and returned as a single segment by the segmenters
MIN_SENTENCES = 10                          # lowered for testing

# Filename patterns
def input_filename(num):
    """Return the transcript file name for episode ``num``."""
    return "episode_{}_whisper.json".format(num)

def output_filename(num):
    """Return the segment-output file name for episode ``num``."""
    return "episode_{}_segment.json".format(num)

# Build & validate file list: keep (episode number, file name) pairs whose
# transcript exists in PROCESSED_DIR and report the ones that do not.
files_to_process = []
for num in EPISODE_NUMBERS:
    fname = input_filename(num)
    full_path = os.path.join(PROCESSED_DIR, fname)
    if not os.path.exists(full_path):
        print(f"Missing: {fname}")
        continue
    files_to_process.append((num, fname))

print(f"\nWill process {len(files_to_process)} episodes")


Will process 20 episodes


**MODELS**

In [8]:
print("Loading models...")

# Sentence encoder used by the embedding-based segmenter
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Device index handed to the summarization pipeline: 0 when a CUDA GPU is
# available, otherwise -1
if torch.cuda.is_available():
    device = 0
else:
    device = -1
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

# English stop words from NLTK, reused by keyword extraction
STOP_WORDS = set(stopwords.words('english'))

Loading models...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


**PREPARE TRANSCRIPT + SPLIT INTO SENTENCES**

In [9]:
def load_episode(num, filename):
    """Load one transcript JSON file and split it into sentences.

    Supported layouts, checked in this order:
      1. dict with a 'segments' list (Whisper-style); each segment's 'text' is used.
      2. list whose items are dicts (first non-empty of 'text' / 'line' /
         'content') or plain strings.
      3. dict holding the whole transcript as a string under 'text',
         'transcript', 'full_text' or 'transcription'.
    If none of these yields text, the first 2000 characters of ``str(data)``
    are used so the caller still gets something to inspect.

    Args:
        num: Episode number; only used to build the returned episode id.
        filename: File name inside PROCESSED_DIR.

    Returns:
        Tuple ``(ep_id, sentences, timestamps, full_text)``. ``timestamps`` has
        one float per sentence: the start time of the segment in which that
        sentence begins, or 0.0 when the file carries no segment timing.
    """
    import bisect

    path = os.path.join(PROCESSED_DIR, filename)
    with open(path, 'r', encoding='utf-8') as f:    # Open and load the JSON file
        data = json.load(f)

    def _start_time(seg):
        # Same key preference as before ('start', then 'timestamp'), but always
        # returns a float; a [start, end] pair contributes its first element.
        ts = seg.get('start') or seg.get('timestamp') or 0.0
        if isinstance(ts, (list, tuple)):
            ts = ts[0] if ts else 0.0
        try:
            return float(ts)
        except (TypeError, ValueError):
            return 0.0

    text_parts = []   # Text pieces found in the file, in order
    part_times = []   # Start time for each entry of text_parts (layout 1 only)

    # Case 1: Standard Whisper output with 'segments'
    if isinstance(data, dict) and 'segments' in data:
        for seg in data.get('segments') or []:
            if not isinstance(seg, dict):
                continue
            text = (seg.get('text') or '').strip()
            if text:
                text_parts.append(text)
                part_times.append(_start_time(seg))

    # Case 2: List of dicts (plain string items are accepted too)
    elif isinstance(data, list):
        for item in data:
            if isinstance(item, dict):
                text = item.get('text') or item.get('line') or item.get('content') or ""
            else:
                text = item
            if isinstance(text, str) and text.strip():
                text_parts.append(text.strip())

    # Case 3: Single string under common keys
    elif isinstance(data, dict):
        for k in ['text', 'transcript', 'full_text', 'transcription']:
            if k in data and isinstance(data[k], str):
                text_parts.append(data[k].strip())
                break

    # Fallback
    if not text_parts:
        text_parts = [str(data)[:2000]]
        part_times = []

    full_text = " ".join(text_parts).strip()
    sentences = sent_tokenize(full_text)

    # Timestamps (if available in segments).
    # Each sentence is located in full_text and mapped to the segment whose
    # character range contains the sentence start. Counting sentences per
    # segment instead drifts as soon as one sentence spans several segments.
    timestamps = [0.0] * len(sentences)
    if part_times:
        # Character offset at which each segment starts inside full_text
        # (parts are stripped and joined by exactly one space).
        part_offsets = []
        offset = 0
        for part in text_parts:
            part_offsets.append(offset)
            offset += len(part) + 1

        cursor = 0
        current_ts = part_times[0]
        for sent_idx, sent in enumerate(sentences):
            pos = full_text.find(sent, cursor)
            if pos >= 0:
                current_ts = part_times[bisect.bisect_right(part_offsets, pos) - 1]
                cursor = pos + len(sent)
            # If the sentence cannot be located, reuse the previous timestamp so
            # the sequence never jumps back to 0.0.
            timestamps[sent_idx] = current_ts

    ep_id = f"episode_{num}"
    print(f"  Loaded {ep_id:20} | {len(sentences):4,} sentences | text len: {len(full_text):,}")

    return ep_id, sentences, timestamps, full_text

**ALGORITHM 1 - BASELINE (SENTENCE SIMILARITY)**

In [10]:
"""
Algorithm 1 – Baseline segmentation
This method splits text when two nearby sentences are not very similar.
"""
def segment_baseline(sentences, threshold=0.23):
    """Algorithm 1 - baseline segmentation on TF-IDF similarity of adjacent sentences.

    A new segment starts at sentence ``i`` whenever the cosine similarity
    between the TF-IDF vectors of sentences ``i-1`` and ``i`` is below
    ``threshold``.

    Args:
        sentences: List of sentence strings.
        threshold: Similarity below which a boundary is placed.

    Returns:
        Tuple ``(segments, boundaries)``: ``segments`` are the joined sentence
        texts; ``boundaries`` are the sentence indices delimiting them, starting
        with 0 and ending with ``len(sentences)``.
    """
    # Too few sentences: return everything as one segment
    if len(sentences) < MIN_SENTENCES:
        return [" ".join(sentences)], [0, len(sentences)]

    vec = TfidfVectorizer(stop_words='english')   # Convert sentences into numerical vectors using TF-IDF
    try:
        X = vec.fit_transform(sentences)
    except ValueError:
        # scikit-learn raises ValueError when no term survives tokenization and
        # stop-word removal (empty vocabulary); nothing can be compared then.
        return [" ".join(sentences)], [0, len(sentences)]

    boundaries = [0]    # Store the starting index of each segment

    # Compare each sentence with the previous one
    for i in range(1, len(sentences)):
        sim = cosine_similarity(X[i-1:i], X[i:i+1])[0][0]   # Measure how similar the two sentences are

        # If similarity is low, sentence i starts a new segment
        if sim < threshold:
            boundaries.append(i)

    # Close the last segment
    boundaries.append(len(sentences))

    # Combine sentences into text segments using the boundaries
    segments = [" ".join(sentences[s:e]) for s, e in zip(boundaries[:-1], boundaries[1:])]
    return segments, boundaries

**ALGORITHM 2 - EMBEDDING-BASED**

In [11]:
"""
Algorithm 2 – Embedding-based segmentation (final chosen method)
This method splits text based on meaning similarity between sentence groups.
"""
def segment_embedding(sentences, window=5, threshold=0.48, min_segment_len=1):
    """Algorithm 2 - embedding-based segmentation (final chosen method).

    For every position ``i`` the mean embedding of the ``window`` sentences
    before ``i`` is compared with the mean embedding of the ``window``
    sentences starting at ``i``; a boundary is placed at ``i`` when their
    cosine similarity is below ``threshold``.

    Args:
        sentences: List of sentence strings.
        window: Number of sentences averaged on each side of a candidate boundary.
        threshold: Similarity below which a boundary is placed.
        min_segment_len: Minimum number of sentences a segment must contain.
            With the default of 1 every low-similarity position becomes a
            boundary, so a run of consecutive low-similarity positions yields
            one-sentence segments; raise it to suppress those.

    Returns:
        Tuple ``(segments, boundaries)``: ``segments`` are the joined sentence
        texts; ``boundaries`` are the sentence indices delimiting them, starting
        with 0 and ending with the number of sentences.
    """
    # If there are too few sentences, return everything as one segment
    if len(sentences) < MIN_SENTENCES:
        return [" ".join(sentences)], [0, len(sentences)]

    # Convert each sentence into an embedding (numerical meaning vector)
    embs = embed_model.encode(sentences, show_progress_bar=False, batch_size=32)
    total = len(embs)

    boundaries = [0]

    # Move a sliding window across the embeddings
    for i in range(window, total - window):
        left = embs[i-window:i].mean(axis=0)    # Average embeddings for sentences before the current point
        right = embs[i:i+window].mean(axis=0)   # Average embeddings for sentences after the current point
        sim = cosine_similarity([left], [right])[0][0]
        if sim >= threshold:
            continue
        # Skip boundaries that would leave a segment shorter than requested,
        # either before this position or between it and the end of the text.
        if i - boundaries[-1] < min_segment_len or total - i < min_segment_len:
            continue
        boundaries.append(i)
    boundaries.append(total)

    segments = [" ".join(sentences[s:e]) for s, e in zip(boundaries[:-1], boundaries[1:])]
    return segments, boundaries

**ALGORITHM 3 - LLM**

In [12]:
def segment_llm_openrouter(full_text, model="xiaomi/mimo-v2-flash:free", max_chars=90000):
    """Ask an LLM through the OpenRouter client where new topics begin.

    Args:
        full_text: Transcript text; only the first ``max_chars`` characters are sent.
        model: OpenRouter model identifier.
        max_chars: Character limit applied to the transcript to avoid token limits.

    Returns:
        The list of integer indices returned by the model, or ``[0]`` when the
        client is missing, the request fails, or the reply cannot be parsed as
        a list of integers.
    """
    import ast
    import re

    # Check if the OpenRouter LLM client is available or not
    if not llm_client:
        print("OpenRouter client not initialized – skipping LLM")
        return [0]

    # System message that tells the LLM exactly what to do
    # NOTE(review): the prompt asks for sentence indices, but the transcript is
    # sent as plain text without sentence numbering - confirm the model can
    # produce indices that line up with the sentence list used elsewhere.
    system = """You are an expert podcast topic segmenter.
Given a transcript chunk, return ONLY a Python list of sentence indices (starting from 0) where a NEW topic begins.
Example: [0, 42, 118, 195]
No explanation, no extra text."""

    # User message containing the transcript text. The slice is done outside the
    # f-string so that no explanatory comment ends up inside the prompt itself.
    transcript = full_text[:max_chars]
    user = f"""Transcript:
{transcript}

Return only the list of boundary indices."""

    try:
        # Send the request to OpenRouter (default: a free model, working as of Jan 16, 2026)
        resp = llm_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user}
            ],
            temperature=0.1,    # Low temperature for stable, consistent output
            max_tokens=300,     # Enough tokens for a short list of numbers
        )

        # Extract the raw text returned by the model (content may be None)
        raw = (resp.choices[0].message.content or "").strip()
    except Exception as e:
        print(f"OpenRouter error: {e}")
        return [0]

    # Models sometimes wrap the list in code fences or a sentence; take the
    # first flat bracketed list when there is one.
    found = re.search(r"\[[^\[\]]*\]", raw)
    candidate = found.group(0) if found else raw

    # Safely convert the returned string into a Python object
    try:
        bounds = ast.literal_eval(candidate)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return [0]    # If parsing fails, return a default boundary

    # Make sure the result is a list of integers (bool is an int subclass, so exclude it)
    if isinstance(bounds, list) and all(
        isinstance(x, int) and not isinstance(x, bool) for x in bounds
    ):
        return bounds
    return [0]

**TEST OPENROUTER CONNECTION**

In [13]:
# Test OpenRouter connection & segmentation
# segment_llm_openrouter returns [0] when the request or the parsing of the reply
# fails; a failed request additionally prints an "OpenRouter error: ..." line.
test_text = "This is a test transcript. First part about weather. Second part about AI."
bounds = segment_llm_openrouter(test_text)
print("Test boundaries:", bounds)

OpenRouter error: Error code: 401 - {'error': {'message': 'User not found.', 'code': 401}}
Test boundaries: [0]


**TF-IDF KEYWORDS PER SEGMENT**

In [14]:
def get_keywords(text, top_n=6):
    """Return up to ``top_n`` keywords for one piece of text.

    The vectorizer is fitted on this single text only, so the IDF weight is the
    same for every term and the ranking is effectively by term frequency after
    stop-word removal.

    Args:
        text: Segment text.
        top_n: Maximum number of keywords to return.

    Returns:
        List of keyword strings, highest score first. Empty when ``top_n`` is
        not positive, the text has fewer than 20 words, or extraction fails.
    """
    # A slice of [-0:] would select every feature, so handle top_n <= 0 explicitly.
    # If the text is very short, keyword extraction is not useful.
    if top_n <= 0 or len(text.split()) < 20:
        return []

    # Create a TF-IDF vectorizer
    vec = TfidfVectorizer(stop_words=list(STOP_WORDS), max_features=250)
    try:
        X = vec.fit_transform([text])           # Convert the text into TF-IDF scores
        names = vec.get_feature_names_out()     # Get the list of words used by the vectorizer
        scores = X.toarray()[0]                 # Get the importance score for each word
        idx = scores.argsort()[-top_n:][::-1]   # Find the indices of the top N highest-scoring words
        return [str(names[i]) for i in idx]     # Return the top keywords as plain strings
    except Exception:
        # Best effort: e.g. an empty vocabulary when only stop words remain.
        # Exception (not a bare except) lets KeyboardInterrupt through.
        return []

**SHORT SUMMARIES (1–2 SENTENCES)**

In [15]:
def get_summary(text):
    """Return a short summary of ``text``.

    Texts with fewer than 50 words are not summarized; a preview of at most
    220 characters is returned instead. The same preview is the fallback when
    the summarization model raises.

    Args:
        text: Segment text.

    Returns:
        The model's summary, or the preview. The preview ends in " …" only when
        the text was actually cut, matching how ``text_preview`` is built in the
        main loop.
    """
    def _preview(limit=220):
        # Mark truncation only when something was really dropped.
        return text[:limit] + " …" if len(text) > limit else text

    # If the text is very short, return a preview instead of summarizing
    input_len = len(text.split())
    if input_len < 50:
        return _preview()

    # Decide the maximum length of the summary
    max_len = min(65, max(30, int(input_len * 0.6)))
    try:
        # Generate a summary using the summarization model
        result = summarizer(text, max_length=max_len, min_length=max(15, max_len // 2),
                            do_sample=False, truncation=True)
        return result[0]['summary_text']
    except Exception:
        # Best effort: if summarization fails, return a short preview instead.
        # Exception (not a bare except) lets KeyboardInterrupt stop a long run.
        return _preview()

**MAIN PROCESSING**

In [16]:
# Main driver: for every episode, run the segmenters, record comparison
# statistics, and write the embedding-based segmentation (with keywords and a
# summary per segment) to OUTPUT_DIR as JSON.
print(f"Processing – {len(files_to_process)} episodes")
total_start = time.time()   # Record the overall start time (read again by the comparison cell)

comparison_rows = []    # One dict per processed episode; saved by the comparison cell

# Process episodes in batches
for batch_idx, start in enumerate(range(0, len(files_to_process), BATCH_SIZE)):
    batch = files_to_process[start : start + BATCH_SIZE]
    print(f"\nBatch {batch_idx+1} ({len(batch)} episodes)")

    batch_start_time = time.time()

    # Loop through each episode in the batch with a progress bar
    for ep_num, fname in tqdm(batch, desc=f"Batch {batch_idx+1}"):
        ep_start = time.time()

        # Load the transcript and split it into sentences with one timestamp per sentence
        ep_id, sentences, timestamps, full_text = load_episode(ep_num, fname)

        # Skip episodes that are too short
        if len(sentences) < MIN_SENTENCES:
            print(f"  Skipping {ep_id} – too short ({len(sentences)} sentences)")
            continue

        # ---- Baseline segmentation (TF-IDF similarity) ----
        segs_base, _ = segment_baseline(sentences)

        # ---- Embedding-based segmentation (final method) ----
        segs_emb, bounds_emb = segment_embedding(sentences)

        # ---- LLM-based segmentation ----
        # Only the first COMPARE_EPISODES processed episodes are sent to the LLM;
        # later episodes keep a count of 0.
        # NOTE(review): the LLM is prompted for topic-start indices, so N returned
        # indices would describe N segments; len - 1 also maps the [0] failure
        # fallback to 0 - confirm which count is intended.
        llm_segments_count = 0
        if len(comparison_rows) < COMPARE_EPISODES:
            llm_bounds = segment_llm_openrouter(full_text)
            llm_segments_count = len(llm_bounds) - 1 if llm_bounds else 0

        # Comparison data (time_sec is measured before keywords/summaries are generated)
        comparison_rows.append({
            "episode": ep_id,
            "sentences": len(sentences),
            "baseline_segments": len(segs_base),
            "embedding_segments": len(segs_emb),
            "llm_segments": llm_segments_count,
            "time_sec": round(time.time() - ep_start, 1)
        })

        # Final output – embedding method
        output = {
            "episode_id": ep_id,
            "source_file": fname,
            "total_sentences": len(sentences),
            "algorithm_used": "embedding-based (Sentence Transformers)",
            "segments": []
        }

        # Build each segment using the computed boundaries; s is the first
        # sentence index of the segment and e is one past its last sentence
        for i, (s, e) in enumerate(zip(bounds_emb[:-1], bounds_emb[1:]), 1):
            # Combine sentences for this segment
            seg_text = " ".join(sentences[s:e])

            # Get start and end timestamps (if available); end_t is the timestamp
            # of the segment's last sentence
            start_t = timestamps[s] if s < len(timestamps) else 0.0
            end_t   = timestamps[e-1] if e-1 < len(timestamps) else start_t

            # Add segment details to output (end_sentence_idx is exclusive)
            output["segments"].append({
                "segment_id": i,
                "start_sentence_idx": s,
                "end_sentence_idx": e,
                "num_sentences": e - s,
                "start_time_sec": round(float(start_t), 2),
                "end_time_sec": round(float(end_t), 2),
                "text_preview": seg_text[:280] + "…" if len(seg_text) > 280 else seg_text,
                "keywords": get_keywords(seg_text),
                "summary": get_summary(seg_text)
            })

        # Save the final segmented output as a JSON file
        out_path = os.path.join(OUTPUT_DIR, output_filename(ep_num))
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)

        # Print timing and segmentation summary for this episode
        duration = time.time() - ep_start
        print(f"  {ep_id:20} | emb: {len(segs_emb):2d} seg | llm: {llm_segments_count:2d} | {duration:.1f} s")

    # Print how long the batch took to process
    batch_duration = time.time() - batch_start_time
    print(f"Batch {batch_idx+1} finished in {batch_duration/60:.1f} min")

Processing – 20 episodes

Batch 1 (20 episodes)


Batch 1:   0%|          | 0/20 [00:00<?, ?it/s]

  Loaded episode_181          |  706 sentences | text len: 47,151


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


OpenRouter error: Error code: 401 - {'error': {'message': 'User not found.', 'code': 401}}


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


  episode_181          | emb: 437 seg | llm:  0 | 24.9 s
  Loaded episode_182          |  762 sentences | text len: 39,047
OpenRouter error: Error code: 401 - {'error': {'message': 'User not found.', 'code': 401}}
  episode_182          | emb: 503 seg | llm:  0 | 13.9 s
  Loaded episode_183          |  741 sentences | text len: 51,409
OpenRouter error: Error code: 401 - {'error': {'message': 'User not found.', 'code': 401}}
  episode_183          | emb: 491 seg | llm:  0 | 20.3 s
  Loaded episode_184          |  831 sentences | text len: 48,231
OpenRouter error: Error code: 401 - {'error': {'message': 'User not found.', 'code': 401}}
  episode_184          | emb: 544 seg | llm:  0 | 22.6 s
  Loaded episode_185          |  758 sentences | text len: 47,085
OpenRouter error: Error code: 401 - {'error': {'message': 'User not found.', 'code': 401}}
  episode_185          | emb: 478 seg | llm:  0 | 21.6 s
  Loaded episode_186          |  796 sentences | text len: 47,741
  episode_186        

**SAVE COMPARISON FILE**

In [None]:
# Check if there is any comparison data to save
if comparison_rows:
    # Convert the comparison results into a DataFrame
    df_compare = pd.DataFrame(comparison_rows)

    # to_json does not create missing folders, so make sure the target exists first
    os.makedirs(COMPARE_DIR, exist_ok=True)

    # Create the output file path for the comparison results
    compare_path = os.path.join(COMPARE_DIR, "segmentation_algorithm_comparison.json")

    # Save the comparison data as a JSON file
    df_compare.to_json(compare_path, orient="records", indent=2)
    print(f"\nComparison saved → {compare_path}")
    print(df_compare.to_string(index=False))
else:
    print("\nNo episodes were processed → no comparison file created")

# Calculate total processing time (total_start is set at the top of the main processing cell)
total_duration = time.time() - total_start
print(f"\nTotal time: {total_duration/60:.1f} minutes")   # Print total runtime in minutes
print("Output folder:", COMPARE_DIR)


Comparison saved → /content/drive/MyDrive/podcast-project/data/segmented_outputs/algorithm_comparison/segmentation_algorithm_comparison.json
    episode  sentences  baseline_segments  embedding_segments  llm_segments  time_sec
episode_101        763                705                 553             0       8.0
episode_102        799                728                 476             0       7.9
episode_103        798                727                 487             0      10.8
episode_104        629                605                 445             0      11.7
episode_105        790                723                 432             0       8.3
episode_106        829                735                 492             0       1.6
episode_107        604                571                 389             0       1.2
episode_108        739                664                 437             0       1.4
episode_109        879                786                 573             0       1.