In [1]:
%pip install nltk
import nltk
nltk.download('punkt')
%pip install tiktoken
import re
import json
import tiktoken
from nltk.tokenize import sent_tokenize

def read_vtt(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    
    text_lines = []
    for line in lines:
        if '-->' in line or line.strip().isdigit():
            continue
        line = line.strip()
        if line and not line.startswith('WEBVTT'):
            text_lines.append(line)
    
    return " ".join(text_lines)

def clean_transcript(text):
    text = re.sub(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\s+', ' ', text).strip() 
    
    return text.lower()

def chunk_text(text, max_tokens=512):
    enc = tiktoken.encoding_for_model("gpt-4")
    sentences = sent_tokenize(text)
    
    chunks = []
    current_chunk = []
    current_length = 0
    
    for sentence in sentences:
        token_length = len(enc.encode(sentence))
        if current_length + token_length > max_tokens:
            if current_chunk:  
                chunks.append(" ".join(current_chunk))
            current_chunk = [sentence] 
            current_length = token_length
        else:
            current_chunk.append(sentence)
            current_length += token_length
    
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks

def preprocess_vtt(file_path, output_json=False):
    raw_text = read_vtt(file_path)
    clean_text = clean_transcript(raw_text)
    chunks = chunk_text(clean_text)
    
    if output_json:
        output = {"chunks": chunks}
        json_output = json.dumps(output, indent=4)
        return json_output
    else:
        return "\n\n".join(chunks)

if __name__ == "__main__":
    file_path = "Transcript.vtt"
    processed_text = preprocess_vtt(file_path, output_json=False)
    
    with open("processed_transcript.txt", "w", encoding="utf-8") as f:
        f.write(processed_text)
    
    print("Preprocessing complete.")

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\s-lch\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Collecting tiktoken
  Downloading tiktoken-0.9.0-cp312-cp312-win_amd64.whl.metadata (6.8 kB)
Downloading tiktoken-0.9.0-cp312-cp312-win_amd64.whl (894 kB)
   ---------------------------------------- 0.0/894.9 kB ? eta -:--:--
   ---------------------------------------- 894.9/894.9 kB 8.1 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.9.0
Note: you may need to restart the kernel to use updated packages.


FileNotFoundError: [Errno 2] No such file or directory: 'Transcript.vtt'