# Subtitle Data Exploration

This notebook explores the subtitle files in the data directory and analyzes their content for insights.

In [None]:
# Import necessary libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from pathlib import Path
from collections import defaultdict
import sys

# Add project root (parent of the current working directory) to the import path.
# Guarded so that re-running this cell does not stack duplicate entries.
project_root = str(Path().absolute().parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Download NLTK resources if needed.
# 'punkt_tab' is the tokenizer data that newer NLTK releases load for
# word_tokenize; 'punkt' is kept for older releases that still use it.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

## 1. Load and Examine Subtitle Files

In [None]:
# Function to read .srt files
def read_srt_file(file_path):
    """Read an SRT file and return its subtitle text as one space-joined string.

    Cue headers (the numeric index line followed by the
    ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timing line) and blank lines are
    dropped; the remaining lines are stripped and joined with single spaces.

    Args:
        file_path: Path (str or path-like) of the ``.srt`` file to read.

    Returns:
        str: The subtitle text, or an empty string when no text lines remain.

    Raises:
        OSError: If the file cannot be opened.
    """
    # 'utf-8-sig' reads plain UTF-8 as well, but also discards a leading
    # byte-order mark that would otherwise stay glued to the first word.
    # Undecodable bytes are still skipped (errors='ignore').
    with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
        content = f.read()

    # Cue header = index line + timing line. Anchored to line starts so digits
    # that merely end a text line are never consumed. Tolerates surrounding
    # blanks, '.' as the millisecond separator, anything trailing the end time
    # on the same line, and a missing final newline.
    pattern = (
        r'^[ \t]*\d+[ \t]*\n'
        r'\d{2}:\d{2}:\d{2}[,.]\d{3}[ \t]*-->[ \t]*\d{2}:\d{2}:\d{2}[,.]\d{3}[^\n]*\n?'
    )
    content = re.sub(pattern, '', content, flags=re.MULTILINE)

    # Keep only non-blank lines, trimmed so the join never yields double spaces
    lines = [line.strip() for line in content.split('\n') if line.strip()]

    return ' '.join(lines)

# Get all subtitle files from the `data` folder under the parent of the
# current working directory. glob() makes no ordering promise, so the result
# is sorted to keep DataFrame row order and plots reproducible across runs.
data_dir = Path().absolute().parent / 'data'
srt_files = sorted(data_dir.glob('*.srt'))

print(f"Found {len(srt_files)} subtitle files in {data_dir}")

In [None]:
# Build one record per subtitle file; a file that fails to load is reported
# and skipped so the remaining files are still processed.
subtitle_data = []

for file_path in srt_files:
    try:
        text = read_srt_file(file_path)
        record = dict(
            file_name=file_path.name,
            content=text,
            word_count=len(text.split()),          # whitespace-separated words
            file_size_kb=file_path.stat().st_size / 1024,
        )
    except Exception as e:
        print(f"Error reading {file_path.name}: {e}")
    else:
        subtitle_data.append(record)

# Assemble the records into a DataFrame and preview the first rows
df_subtitles = pd.DataFrame(subtitle_data)
df_subtitles.head()

## 2. Data Analysis

In [None]:
# Corpus-level statistics, or a hint when nothing was loaded
if df_subtitles.empty:
    print("No subtitle files found. Please add .srt files to the data directory.")
else:
    # One print call, one statistic per line
    print(
        f"Total subtitle files: {len(df_subtitles)}",
        f"Total word count: {df_subtitles['word_count'].sum():,}",
        f"Average word count per file: {df_subtitles['word_count'].mean():.1f}",
        f"Average file size: {df_subtitles['file_size_kb'].mean():.1f} KB",
        sep='\n',
    )

In [None]:
# Two figures: the file-size distribution, then the largest files by word count
if not df_subtitles.empty:
    # Histogram of file sizes with a KDE overlay
    size_fig, size_ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df_subtitles['file_size_kb'], bins=20, kde=True, ax=size_ax)
    size_ax.set_title('Distribution of Subtitle File Sizes')
    size_ax.set_xlabel('File Size (KB)')
    size_ax.set_ylabel('Count')
    size_ax.grid(True, alpha=0.3)
    plt.show()

    # Bar chart of the 15 files with the highest word counts
    top_by_words = df_subtitles.sort_values('word_count', ascending=False).head(15)
    words_fig, words_ax = plt.subplots(figsize=(12, 6))
    sns.barplot(data=top_by_words, x='file_name', y='word_count', ax=words_ax)
    words_ax.set_title('Top 15 Subtitle Files by Word Count')
    # Slant the file names so long labels do not overlap
    plt.setp(words_ax.get_xticklabels(), rotation=45, ha='right')
    words_fig.tight_layout()
    plt.show()

## 3. Text Analysis

In [None]:
# Function to preprocess text
def preprocess_text(text: str) -> list:
    """Clean and tokenize text, dropping English stopwords.

    Steps, in order: lowercase the text, replace punctuation with spaces,
    remove digits, tokenize with NLTK ``word_tokenize``, then discard English
    stopwords and single-character tokens.

    Args:
        text: Raw text to process.

    Returns:
        list: The remaining tokens, in their original order.
    """
    # Convert to lowercase (the NLTK stopword list is lowercase)
    text = text.lower()

    # Replace punctuation with a space instead of deleting it. Deleting glued
    # neighbouring words together ("wait...what" -> "waitwhat") and turned
    # contractions into tokens the stopword list does not contain
    # ("don't" -> "dont"). With a space, "don't" becomes "don" + "t": NLTK's
    # English list includes such split forms, and one-letter leftovers are
    # dropped by the length filter below.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and single-character tokens
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if len(token) > 1 and token not in stop_words]

# Process all subtitle content if available
if not df_subtitles.empty:
    # Treat the whole corpus as one document
    all_content = ' '.join(df_subtitles['content'].tolist())

    # Clean, tokenize and stopword-filter the combined text
    tokens = preprocess_text(all_content)

    # Frequency distribution of the remaining tokens
    fdist = FreqDist(tokens)

    # Plot the 30 most common words. The title is passed into FreqDist.plot so
    # it is set on the axes being drawn: depending on the NLTK version, plot()
    # may call plt.show() itself, in which case a later plt.title() would land
    # on a new, empty figure.
    plt.figure(figsize=(12, 6))
    fdist.plot(30, cumulative=False, title='30 Most Common Words in Subtitles')
    plt.show()

## 4. Advanced Analysis

Let's look at which words tend to occur together by counting bigrams (pairs of adjacent tokens) in the preprocessed text.

In [None]:
from nltk import ngrams

# Generate and visualize bigrams
if 'tokens' in locals():
    # Count every pair of adjacent tokens
    bigrams_list = list(ngrams(tokens, 2))
    bigram_freq = FreqDist(bigrams_list)

    # Tabulate the 20 most frequent pairs, rendering each pair as "first second"
    top_bigrams = bigram_freq.most_common(20)
    bigram_df = pd.DataFrame({
        'Bigram': [f"{first} {second}" for (first, second), _ in top_bigrams],
        'Count': [count for _, count in top_bigrams],
    })

    # Horizontal bars: one row per bigram, bar length = frequency
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(data=bigram_df, x='Count', y='Bigram', ax=ax)
    ax.set_title('Top 20 Bigrams in Subtitle Content')
    fig.tight_layout()
    plt.show()

## 5. Summary and Insights

In [None]:
# Create summary statistics (left empty when no subtitle files were loaded)
summary = {}
if not df_subtitles.empty:
    # `tokens` exists only if the text-analysis cell has run in this kernel
    tokens_available = 'tokens' in locals()
    unique_words = len(set(tokens)) if tokens_available else 'N/A'
    # Also require a non-empty token list: every token may have been filtered
    # out, and dividing by len(tokens) == 0 would raise ZeroDivisionError.
    vocabulary_richness = (
        unique_words / len(tokens) if tokens_available and tokens else 'N/A'
    )

    summary = {
        'Total Files': len(df_subtitles),
        'Total Words': df_subtitles['word_count'].sum(),
        'Avg Words per File': df_subtitles['word_count'].mean(),
        'Min Words': df_subtitles['word_count'].min(),
        'Max Words': df_subtitles['word_count'].max(),
        'Unique Words': unique_words,
        'Vocabulary Richness': vocabulary_richness
    }

summary_df = pd.DataFrame(list(summary.items()), columns=['Metric', 'Value'])
# Kept as the last top-level expression: Jupyter renders only that, so a bare
# `summary_df` nested inside the `if` block would never be displayed.
summary_df

## Next Steps

Now that we've explored the subtitle data, consider:

1. Adding more subtitle files to increase the corpus size
2. Creating a TF-IDF representation of documents for search
3. Implementing semantic embeddings for enhanced search
4. Building a search interface using Streamlit

See the `search_demo.ipynb` notebook for examples of using the search engine.