In [3]:
# screenplays
from bertopic import BERTopic
import os

def load_screenplays_from_folder(folder_path):
    screenplays = []
    filenames = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
                screenplays.append(text)
                filenames.append(filename)
    return screenplays, filenames

# Define paths
train_path = os.path.join("..", "data", "screenplays", "train")
test_path = os.path.join("..", "data", "screenplays", "test")

# Load data
X_train, train_filenames = load_screenplays_from_folder(train_path)
X_test, test_filenames = load_screenplays_from_folder(test_path)

# Quick checks
print(f"Loaded {len(X_train)} training screenplays")
print(f"Loaded {len(X_test)} test screenplays")


Loaded 98 training screenplays
Loaded 11 test screenplays


### LOAD WORDS TO REMOVE FROM SCREENPLAY
- stopwords
- 200 most common words in screenplays

In [47]:
# load character names
import nltk
from nltk.corpus import stopwords
import json
import re

def load_character_names(movie_title):
    with open(f"../data/movie_data/{movie_title.replace(' ', '_')}.json", "r", encoding="utf-8") as file2:
        character_names = json.load(file2)
        
    characters = [dicty["character"] for dicty in character_names["actors_characters"]]
    characters_cleaned = []
    for char in characters:
         names = char.split(" ")
         for name in names: 
             name = re.sub(r"[^a-z]", "", name.lower())
             if name != "":
                 characters_cleaned.append(name)

    return characters_cleaned

def load_words(top_n_common=50):  # previously 200
    # Load word frequency list
    with open("../data/other/word_frequencies.json", "r", encoding="utf-8") as file:
        word_frequencies = json.load(file)

    # Keep only top 50 frequent words instead of 200
    common_words = list(word_frequencies.keys())[:top_n_common]

    # Limit built-in stopwords — keep pronouns, conjunctions, etc.
    minimal_stopwords = {"the", "a", "an", "of", "and", "in", "on", "at", "to"}

    return common_words + list(minimal_stopwords)

    
words_to_remove = load_words(top_n_common=200)
#characters = load_character_names(movie_title="Die Hard")
print("Words loaded.\n")

Words loaded.



### CLEAN TEXT
- Lowercase, remove non-letter characters
- filter out common words and stopwords
- filter out proper nouns and family names via NER

In [22]:
# clean text
#!pip install spacy
#!python -m spacy download en_core_web_sm

import spacy

# Load spaCy NER model (do this once at the top of your notebook)
nlp = spacy.load("en_core_web_sm")  # You can upgrade to en_core_web_trf if needed

# Define family words that may not be tagged as entities
FAMILY_TERMS = {
    "mom", "dad", "mother", "father", "sister", "brother",
    "uncle", "aunt", "grandma", "grandpa", "cousin", "stepmom", "stepdad"
}

def load_nouns(screenplay_text=None, movie_title=None):
    """
    Extracts PERSON entities and family-related nouns from a screenplay.
    Returns a list of lowercase words to remove.
    """
    if screenplay_text is None:
        raise ValueError("You must pass in screenplay_text for NER.")

    doc = nlp(screenplay_text)

    # Extract named entities of type PERSON
    names = {ent.text.lower() for ent in doc.ents if ent.label_ == "PERSON"}

    # Add family-related words (case insensitive)
    family_words = FAMILY_TERMS

    # Merge and return as list
    nouns_to_remove = list(names.union(family_words))
    return nouns_to_remove


def clean_and_chunk_text(screenplay_text, movie_title, top_n_common=200, verbose=True, chunk_size=35):
    cleaned_words = screenplay_text.lower().replace("\n", " ")                
    cleaned_words = [re.sub(r"[^a-zA-Z]", "", word.strip()) for word in cleaned_words.split(" ") if word.strip() != ""]
    
    # Filter out stop words, character names, and common words
    words_to_remove = load_words(top_n_common=200)
    names_to_remove = load_nouns(screenplay_text=screenplay_text, movie_title=movie_title)
    filtered_words = [word for word in cleaned_words if word not in words_to_remove and word not in names_to_remove]

    # filter out short words
    filtered_words = [word for word in filtered_words if len(word) > 2]

    # chunks
    chunks = [" ".join(filtered_words[i:i+chunk_size]) for i in range(0, len(filtered_words), chunk_size)]

    if verbose:
        print(f"Original words: {len(cleaned_words)}, Filtered words: {len(filtered_words)}")
        print(f"Total chunks: {len(chunks)}")

    return chunks

sample = """
INT. DINER – NIGHT

JOHN sits across from MARY. His face is bruised.

JOHN
(quietly)
I didn’t think I’d see you again.

MARY
You don’t look so good.

The WAITRESS drops off the check.
"""

print(load_nouns(sample))


['father', 'sister', 'stepmom', 'grandma', 'mother', 'grandpa', 'uncle', 'mary', 'john', 'cousin', 'stepdad', 'brother', 'mom', 'dad', 'aunt']


### RUN BERTOPIC

In [24]:
#!pip install matplotlib
import hdbscan
from umap import UMAP
import matplotlib.pyplot as plt
import plotly.io as pio

def create_plots(topic_model, label="train_corpus", top_n_topics=10):
    # barcharts
    fig = topic_model.visualize_barchart(top_n_topics=top_n_topics)
    fig.update_layout(title_text=f"Top {top_n_topics} BERTopic distributions in {label}", title_x=0.5)
    fig.write_html(f"plots/{label}_bertopic_barchart.html")

    # intertopic distances
    fig2 = topic_model.visualize_topics()
    fig2.update_layout(title_text=f"Intertopic Distance Map for {label}", title_x=0.5)
    fig2.write_html(f"plots/{label}_intertopic_distance.html")

    # dendrogram
    fig3 = topic_model.visualize_hierarchy()
    fig3.write_html(f"plots/{label}_dendrogram.html")

    print("Plots saved.")


def create_model_and_plots(chunks, embeddings, label="train_corpus"):
    hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=3,
                                    min_samples=1,
                                    cluster_selection_epsilon=0.1)

    umap_model = UMAP(n_components=10, n_neighbors=15, min_dist=0.05, metric='cosine')

    topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)
    topics, probs = topic_model.fit_transform(chunks, embeddings)

    create_plots(topic_model, label=label, top_n_topics=10)


In [46]:
import re
from sentence_transformers import SentenceTransformer
import time

def overlapping_chunks(words, chunk_size=250, overlap=50):
    step = chunk_size - overlap
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), step)]

start_time = time.time()

embedder = SentenceTransformer("all-MiniLM-L6-v2")
chunks = []

for screenplay_text, filename in zip(X_train, train_filenames):
    movie_title = filename.replace(".txt", "")
    
    if screenplay_text:
        cleaned = clean_and_chunk_text(screenplay_text, movie_title, top_n_common=200, verbose=False, chunk_size=99999)
        words = " ".join(cleaned).split()
        all_chunks = overlapping_chunks(words, chunk_size=250, overlap=50)

        # Drop the first chunk if it looks like metadata
        if len(all_chunks) > 1:
            chunks += all_chunks[1:]
        else:
            chunks += all_chunks

embeddings = embedder.encode(chunks, show_progress_bar=True, batch_size=256)

end_time = time.time()
print(f"Time: {round(end_time - start_time, 2)} seconds\nEmbeddings created!")

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Time: 433.87 seconds
Embeddings created!


In [49]:
from umap import UMAP
import hdbscan
from bertopic import BERTopic

start_time = time.time()

hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=20,
    min_samples=5,
    cluster_selection_epsilon=0.05,
    prediction_data=True  # <-- REQUIRED for topic_model to calculate probabilities
)

umap_model = UMAP(
    n_components=5,                # fewer dims = tighter clusters
    n_neighbors=30,                # smoother local structure
    min_dist=0.2,
    metric='cosine'
)

topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(chunks, embeddings)

end_time = time.time()
print(f"Time: {round(end_time - start_time, 2)} seconds\nBERTopic trained!")

2025-04-05 14:24:10,583 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-05 14:24:12,194 - BERTopic - Dimensionality - Completed ✓
2025-04-05 14:24:12,195 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-05 14:24:12,923 - BERTopic - Cluster - Completed ✓
2025-04-05 14:24:12,926 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-05 14:24:13,907 - BERTopic - Representation - Completed ✓


Time: 4.05 seconds
BERTopic trained!


In [None]:
topic_model.get_topic_info()

In [50]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Get topic-word matrix
topics = topic_model.get_topics()
top_n = 10

# Filter out -1 topic (outliers)
valid_topics = [k for k in topics.keys() if k != -1]

# 1. Topic Coherence (avg cosine sim of top words)
def topic_coherence(topics_dict, top_n=10):
    vectorizer = CountVectorizer()
    coherence_scores = []
    for topic in valid_topics:
        words = [word for word, _ in topics_dict[topic][:top_n]]
        if len(words) < 2:
            continue
        X = vectorizer.fit_transform(words).toarray()
        sim = cosine_similarity(X)
        upper_tri = sim[np.triu_indices_from(sim, k=1)]
        coherence_scores.append(np.mean(upper_tri))
    return np.mean(coherence_scores)

# 2. Topic Diversity (fraction of unique words in top-N)
def topic_diversity(topics_dict, top_n=10):
    all_words = []
    for topic in valid_topics:
        all_words.extend([word for word, _ in topics_dict[topic][:top_n]])
    unique_words = set(all_words)
    return len(unique_words) / (top_n * len(valid_topics))

coherence = topic_coherence(topics, top_n=top_n)
diversity = topic_diversity(topics, top_n=top_n)

print(f"Coherence Score (avg top-{top_n} word similarity): {coherence:.4f}")
print(f"Diversity Score (unique top-{top_n} words): {diversity:.4f}")


Coherence Score (avg top-10 word similarity): 0.0000
Diversity Score (unique top-10 words): 0.9216


In [44]:
for i in range(5):
    print(f"\n--- Chunk {i} ---\n{chunks[i]}")



--- Chunk 0 ---
space odyssey screenplay stanley kubrick arthur clark hawk films ltd mgm studios boreham wood herts title part africa years ago views african drylands drought remorseless drought lasted ten million years million reign ter rible lizards since passed continent known africa battle survival reached climax ferocity victor yet sight dry barren land swift fierce flourish hope exist caves manapes field none attributes pathetic road racial extinction twenty occupied group caves overlooking parched valley divided sluggish brown stream tribe always hungry starving dim glow dawn creeps discovers died relationship beyond understanding emac iated feels akin sadness carries leaves hyenas among kind almost giant nearly five high though badly undernourished weighs hundred pounds hairy muscular quite manlike already nearer ape forehead low ridges eyesockets yet unmistakably genes promise humanity upon hostile world already gaze beyond grasp ape dark deepset dawning awarenessthe intima t

In [45]:
print(f"Total chunks: {len(chunks)}")
print(f"Embeddings shape: {embeddings.shape}")

Total chunks: 4406
Embeddings shape: (4406, 384)
