## Imports

In [1]:
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import plotly.graph_objects as go
import plotly.offline as opy
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
from tqdm import tqdm

# Fetch NLTK's "punkt" data package at notebook start-up.
# NOTE(review): presumably this is the model that sent_tokenize / word_tokenize rely on;
# newer NLTK releases may require 'punkt_tab' instead — TODO confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Get Movie Script

In [2]:
# set ngram number (length of the token windows compared between script and books)
ngram_length = 4

# get movie script and tokenize
script = pd.read_csv("./data/lotr_script_extended.csv").drop("Unnamed: 0", axis=1)

# consecutive integer id per distinct scene, in order of first appearance
script["scene_num"] = script["scene"].factorize()[0]

# split every line into sentences, tokenize each sentence, and flatten into one token list per line
script["tokenized_line"] = [
    nltk.flatten([word_tokenize(sent) for sent in sent_tokenize(line)])
    for line in script["text"]
]

# lower-case the tokens we already have instead of tokenizing every line a second time
script["tokenized_line_lower"] = [
    [token.lower() for token in tokens] for tokens in script["tokenized_line"]
]

script["num_words"] = script["tokenized_line"].str.len()

# NOTE(review): a line with N tokens yields max(N - ngram_length + 1, 0) n-grams, whereas this
# expression subtracts 2 * (ngram_length - 1) and becomes negative for short lines — confirm
# that this is the intended normalisation before relying on it.
script["num_words_ngrams"] = script["num_words"] - 2 * ngram_length + 2
script


Unnamed: 0,character,text,scene,movie,scene_num,tokenized_line,tokenized_line_lower,num_words,num_words_ngrams
0,GALADRIEL VOICE OVER,I amar prestar aen . Han mathon ne nen Han ma...,Prologue: One Ring to Rule Them All...,The Fellowship of the Ring,0,"[I, amar, prestar, aen, ., Han, mathon, ne, ne...","[i, amar, prestar, aen, ., han, mathon, ne, ne...",33,27
1,GALADRIEL VOICE OVER,It began with the forging of the Great Rings.,Prologue: One Ring to Rule Them All...,The Fellowship of the Ring,0,"[It, began, with, the, forging, of, the, Great...","[it, began, with, the, forging, of, the, great...",10,4
2,GALADRIEL VOICE OVER,"Three were given to the elves. Immortal, wise...",Prologue: One Ring to Rule Them All...,The Fellowship of the Ring,0,"[Three, were, given, to, the, elves, ., Immort...","[three, were, given, to, the, elves, ., immort...",52,46
3,GALADRIEL VOICE OVER,For within these rings was bound the strength ...,Prologue: One Ring to Rule Them All...,The Fellowship of the Ring,0,"[For, within, these, rings, was, bound, the, s...","[for, within, these, rings, was, bound, the, s...",82,76
4,GALADRIEL VOICE OVER,One by one the free lands of Middle Earth fell...,Prologue: One Ring to Rule Them All...,The Fellowship of the Ring,0,"[One, by, one, the, free, lands, of, Middle, E...","[one, by, one, the, free, lands, of, middle, e...",54,48
...,...,...,...,...,...,...,...,...,...
2810,FRODO,We set out to save the Shire Sam and it has be...,The Grey Havens,The Return of the King,180,"[We, set, out, to, save, the, Shire, Sam, and,...","[we, set, out, to, save, the, shire, sam, and,...",19,13
2811,SAM,You don’t mean that. You can’t leave.,The Grey Havens,The Return of the King,180,"[You, don, ’, t, mean, that, ., You, can, ’, t...","[you, don, ’, t, mean, that, ., you, can, ’, t...",13,7
2812,FRODO,The last pages are for you Sam.,The Grey Havens,The Return of the King,180,"[The, last, pages, are, for, you, Sam, .]","[the, last, pages, are, for, you, sam, .]",8,2
2813,FRODO VOICE OVER,"My dear Sam, you cannot always be torn in two....",The Grey Havens,The Return of the King,180,"[My, dear, Sam, ,, you, can, not, always, be, ...","[my, dear, sam, ,, you, can, not, always, be, ...",47,41


## Get Book Data

In [3]:
# The three volumes, in reading order; all files are stored as cp1250.
book_paths = (
    "./data/books/01 - The Fellowship Of The Ring.txt",
    "./data/books/02 - The Two Towers.txt",
    "./data/books/03 - The Return Of The King.txt",
)

book_texts = []
for book_path in book_paths:
    with open(book_path, encoding='cp1250') as f:
        book_texts.append(f.read())

# keep the per-volume names available alongside the concatenated text
book1_text, book2_text, book3_text = book_texts
books_text = "".join(book_texts)

# Tokenize the lower-cased text and slide a window of `ngram_length` tokens over it
# (no padding is added, so the i-th n-gram starts at token i).
books_tokens = word_tokenize(books_text.lower())
books_data = list(ngrams(books_tokens, n=ngram_length))

books_data[:5]

[('three', 'rings', 'for', 'the'),
 ('rings', 'for', 'the', 'elven-kings'),
 ('for', 'the', 'elven-kings', 'under'),
 ('the', 'elven-kings', 'under', 'the'),
 ('elven-kings', 'under', 'the', 'sky')]

## Find ngrams of Script in Books

In [4]:
# n-grams that start at more than this many positions in the books are treated as
# too common to carry information and are ignored
max_book_occurrences = 3

# make dict from ngrams: n-gram -> list of positions in `books_data` where it occurs
books_dict = defaultdict(list)
for position, ngram in enumerate(books_data):
    books_dict[ngram].append(position)

# remove ngrams that occur more often than `max_book_occurrences` times
# (the keys are collected first because a dict must not change size while it is iterated)
too_common = [key for key, positions in books_dict.items() if len(positions) > max_book_occurrences]
for key in too_common:
    del books_dict[key]

# check where ngrams of script appear in book
# Matching is done on the lower-cased tokens, but the original-case n-gram is used as the key.
# The per-line dicts are collected in a plain list and assigned once, which avoids a
# `script.loc[i, ...]` lookup per hit and does not depend on the frame having a 0..n-1 index.
occurrences_per_line = []
for tokenized_line, tokenized_line_lower in zip(script["tokenized_line"], script["tokenized_line_lower"]):
    line_hits = {}
    for ngram, ngram_lower in zip(ngrams(tokenized_line, n=ngram_length), ngrams(tokenized_line_lower, n=ngram_length)):
        # `in` does not insert missing keys into the defaultdict
        if ngram_lower in books_dict:
            line_hits[ngram] = books_dict[ngram_lower]
    occurrences_per_line.append(line_hits)

script["occurances"] = occurrences_per_line

# number of distinct script n-grams of each line that were found in the books
script["num_words_found_book"] = script["occurances"].str.len()
script

Unnamed: 0,character,text,scene,movie,scene_num,tokenized_line,tokenized_line_lower,num_words,num_words_ngrams,occurances,num_words_found_book
0,GALADRIEL VOICE OVER,I amar prestar aen . Han mathon ne nen Han ma...,Prologue: One Ring to Rule Them All...,The Fellowship of the Ring,0,"[I, amar, prestar, aen, ., Han, mathon, ne, ne...","[i, amar, prestar, aen, ., han, mathon, ne, ne...",33,27,{},0
1,GALADRIEL VOICE OVER,It began with the forging of the Great Rings.,Prologue: One Ring to Rule Them All...,The Fellowship of the Ring,0,"[It, began, with, the, forging, of, the, Great...","[it, began, with, the, forging, of, the, great...",10,4,"{('of', 'the', 'Great', 'Rings'): [25118]}",1
2,GALADRIEL VOICE OVER,"Three were given to the elves. Immortal, wise...",Prologue: One Ring to Rule Them All...,The Fellowship of the Ring,0,"[Three, were, given, to, the, elves, ., Immort...","[three, were, given, to, the, elves, ., immort...",52,46,"{('to', 'the', 'elves', '.'): [187602], ('abov...",2
3,GALADRIEL VOICE OVER,For within these rings was bound the strength ...,Prologue: One Ring to Rule Them All...,The Fellowship of the Ring,0,"[For, within, these, rings, was, bound, the, s...","[for, within, these, rings, was, bound, the, s...",82,76,"{('But', 'they', 'were', 'all'): [472195], ('t...",4
4,GALADRIEL VOICE OVER,One by one the free lands of Middle Earth fell...,Prologue: One Ring to Rule Them All...,The Fellowship of the Ring,0,"[One, by, one, the, free, lands, of, Middle, E...","[one, by, one, the, free, lands, of, middle, e...",54,48,"{('One', 'by', 'one', 'the'): [96018, 172324],...",8
...,...,...,...,...,...,...,...,...,...,...,...
2810,FRODO,We set out to save the Shire Sam and it has be...,The Grey Havens,The Return of the King,180,"[We, set, out, to, save, the, Shire, Sam, and,...","[we, set, out, to, save, the, shire, sam, and,...",19,13,"{('to', 'save', 'the', 'Shire'): [34299, 56511...",9
2811,SAM,You don’t mean that. You can’t leave.,The Grey Havens,The Return of the King,180,"[You, don, ’, t, mean, that, ., You, can, ’, t...","[you, don, ’, t, mean, that, ., you, can, ’, t...",13,7,{},0
2812,FRODO,The last pages are for you Sam.,The Grey Havens,The Return of the King,180,"[The, last, pages, are, for, you, Sam, .]","[the, last, pages, are, for, you, sam, .]",8,2,"{('last', 'pages', 'are', 'for'): [564311], ('...",2
2813,FRODO VOICE OVER,"My dear Sam, you cannot always be torn in two....",The Grey Havens,The Return of the King,180,"[My, dear, Sam, ,, you, can, not, always, be, ...","[my, dear, sam, ,, you, can, not, always, be, ...",47,41,"{('My', 'dear', 'Sam', ','): [562852], (',', '...",21


## Split into scenes

In [5]:
# cut book into chapters: every "_chapter" token opens a new chapter, position 0 opens the prologue
chapter_starts = [0]
chapter_starts.extend(pos for pos, token in enumerate(books_tokens) if token == "_chapter")
chapter_titles = ["Prolog", *re.findall(r"_Chapter\s[0-9]*_\n\s*(.*)", books_text)]
# NOTE(review): starts and titles are extracted independently (tokens vs. regex on the raw
# text); they are assumed to line up one-to-one — worth verifying on new input files.
books_data = pd.DataFrame([chapter_starts, chapter_titles], index=["start", "name"]).T

# one row per movie scene, in order of first appearance (matches `scene_num`)
lines_by_scene = script.groupby("scene_num")
scene_data = pd.DataFrame({"name": script["scene"].unique()})
scene_data["occurances"] = lines_by_scene["occurances"].apply(list)
scene_data["total_ngram_count"] = lines_by_scene["num_words_ngrams"].sum()

# one hit counter column per book chapter, filled in by a later cell
for chapter in books_data["name"]:
    scene_data[chapter] = 0

scene_data


Unnamed: 0,name,occurances,total_ngram_count,Prolog,A Long-expected Party,The Shadow of the Past,Three is Company,A Short Cut to Mushrooms,A Conspiracy Unmasked,The Old Forest,...,The Black Gate Opens,The Tower of Cirith Ungol,The Land of Shadow,Mount Doom,The Field of Cormallen,The Steward and the King,Many Partings,Homeward Bound,The Scouring of the Shire,The Grey Havens
0,Prologue: One Ring to Rule Them All...,"[{}, {('of', 'the', 'Great', 'Rings'): [25118]...",433,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Concerning Hobbits,"[{('.', 'The', 'Third', 'Age'): [532505], ('Th...",227,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Shire,"[{(',', 'Down', 'from', 'the'): [380965], ('Do...",233,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Very Old Friends,"[{}, {}, {}, {}, {}, {('good', 'to', 'see', 'y...",347,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A Long-expected Party,"[{}, {}, {(',', 'I', 'think', 'I'): [272615], ...",292,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,The End of All Things,"[{}, {}, {('The', 'Brandywine', 'River', '.'):...",68,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
177,The Fellowship Reunited,"[{}, {}, {}, {}, {}, {}]",-24,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
178,The Return of the King,"[{('come', 'the', 'days', 'of'): [530758], ('d...",37,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
179,Homeward Bound,"[{('the', 'Fellowship', 'of', 'the'): [10417, ...",119,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Combine connected ngrams

In [6]:
def loc_to_chapter(loc):
    """
    Matches location of ngram to book chapter

    Looks ``loc`` up in the global ``books_data`` frame, whose ``start`` column holds the
    first position of each chapter and whose ``name`` column holds the chapter title.
    A chapter covers ``start <= loc < next start``; the last chapter is open-ended.

    :param loc: position to look up (compared against ``books_data["start"]``)
    :return: the ``name`` entry of the chapter containing ``loc``
    :raises ValueError: if ``loc`` lies before the first chapter start or ``books_data`` is empty
    """
    starts = books_data["start"]
    names = books_data["name"]

    for i in range(len(books_data) - 1):
        if starts.iloc[i] <= loc < starts.iloc[i + 1]:
            return names.iloc[i]

    # The last chapter has no successor to bound it. Its own first position belongs to it
    # as well, hence `>=` (a strict `>` would raise for loc == last start).
    if len(books_data) > 0 and loc >= starts.iloc[-1]:
        return names.iloc[-1]

    raise ValueError("Loc not found")


def combine_tokens(token_list):
    """
    Tries to meaningfully combine tokens and removes unnecessary whitespace

    The tokens are joined with single spaces; afterwards every ``,``, ``.``, ``?`` and ``!``
    is re-attached to the preceding text and followed by exactly one space. A mark at the
    very end of the text therefore leaves one trailing space.

    :param token_list: iterable of string tokens
    :return: the combined text (empty string for an empty token list)
    """
    text = " ".join(token_list)

    # Raw-string patterns avoid the invalid "\s" / "\." escapes of plain string literals.
    # The marks are handled one at a time, in this order, so that runs of adjacent
    # punctuation collapse exactly as with four separate substitutions.
    for mark in ",.?!":
        text = re.sub(r"\s*" + re.escape(mark) + r"\s*", mark + " ", text)
    return text


In [7]:
# One row per movie scene, one column per book chapter; each cell collects the tokens of the
# phrases that the scene shares with that chapter.
corresponding_text = pd.DataFrame({"name": script["scene"].unique()})
for chapter in books_data["name"]:
    corresponding_text[chapter] = [[] for _ in range(len(scene_data))]
    # Reset the hit counter as well: the loop below increments `scene_data` in place, so
    # without this a second run of the cell would double every count.
    scene_data[chapter] = 0

for i, combined_occurances in enumerate(tqdm(scene_data["occurances"])):
    for occurances in combined_occurances:
        for ngram, occurance in occurances.items():
            ngram = list(ngram)  # tuple to list for comparison
            for loc in occurance:

                chapter_result = loc_to_chapter(loc)

                # The cell holds a list object; fetch it once and mutate it in place.
                fragment = corresponding_text.at[i, chapter_result]

                if fragment[-ngram_length:] == ngram:
                    # duplicate found
                    continue

                # map hits to correct scene
                scene_data.loc[i, chapter_result] += 1

                # try to reconstruct complete sentence: if the new n-gram overlaps the tail of
                # the fragment by all but its last token, it continues the same phrase
                if fragment[-ngram_length + 1:] == ngram[:-1]:
                    # append only 1 new token
                    fragment.append(ngram[-1])
                else:
                    # separate with a line break and append the whole new ngram
                    fragment.append("<br>")
                    fragment.extend(ngram)


# Turn every token list into display text. Column-wise Series.map is used instead of the
# deprecated DataFrame.applymap and works on both old and new pandas versions.
corresponding_text[chapter_titles] = corresponding_text[chapter_titles].apply(
    lambda column: column.map(combine_tokens)
)
corresponding_text


100%|██████████| 181/181 [00:03<00:00, 49.44it/s]


Unnamed: 0,name,Prolog,A Long-expected Party,The Shadow of the Past,Three is Company,A Short Cut to Mushrooms,A Conspiracy Unmasked,The Old Forest,In the House of Tom Bombadil,Fog on the Barrow-Downs,...,The Black Gate Opens,The Tower of Cirith Ungol,The Land of Shadow,Mount Doom,The Field of Cormallen,The Steward and the King,Many Partings,Homeward Bound,The Scouring of the Shire,The Grey Havens
0,Prologue: One Ring to Rule Them All...,<br>. One Ring to,"<br> It came to me <br>, my Precious.",<br> of the Great Rings <br>. One Ring to <br>...,<br> It came to me,,,,,,...,,<br> power of the Ring <br> of its own.,<br>. And the Ring,<br> slopes of Mount Doom,<br> And the Ring of,<br> the hearts of Men,,,,
1,Concerning Hobbits,"<br> The Third Age of <br> and back again, <br...","<br> of the Shire for <br>. In fact, it <br> i...",,<br> There and back again,,"<br>. In fact,",,,,...,,,,,,<br>. The Third Age of <br> all things that gr...,,,,<br> There and back again
2,The Shire,,<br> Down from the door where it began <br> An...,<br> I can. The,<br> Down from the door where it began <br> An...,<br> is going to be <br> have something to do,<br> something to do with,,,"<br> under the Hill,",...,,<br> going to be a,,,,,<br> from the door where it began <br> road go...,,"<br>, if it comes",<br> is going to be <br> very much as it
3,Very Old Friends,,<br>. One hundred and <br> hanging on the bell...,,,,"<br>. In fact, I",,,,...,,,,,,,"<br>, do you?","<br>, come in!",,
4,A Long-expected Party,<br> for a thousand years,"<br> they were going to <br>, I am. <br> dear ...","<br>. I think it <br>, Bracegirdles, and <br> ...",<br> over the top of,<br> but it was n't,<br> as well as I,,,<br> one by one and,...,,<br> I 'll just have,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,The End of All Things,,,,<br> The Brandywine River.,,,,,,...,,,,<br> here at the end <br> the end of all things,<br> the end of all things,,,,,
177,The Fellowship Reunited,,,,,,,,,,...,,,,,,,,,,
178,The Return of the King,,,,,,,,,,...,,,,,,<br> come the days of <br> days of the King <b...,,,,
179,Homeward Bound,<br> the Fellowship of the Ring <br> and Back ...,,,<br> There and Back Again,,,,,,...,,"<br> the Ring, though",,,,,<br> the Fellowship of the Ring,,,<br> There and Back Again <br> Lord of the Rings


## Plot Heatmaps of Movies

## Plot for each movie

In [8]:
# num_scenes = len(scene_data)
# num_chapters = len(chapter_titles)

# div = ""
# for movie in script["movie"].unique():
#     scenes = script[script["movie"] == movie]["scene"].unique()
#     corresponding_text_filtered = corresponding_text[corresponding_text["name"].isin(
#         scenes)].copy()

#     x = scenes
#     y = chapter_titles
#     z = corresponding_text_filtered[chapter_titles].T.values

#     hovertext = list()
#     for yi, yy in enumerate(y):
#         hovertext.append(list())
#         for xi, xx in enumerate(x):
#             hovertext[-1].append(
#                 'Movie Scene: {}<br />Book Chapter: {}<br />Overlapping Phrases: {}'.format(xx, yy, z[yi][xi]))

#     fig = go.Figure(data=go.Heatmap(
#         # scene_data[chapter_titles].T,
#         z=scene_data[scene_data["name"].isin(scenes)][chapter_titles].T,
#         # y="Book Chapter", x="Movie Scene", color=f"Nr of copied lines of length {ngram_length}",
#         y=chapter_titles,
#         x=scenes,
#         hoverinfo="text",
#         text=hovertext,
#         colorscale='Greens',
#         showscale=False
#         # colorbar=dict(title='Number of overlapping phrases')
#     ))
#     fig.update_layout(
#         title=f'<span style="font-size: 25px">{movie}</span>',
#         xaxis_title="Movie Scenes",
#         yaxis_title="Book Chapters",
#         yaxis_nticks=len(chapter_titles),
#         xaxis_nticks=len(scenes),
#         font_size=9,
#         autosize=False,
#         width=1500,
#         height=700
#     )
#     fig.update_coloraxes(showscale=False)

#     # fig.show()
#     div += opy.plot(fig, auto_open=False, output_type='div')

# with open(f"div_file", "w") as f:
#     f.write(div)

## Combined Plot

In [23]:
scenes = script["scene"].unique()

# Heatmap orientation: rows (y) = movie scenes, columns (x) = book chapters.
# `z` and the hover text must share this orientation, otherwise the tooltip of a cell
# shows the phrases of a different scene/chapter pair.
hit_counts = scene_data[chapter_titles].to_numpy()
phrases = corresponding_text[chapter_titles].to_numpy()

hovertext = [
    [
        'Movie Scene: {}<br />Book Chapter: {}<br />Overlapping Phrases: {}'.format(
            scene, chapter, phrases[row][col])
        for col, chapter in enumerate(chapter_titles)
    ]
    for row, scene in enumerate(scenes)
]

fig = go.Figure(data=go.Heatmap(
    z=hit_counts,
    x=chapter_titles,
    y=scenes,
    hoverinfo="text",
    text=hovertext,
    colorscale='Greens',
    showscale=False
))

fig.update_layout(
    yaxis_title="Movie Scenes",
    xaxis_title="Book Chapters",
    font_size=15,
    autosize=False,
    width=1000,
    height=2000,
)

fig.update_coloraxes(showscale=False)
fig["layout"]["xaxis"].update(side="top")
fig.update_xaxes(tickangle=45)
fig.show()

# Export the figure as an embeddable <div>. The hover text contains non-ASCII characters
# (e.g. typographic apostrophes), so the encoding is set explicitly instead of relying on
# the platform default.
with open("div_file", "w", encoding="utf-8") as f:
    f.write(opy.plot(fig, auto_open=False, output_type='div'))