## IRTM Outlander Analysis
By Simona Vychytilova and Meike Thijsen

In [174]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\meike\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\meike\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\meike\AppData\Roaming\nltk_data...


### Preprocessing

#### Bookwise preprocessing

In [61]:
# Titles of the eight books covered by this analysis.
books = [
    'Outlander',
    'Dragonfly in Amber',
    'Voyager',
    'Drums of Autumn',
    'The Fiery Cross',
    'A Breath of Snow and Ashes',
    'An Echo in the Bone',
    'Written in My Own Heart’s Blood',
]

# Additional section titles (presumably trailing sections of the data file -- TODO confirm).
extras = [
    'Other Books by this Author',
    'About the Author',
]

# Line indices in data.txt that delimit the books: book n (1-based) lies
# between bookstarts[n - 1] and bookstarts[n], hence one more entry than books.
bookstarts = [
    50,
    17287,
    37378,
    61857,
    89432,
    119494,
    152540,
    177800,
    202059,
]

In [129]:
# Load the text of one book (selected by booknum, 1-based) from data.txt into `book`.
booknum = 1

# Both bounds are exclusive: only lines whose 0-based index lies strictly
# between bookstarts[booknum - 1] and bookstarts[booknum] are kept.
first_excluded = bookstarts[booknum - 1]
end_excluded = bookstarts[booknum]

book_lines = []
# The context manager guarantees the file is closed, even if reading fails.
with open("data.txt", "r", encoding="utf8") as f:
    for i, line in enumerate(f):
        if i >= end_excluded:
            # Past the end of the requested book: no need to read the rest of the file.
            break
        if i > first_excluded:
            book_lines.append(line)

# Join once instead of concatenating strings inside the loop.
book = ''.join(book_lines)

In [142]:
# We want to segment the data into chapters, because we want to see the topic per chapter. But how do we do that?
# The author divided the books into parts; this is how we can see when the first chapter begins. Then we search for a
# line that is written in all caps; after this the chapter starts. Example: \n\n\n\n\n\n1\n\nA NEW BEGINNING\n\n
#
# Chapter counts noted by the authors (presumably one value per book, books 1-8 -- TODO confirm):
# actual: 41, 49, 63, 71, 111, 124, 103, 145
# found:  41, 49, 63, 71, 111, 124, 0, 0
def split_into_chapters(booknum, text):
    """Split the text of one book into chapter-sized pieces.

    Parameters
    ----------
    booknum : int
        Number of the book; selects which delimiter pattern is used.
    text : str
        Text of the book to split.

    Returns
    -------
    list of str
        The pieces produced by ``re.split``. The first element is whatever
        precedes the first delimiter; the delimiters themselves are dropped.
    """
    if booknum < 7:
        # Delimiter: a line consisting only of digits, followed by an empty line.
        return re.split(r'\n[0-9]+\n\n', text)
    # needs refinement
    # Delimiter: a run of upper-case letters/spaces followed by an empty line.
    return re.split(r'[A-Z ]+\n\n', text)

In [155]:
def split_into_paragraphs(text):
    """Return the non-empty lines of ``text``, in their original order.

    The text is cut at every newline character; lines that are exactly the
    empty string are discarded (whitespace-only lines are kept).
    """
    paragraphs = []
    for piece in text.split('\n'):
        if piece == '':
            continue
        paragraphs.append(piece)
    return paragraphs

In [156]:
# Sanity check: take the piece at index 4 of the chapter split and print its paragraphs.
sample_chapter = split_into_chapters(1, book)[4]
print(split_into_paragraphs(sample_chapter))


['I COME TO THE CASTLE', 'The rest of the journey passed uneventfully, if you consider it uneventful to ride fifteen miles on horseback through rough country at night, frequently without benefit of roads, in company with kilted men armed to the teeth, and sharing a horse with a wounded man. At least we were not set upon by highwaymen, we encountered no wild beasts, and it didn’t rain. By the standards I was becoming used to, it was quite dull.', 'Dawn was coming up in streaks and slashes over the foggy moor. Our destination loomed ahead, a huge bulk of dark stone outlined by the grey light.', 'The surroundings were no longer quiet and deserted. There was a trickle of rudely dressed people, heading toward the castle. They moved to the side of the narrow road to let the horses trot past, gawking at what they plainly thought my outlandish garb.', 'Not surprisingly, it was misting heavily, but there was enough light to show a stone bridge, arching over a small stream that ran past the fron

#### Linguistic Preprocessing

In [170]:
# Tokenise the current book, chapter by chapter.
# per_paragraph = True  -> each chapter becomes a list of paragraphs, each a list of tokens
# per_paragraph = False -> each chapter becomes one flat list of tokens
per_paragraph = True

# A token is a run of ASCII letters and/or the typographic apostrophe.
tokenizer = RegexpTokenizer(r"[a-zA-Z’]+")
chapters = split_into_chapters(booknum, book)

if per_paragraph:
    tchapters = [
        [tokenizer.tokenize(paragraph) for paragraph in split_into_paragraphs(chapter)]
        for chapter in chapters
    ]
else:
    tchapters = [tokenizer.tokenize(chapter) for chapter in chapters]

In [177]:
# Display the tokenised first piece of the chapter split (index 0), i.e. the text before the first chapter delimiter.
tchapters[0]

['Dragonfly',
 'in',
 'Amber',
 'Voyager',
 'Drums',
 'of',
 'Autumn',
 'The',
 'Fiery',
 'Cross',
 'A',
 'Breath',
 'of',
 'Snow',
 'and',
 'Ashes',
 'An',
 'Echo',
 'in',
 'the',
 'Bone',
 'Written',
 'in',
 'My',
 'Own',
 'Heart’s',
 'Blood',
 'Other',
 'Books',
 'by',
 'this',
 'Author',
 'About',
 'the',
 'Author',
 'OUTLANDER',
 'A',
 'Delta',
 'Book',
 'PUBLISHING',
 'HISTORY',
 'Delacorte',
 'Press',
 'hardcover',
 'edition',
 'published',
 'Delta',
 'trade',
 'paperback',
 'edition',
 'July',
 'Published',
 'by',
 'Bantam',
 'Dell',
 'A',
 'Division',
 'of',
 'Random',
 'House',
 'Inc',
 'New',
 'York',
 'New',
 'York',
 'All',
 'rights',
 'reserved',
 'Copyright',
 'by',
 'Diana',
 'Gabaldon',
 'Title',
 'page',
 'art',
 'copyright',
 'by',
 'Barbara',
 'Schnell',
 'Library',
 'of',
 'Congress',
 'Catalog',
 'Card',
 'Number',
 'No',
 'part',
 'of',
 'this',
 'book',
 'may',
 'be',
 'reproduced',
 'or',
 'transmitted',
 'in',
 'any',
 'form',
 'or',
 'by',
 'any',
 'means',
 

In [181]:
# Print the lemma of every token in the first entry of tchapters.
lemmatizer = WordNetLemmatizer()
for entry in tchapters[0]:
    # An entry is either a single token (flat chapter) or a list of tokens
    # (one paragraph); normalise to a list so lemmatize() always gets a string.
    tokens = entry if isinstance(entry, list) else [entry]
    for token in tokens:
        print(lemmatizer.lemmatize(token))
    
    

Dragonfly
in
Amber
Voyager
Drums
of
Autumn
The
Fiery
Cross
A
Breath
of
Snow
and
Ashes
An
Echo
in
the
Bone
Written
in
My
Own
Heart’s
Blood
Other
Books
by
this
Author
About
the
Author
OUTLANDER
A
Delta
Book
PUBLISHING
HISTORY
Delacorte
Press
hardcover
edition
published
Delta
trade
paperback
edition
July
Published
by
Bantam
Dell
A
Division
of
Random
House
Inc
New
York
New
York
All
right
reserved
Copyright
by
Diana
Gabaldon
Title
page
art
copyright
by
Barbara
Schnell
Library
of
Congress
Catalog
Card
Number
No
part
of
this
book
may
be
reproduced
or
transmitted
in
any
form
or
by
any
mean
electronic
or
mechanical
including
photocopying
recording
or
by
any
information
storage
and
retrieval
system
without
the
written
permission
of
the
publisher
except
where
permitted
by
law
Delta
is
a
registered
trademark
of
Random
House
Inc
and
the
colophon
is
a
trademark
of
Random
House
Inc
Please
visit
our
website
at
www
bantamdell
com
eISBN
v
r
Contents
Master
Table
of
Contents
Outlander
Title
page
Copyrigh