## IRTM Outlander Analysis
By Simona Vychytilova and Meike Thijsen



In [36]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
import re

import numpy as np
import pandas as pd
import os
nltk.download('averaged_perceptron_tagger')
from nltk import sent_tokenize
from nltk import defaultdict
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.chunk import ne_chunk

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\meike\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\meike\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\meike\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Preprocessing

#### Bookwise preprocessing

In [37]:
nltk.__version__

'3.4.4'

In [38]:
books = ['Outlander', 'Dragonfly in Amber', 'Voyager', 'Drums of Autumn', 'The Fiery Cross', 'A Breath of Snow and Ashes',
         'An Echo in the Bone', 'Written in My Own Heart’s Blood']
extras = ['Other Books by this Author', 'About the Author']
bookstarts = [50, 17287, 37378, 61857, 89432, 119494, 152540, 177800, 202059]

In [39]:
f = open("data.txt", "r", encoding="utf8")
booknum = 1


book = ''
i = 0
for line in f:
    if i < bookstarts[booknum] and i > bookstarts[booknum-1]:
        book = book + line
    i = i + 1

In [40]:
# we want to segment the data into chapters as we want to see the topic per chapter. But how do we do that?
# the author devided the books into parts, this is how we can see when hte first chapter begins, then we search for a line
# that is written in all caps, after this the chapter starts: example: \n\n\n\n\n\n1\n\nA NEW BEGINNING\n\n
# 
# book 1 has 41, 49, 63, 71, 111, 124, 103, 145
# found is 41, 49, 63, 71, 111, 124, 0, 0 
def split_into_chapters(booknum, text):
    if booknum <7:
        return re.split(r'\n[0-9]+\n\n', book)
    else:
        # needs refinement
        return re.split(r'[A-Z ]+\n\n', book)

In [41]:
def split_into_paragraphs(text):
    splitted = text.split('\n')
    return [x for x in splitted if x != '']

In [42]:
# print(split_into_paragraphs(split_into_chapters(1, book)[4]))
sentences = split_into_paragraphs(split_into_chapters(1, book)[2])
sentences


['STANDING STONES',
 'Mr. Crook called for me, as arranged, promptly at seven the next morning.',
 '“So as we’ll catch the dew on the buttercups, eh, lass?” he said, twinkling with elderly gallantry. He had brought a motorcycle of his own approximate vintage, on which to transport us into the countryside. The plant presses were tidily strapped to the sides of this enormous machine, like bumpers on a tugboat. It was a leisurely ramble through the quiet countryside, made all the more quiet by contrast with the thunderous roar of Mr. Crook’s cycle, suddenly throttled into silence. The old man did indeed know a lot about the local plants, I discovered. Not only where they were to be found but their medicinal uses, and how to prepare them. I wished I had brought a notebook to get it all down, but listened intently to the cracked old voice, and did my best to commit the information to memory as I stowed our specimens in the heavy plant presses.',
 'We stopped for a packed luncheon near the b

#### Linguistic Preprocessing

In [53]:
# tokenizer
per_paragraph = True

tokenizer = RegexpTokenizer(r"[a-zA-Z’]+")
chapters = split_into_chapters(booknum, book)
tchapters = []
for chapter in chapters:
    tparagraphs = []
    if per_paragraph:
        paragraphs = split_into_paragraphs(chapter)
        for paragraph in paragraphs:
            tparagraphs.append(tokenizer.tokenize(paragraph))
        tchapters.append(tparagraphs)
    else:
        tchapters.append(tokenizer.tokenize(chapter))

In [57]:
chapternum = 0
lemmatizer=WordNetLemmatizer()
if not per_paragraph:
    for word in tchapters[chapternum]:
        print(lemmatizer.lemmatize(word))
else:
    for paragraph in tchapters[chapternum]:
        for word in paragraph:
            print(lemmatizer.lemmatize(word))

Dragonfly
in
Amber
Voyager
Drums
of
Autumn
The
Fiery
Cross
A
Breath
of
Snow
and
Ashes
An
Echo
in
the
Bone
Written
in
My
Own
Heart’s
Blood
Other
Books
by
this
Author
About
the
Author
OUTLANDER
A
Delta
Book
PUBLISHING
HISTORY
Delacorte
Press
hardcover
edition
published
Delta
trade
paperback
edition
July
Published
by
Bantam
Dell
A
Division
of
Random
House
Inc
New
York
New
York
All
right
reserved
Copyright
by
Diana
Gabaldon
Title
page
art
copyright
by
Barbara
Schnell
Library
of
Congress
Catalog
Card
Number
No
part
of
this
book
may
be
reproduced
or
transmitted
in
any
form
or
by
any
mean
electronic
or
mechanical
including
photocopying
recording
or
by
any
information
storage
and
retrieval
system
without
the
written
permission
of
the
publisher
except
where
permitted
by
law
Delta
is
a
registered
trademark
of
Random
House
Inc
and
the
colophon
is
a
trademark
of
Random
House
Inc
Please
visit
our
website
at
www
bantamdell
com
eISBN
v
r
Contents
Master
Table
of
Contents
Outlander
Title
page
Copyrigh

In [85]:
def chunk_sentences(sentences):
    # Tokenize each sentence into words: token_sentences
    token_sentences = [word_tokenize(sent) for sent in sentences]

    # Tag each tokenized sentence into parts of speech: pos_sentences
    pos_sentences = [nltk.pos_tag(sent) for sent in token_sentences]

    # Create the named entity chunks: chunked_sentences
    chunked_sentences = nltk.ne_chunk_sents(pos_sentences, binary=False)
    
    return chunked_sentences

In [92]:
def count_chunks(chunked_sentences):
    person = {}
    gpe = {}
    organization = {}
    location = {}

    # Test for stems of the tree with 'NE' tags
    for sent in chunked_sentences:
        for chunk in sent:
            if hasattr(chunk, "label"):
                if chunk.label() == "PERSON":
                    if chunk.leaves()[0][0] in person:
                        person[chunk.leaves()[0][0]] += 1
                    else:
                        person[chunk.leaves()[0][0]] = 1
                elif chunk.label() == "GPE":
                    if chunk.leaves()[0][0] in gpe:
                        gpe[chunk.leaves()[0][0]] += 1
                    else:
                        gpe[chunk.leaves()[0][0]] = 1
                elif chunk.label() == 'ORGANIZATION':
                    if chunk.leaves()[0][0] in organization:
                        organization[chunk.leaves()[0][0]] += 1
                    else:
                        organization[chunk.leaves()[0][0]] = 1
                elif chunk.label() == 'LOCATION':
                    if chunk.leaves()[0][0] in location:
                        location[chunk.leaves()[0][0]] += 1
                    else:
                        location[chunk.leaves()[0][0]] = 1
                else:
                    print(chunk.label())
    return person, gpe, organization, location

In [93]:
chunked_sentences = chunk_sentences(sentences)
person, gpe, organization, location = count_chunks(chunked_sentences)