# Getting started

In [3]:
import re
import string
from collections import Counter

import nltk

In [7]:
# Read the preprocessed text of "A Scandal in Bohemia" into memory.
with open("dataset/processed/stories/1_the_adventures_of_sherlock_holmes/a_scandal_in_bohemia.txt") as fp:
    data = fp.read()

# Extra cleanup on top of the preprocessed input: turn every line break into
# a space, then squeeze each run of spaces down to a single space.
# TODO: move these steps into the preprocessing stage
data = re.sub(r" +", " ", data.replace("\n", " "))

## tokenize

In [8]:
# Split the cleaned text into sentences; `sent` is indexed by sentence position.
# TODO: recognize quotes as direct speech
# TODO: "Sherlock Holmes" is split into two tokens
sent = nltk.tokenize.sent_tokenize(data)

In [12]:
# Pick one sentence as the running example and split it into word tokens.
sentence_id = 3
words = nltk.tokenize.word_tokenize(sent[sentence_id])

# Show the raw sentence, then its tokens on the next line. Punctuation is kept
# as separate tokens; to drop it, filter with `word not in string.punctuation`.
print(sent[sentence_id], words, sep="\n")

It was not that he felt any emotion akin to love for Irene Adler.
['It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler', '.']


## pos tagger

In [13]:
# Attach a part-of-speech tag to every token of the example sentence.
# Uncomment to print the meaning of each tag: nltk.help.upenn_tagset()
tagged = nltk.pos_tag(words)

In [14]:
# Display the (token, tag) pairs produced by the tagger.
tagged

[('It', 'PRP'),
 ('was', 'VBD'),
 ('not', 'RB'),
 ('that', 'IN'),
 ('he', 'PRP'),
 ('felt', 'VBD'),
 ('any', 'DT'),
 ('emotion', 'NN'),
 ('akin', 'NN'),
 ('to', 'TO'),
 ('love', 'VB'),
 ('for', 'IN'),
 ('Irene', 'NNP'),
 ('Adler', 'NNP'),
 ('.', '.')]

## named entity recognition

In [29]:
# Group the tagged tokens into named-entity chunks. The result is a tree in
# which recognized entities become labelled subtrees (e.g. PERSON for
# "Irene Adler"); all other tokens stay as plain (word, tag) leaves.
entities = nltk.chunk.ne_chunk(tagged)
print(entities)

(S
  It/PRP
  was/VBD
  not/RB
  that/IN
  he/PRP
  felt/VBD
  any/DT
  emotion/NN
  akin/NN
  to/TO
  love/VB
  for/IN
  (PERSON Irene/NNP Adler/NNP)
  ./.)


In [30]:
# Strip the POS tags from the chunk tree while keeping the entity grouping.
# Entity subtrees keep their label but hold bare words instead of (word, tag)
# pairs; every other element is a (word, tag) leaf reduced to its word.
# https://stackoverflow.com/questions/44237087/ne-chunk-without-pos-tag-in-nltk
simple = [
    nltk.Tree(node.label(), [token for token, _ in node])
    if isinstance(node, nltk.Tree)
    else node[0]
    for node in entities
]

simple

['It',
 'was',
 'not',
 'that',
 'he',
 'felt',
 'any',
 'emotion',
 'akin',
 'to',
 'love',
 'for',
 Tree('PERSON', ['Irene', 'Adler']),
 '.']

## tf-idf

## vocabulary

In [41]:
# TODO: stemming
# TODO: lemmatization

# Rudimentary version: tokenize the whole text and treat every distinct token
# as a vocabulary entry. Tokens are compared case-sensitively and punctuation
# marks are tokens too, so both counts below include punctuation and case
# variants such as "the" / "The".
wordlist = nltk.word_tokenize(data)
unique_words = set(wordlist)

print(f"{len(wordlist)} total words contain {len(unique_words)} unique words.")

51676 total words contain 6232 unique words.


In [42]:
# slightly better
# https://programminghistorian.org/en/lessons/counting-frequencies

# Count every token in a single pass over the text. Calling wordlist.count()
# once per token would rescan the whole list each time, which is quadratic in
# the number of tokens. Counter keeps keys in first-occurrence order; it is
# converted to a plain dict so freqdict stays an ordinary token -> count map.
freqdict = dict(Counter(wordlist))

# Frequency of the token at each position of wordlist (parallel to wordlist).
wordfreq = [freqdict[token] for token in wordlist]

In [45]:
# Rank the tokens by frequency as (count, token) pairs, highest count first.
# Pairs that share a count fall in descending token order.
aux = sorted(((count, token) for token, count in freqdict.items()), reverse=True)

In [51]:
# The 100 most frequent tokens, as (count, token) pairs.
# TODO: needs cleaning (punctuation and quote marks are counted as tokens)
# TODO: needs stopword removal
# TODO: needs merging of word forms: stemming, lemmatization...
aux[:100]

[(2959, ','),
 (2406, '.'),
 (2327, 'the'),
 (1322, 'and'),
 (1204, 'of'),
 (1076, 'to'),
 (963, 'a'),
 (940, 'I'),
 (886, '``'),
 (811, "''"),
 (674, 'in'),
 (649, 'was'),
 (630, 'he'),
 (619, 'that'),
 (613, 'his'),
 (471, 'had'),
 (454, 'it'),
 (369, 'you'),
 (315, 'which'),
 (313, 'with'),
 (303, 'for'),
 (300, 'as'),
 (291, 'is'),
 (289, 'at'),
 (278, 'have'),
 (274, 'my'),
 (271, 'him'),
 (249, 'be'),
 (212, 'me'),
 (208, 'The'),
 (208, '?'),
 (207, 'said'),
 (195, 'upon'),
 (193, "'s"),
 (192, 'on'),
 (179, 'this'),
 (179, 'not'),
 (175, 'from'),
 (173, 'but'),
 (173, 'He'),
 (172, 'all'),
 (169, 'were'),
 (164, 'her'),
 (160, 'there'),
 (155, 'man'),
 (151, 'by'),
 (149, 'one'),
 (147, 'been'),
 (142, 'we'),
 (142, 'them'),
 (142, 'no'),
 (139, 'so'),
 (139, 'It'),
 (135, "'"),
 (133, 'are'),
 (132, 'they'),
 (130, 'up'),
 (128, 'an'),
 (125, '--'),
 (119, 'out'),
 (118, 'would'),
 (115, 'or'),
 (112, 'who'),
 (107, 'do'),
 (104, ';'),
 (101, 'some'),
 (100, 'when'),
 (99, 'int