# Imports

In [1]:
from collections import Counter
import os
import os.path as osp
import re

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import tqdm

from utils.config import config

# 1. Load data

In [2]:
fpath = osp.join(config.data_dir, "task5", "alice.txt")
with open(fpath) as fin:
    text = fin.read()
text[:20]

'CHAPTER I.\nDown the '

In [3]:
chapters = re.split("CHAPTER", text)
for chapter in chapters:
    print(chapter[:15])


 I.
Down the Ra
 II.
The Pool o
 III.
A Caucus-
 IV.
The Rabbit
 V.
Advice from
 VI.
Pig and Pe
 VII.
A Mad Tea
 VIII.
The Quee
 IX.
The Mock T
 X.
The Lobster
 XI.
Who Stole 
 XII.
Alice’s E


# 2. Preprocessing

In [None]:
!python -m spacy download en_core_web_sm

In [5]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(text.lower())

In [6]:
for token in doc[:10]:
    print(token.text, token.is_alpha, token.is_stop, token.lemma_, token.pos_)

chapter True False chapter NOUN
i. False False i. NOUN

 False False 
 SPACE
down True True down ADP
the True True the DET
rabbit True False rabbit NOUN
- False False - PUNCT
hole True False hole NOUN



 False False 


 SPACE
alice True False alice NOUN


# 3. Find 10 most important words in each chapter

In [7]:
nlp = spacy.load("en_core_web_sm")
tokenized_chapters = []
for chapter in tqdm.tqdm(chapters):
    doc = nlp(chapter)
    tokens = ' '.join([token.lemma_.lower() for token in doc if (token.is_alpha and not token.is_stop and not token.text == "Alice")])
    tokenized_chapters.append(tokens)

100%|██████████| 13/13 [00:05<00:00,  2.33it/s]


In [8]:
for tokenized_chapter in tokenized_chapters:
    print(tokenized_chapter[:10])


rabbit hol
ii pool te
iii caucus
iv rabbit 
advice cat
vi pig pep
vii mad te
viii queen
ix mock tu
lobster qu
xi steal t
xii eviden


In [9]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(tokenized_chapters[1:])
X = X.toarray()
X.shape

(12, 1761)

In [10]:
feature_names = np.asarray(vectorizer.get_feature_names())
for chapter_idx in range(X.shape[0]):
    top_words = np.argsort(X[chapter_idx])[::-1][:10]
    print(chapter_idx+1, feature_names[top_words])

1 ['fall' 'eat' 'think' 'little' 'bat' 'door' 'rabbit' 'key' 'go' 'way']
2 ['mouse' 'pool' 'little' 'say' 'oh' 'swam' 'cat' 'think' 'dear' 'cry']
3 ['say' 'mouse' 'dodo' 'prize' 'race' 'lory' 'dry' 'know' 'thimble' 'bird']
4 ['bill' 'little' 'window' 'rabbit' 'puppy' 'grow' 'glove' 'fan' 'say'
 'bottle']
5 ['caterpillar' 'say' 'serpent' 'pigeon' 'youth' 'egg' 'size' 'think'
 'father' 'little']
6 ['say' 'footman' 'cat' 'baby' 'mad' 'duchess' 'grin' 'wow' 'think' 'go']
7 ['hatter' 'dormouse' 'say' 'hare' 'march' 'tea' 'twinkle' 'time' 'know'
 'go']
8 ['queen' 'say' 'hedgehog' 'king' 'gardener' 'go' 'look' 'soldier' 'cat'
 'procession']
9 ['turtle' 'say' 'mock' 'gryphon' 'duchess' 'moral' 'queen' 'go' 'think'
 'school']
10 ['turtle' 'mock' 'gryphon' 'say' 'dance' 'lobster' 'soup' 'beautiful'
 'whiting' 'soo']
11 ['king' 'hatter' 'say' 'court' 'dormouse' 'witness' 'jury' 'queen'
 'officer' 'juror']
12 ['say' 'king' 'jury' 'dream' 'write' 'queen' 'sister' 'slate' 'rabbit'
 'juryman']


Let's name chapters after 10 most important words:

1. Little thinking during the fall
2. Mouse says "cat" and cries
3. Dry race with mouse and dodo
4. Bill for rabbit's puppy
5. Talk with caterpillar
6. Dutchess mad with baby
7. Tea with hatter and march hare
8. Procession of the queen
9. Moral talks with mock turtle and gryphon
10. Lobster dance in soup
11. King witness hatter in court
12. The dream of queen's court

# 4. Find the Top 10 most used verbs in sentences with Alice. What does Alice do most often?

In [11]:
doc = nlp(text)

In [12]:
verbs = []

for sent in doc.sents:
    get_verbs = False
    for token in sent:
        if token.text.lower() == "alice":
            get_verbs = True
            break
            
    if get_verbs:
        for token in sent:
            if (token.pos_ == "VERB") and token.is_alpha and (not token.is_stop):
                verbs.append(token.lemma_.lower())
                
counter = Counter(verbs)
counter.most_common(10)

[('say', 174),
 ('think', 81),
 ('go', 47),
 ('look', 44),
 ('know', 36),
 ('begin', 34),
 ('come', 31),
 ('get', 26),
 ('feel', 25),
 ('find', 23)]

Most of the time Alice says, thinks, knows, or feels something. Sometimes she acts - goes and comes anywhere, looks at something or finds something.