In [None]:
!pip install spacy

In [None]:
pip install pytextrank

In [None]:
# Download spaCy's small English pipeline; it is loaded below with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

In [4]:
import spacy
from spacy.matcher import Matcher
from collections import Counter

from icecream import ic

import pytextrank

import re

import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel

import numpy as np

import pyLDAvis
import pyLDAvis.gensim_models

In [5]:
# Silence DeprecationWarning messages so they do not clutter the cell outputs.
import warnings
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [6]:
# Load spaCy's small English pipeline; the cells below use this `nlp` object.
nlp = spacy.load(name="en_core_web_sm")

In [None]:
# Register PyTextRank's "textrank" component on the spaCy pipeline.
# The guard makes the cell safe to re-run: adding a component whose name is already
# in the pipeline raises an error.
# NOTE(review): the stopwords config presumably maps a lemma to the POS tags under which
# it is ignored, so this only excludes the lemma "word" used as a NOUN. It looks like the
# PyTextRank documentation example - confirm that this is what the analysis wants.
if "textrank" not in nlp.pipe_names:
    nlp.add_pipe("textrank", config={ "stopwords": { "word": ["NOUN"] } })

In [8]:
# Read the whole book into memory as a single string
with open("PsycOfTheUncon.txt", mode="r", encoding="utf8") as book_file:
    text = book_file.read()

In [9]:
# Clean the text
text = text.replace('\n', ' ')   # turn every line break into a space
text = text.replace('_',' ')     # turn every underscore into a space
# Remove bracketed numeric markers such as "[12]".
# Raw string: "\[" and "\d" are not valid escapes in a normal string literal and
# trigger an invalid-escape-sequence warning when the cell is compiled.
text = re.sub(r"\[\d+\]", '', text)

In [10]:
# Extra words to treat as stop words, on top of spaCy's built-in list
my_stop_words = ['mr', 'Mr', 'Miss', 'great', 'like']

# Flag each one on its vocabulary entry
for extra_word in my_stop_words:
    nlp.vocab[extra_word].is_stop = True

In [11]:
# Run the cleaned book through the spaCy pipeline once; the resulting `doc` is reused by the cells below.
doc = nlp(text)

# Look at entities in the text

In [None]:
# Browse the named entities for anything interesting.
# DATE, CARDINAL and TIME entities are skipped.
for ent in doc.ents:
    if ent.label_ in ('DATE', 'CARDINAL', 'TIME'):
        continue
    print(ent.text, ent.label_)

In [None]:
# 'Christian' showed up a lot above, so list only the entities labelled 'NORP'
for ent in filter(lambda entity: entity.label_ == 'NORP', doc.ents):
    print(ent.text, ent.label_)

In [14]:
# Count the text of every 'NORP' entity and keep the 10 most frequent ones
norps = []
for ent in doc.ents:
    if ent.label_ == 'NORP':
        norps.append(ent.text)
norp_freq = Counter(norps)

common_norps = norp_freq.most_common(10)
common_norps

[('Christian', 78),
 ('Egyptian', 32),
 ('Christianity', 23),
 ('Indian', 19),
 ('Latin', 19),
 ('German', 18),
 ('Greek', 17),
 ('Freudian', 11),
 ('Roman', 9),
 ('Eleusinian', 9)]

In [15]:
# mini-conclusion: the text probably has a lot to do with Christianity and ancient civilizations, as one may conclude from words such as
# 'Egyptian', 'Indian', 'Greek', 'Roman', 'Eleusinian'.

# Try Summarising

In [16]:
# try summarising the text using PyTextRank

# separate the text by chapter (13 in total, introduction included)
# A chunk is closed right after each 'CHAPTER' token, so that heading word stays at the
# end of the chunk that precedes the chapter it announces.
# Token texts are collected in a list and joined once per chunk; concatenating onto one
# growing string for every token copies that string each time (quadratic work).
texts, article_words = [], []

for word in doc:
    article_words.append(word.text)
    if word.text == 'CHAPTER':
        # every token is preceded by a single space, so a non-empty chunk starts with " "
        texts.append("".join(" " + token_text for token_text in article_words))
        article_words = []

# whatever follows the last 'CHAPTER' token forms the final chunk
article = "".join(" " + token_text for token_text in article_words)
texts.append(article)

In [17]:
# Parse every chapter chunk on its own so that each gets its own phrase ranking
doclist = [nlp(chapter_text) for chapter_text in texts]

# Print the ten highest-ranked phrases per chapter: rank, count, text, then the matching chunks
for subdoc in doclist:
    top_phrases = subdoc._.phrases[:10]
    for p in top_phrases:
        print('{:.4f} {:5d}  {}'.format(p.rank, p.count, p.text))
        print(p.chunks)
    print("--------------------------------")

0.0743     1  individual psychologic problems
[individual psychologic problems]
0.0632     1  historical material
[historical material]
0.0631     1  historical problems
[historical problems]
0.0627     1  individual psychology
[individual psychology]
0.0616     1  specific historical material
[specific historical material]
0.0512     5  Freud
[Freud, Freud, Freud, Freud, Freud]
0.0508     1  Oedipus
[Oedipus]
0.0504     1  unconscious work
[unconscious work]
0.0471     1  new light
[new light]
0.0460     1  psychoanalytic knowledge
[psychoanalytic knowledge]
--------------------------------
0.0578     9  directed thinking
[directed thinking, directed thinking, directed thinking, directed thinking, directed thinking, directed thinking, directed thinking, directed thinking, directed thinking]
0.0559     1  dream psychology
[dream psychology]
0.0550     1  Dreams
[Dreams]
0.0550    13  dreams
[dreams, dreams, dreams, dreams, dreams, dreams, dreams, dreams, dreams, dreams, dreams, dreams,

In [18]:
# Summarise every chapter with its 3 top sentences (based on the 15 top-ranked phrases).
# The loop variable must stay named `sent`: icecream prints the argument expression.
for subdoc in doclist:
    chapter_summary = subdoc._.textrank.summary(limit_phrases=15, limit_sentences=3)
    for sent in chapter_summary:
        ic(sent)
    ic("--------------------------------")

ic| sent: The leading purpose of these works is the unlocking of historical problems through the application of psychoanalytic knowledge ; that is to say , knowledge drawn from the activity of the modern unconscious mind concerning specific historical material .
ic| sent: For , just as the psychoanalytic conceptions promote understanding of the historic psychologic creations , so reversedly historical materials can shed new light upon individual psychologic problems .
ic| sent: It seems to me , however , that in the present state of affairs there is a more or less imperative demand for the psychoanalyst to broaden the analysis of the individual problems by a comparative study of historical material relating to them , just as Freud has already done in a masterly manner in his book on “ Leonardo da Vinci . ”
ic| '--------------------------------'
ic| sent: While directed thinking is a phenomenon conscious throughout , the same can not be asserted of phantastic thinking .
ic| sent: By mea

In [19]:
# mini-conclusion: as summarised by the top 3 sentences of the introduction (the first 3 sentences above), the text
# seems to focus on the relation between historical material and the psychology of the individual, and probably on how they have
# influenced each other since ancient times, which may explain the top 'NORP' words from the previous section. The text seems to discuss
# religions and ancient myths in detail, as well as their symbolic meanings, especially with regard to libido and the Mother.

# Try Topic Modeling

In [20]:
# Topic modeling input for gensim/LDA: one list of lemmas per chapter.

# Start with one open chapter; a new one is opened right after each 'CHAPTER' token.
texts = [[]]

for word in doc:
    # keep the lemma unless the token is whitespace, a verb, a stop word, punctuation or number-like
    if not (word.pos_ in ('SPACE', 'VERB') or word.is_stop or word.is_punct or word.like_num):
        texts[-1].append(word.lemma_)

    if word.text == 'CHAPTER':
        texts.append([])

# `article` ends up as the last chapter's token list
article = texts[-1]

In [21]:
# Build the gensim vocabulary from the per-chapter token lists,
# then encode every chapter as a bag-of-words vector.
dictionary = Dictionary(documents=texts)
corpus = [dictionary.doc2bow(chapter_tokens) for chapter_tokens in texts]

In [22]:
# Fit a 10-topic LDA model on the per-chapter bag-of-words corpus.
# random_state pins the random initialisation, so re-running the notebook reproduces the
# same topics; without it every run yields different ones.
lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, random_state=42)
lda_model.show_topics()

[(0,
  '0.010*"mother" + 0.008*"libido" + 0.004*"world" + 0.004*"man" + 0.004*"sun" + 0.004*"symbol" + 0.004*"death" + 0.004*"god" + 0.004*"life" + 0.004*"time"'),
 (1,
  '0.017*"mother" + 0.010*"libido" + 0.006*"life" + 0.005*"symbol" + 0.005*"sun" + 0.005*"world" + 0.005*"man" + 0.004*"death" + 0.004*"God" + 0.004*"god"'),
 (2,
  '0.008*"mother" + 0.007*"libido" + 0.005*"world" + 0.005*"symbol" + 0.004*"man" + 0.004*"sun" + 0.004*"God" + 0.004*"time" + 0.004*"fire" + 0.004*"life"'),
 (3,
  '0.010*"libido" + 0.010*"mother" + 0.006*"sun" + 0.005*"man" + 0.005*"life" + 0.004*"God" + 0.004*"hero" + 0.004*"world" + 0.004*"sexual" + 0.003*"time"'),
 (4,
  '0.013*"mother" + 0.006*"libido" + 0.005*"man" + 0.005*"world" + 0.004*"life" + 0.004*"time" + 0.004*"death" + 0.004*"tree" + 0.004*"God" + 0.004*"hero"'),
 (5,
  '0.011*"libido" + 0.011*"mother" + 0.006*"man" + 0.006*"world" + 0.005*"life" + 0.004*"time" + 0.004*"hero" + 0.004*"God" + 0.004*"symbol" + 0.004*"god"'),
 (6,
  '0.009*"mother

In [23]:
# Render the fitted topics as an interactive pyLDAvis view inside the notebook
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary)

In [24]:
# Try Bigram
# Train a phrase model on the chapter token lists, then run every chapter through it
bigram = gensim.models.Phrases(sentences=texts, min_count=5, threshold=100)
textsbi = [bigram[chapter_tokens] for chapter_tokens in texts]

In [25]:
# Vocabulary and bag-of-words corpus for the bigram-processed chapters
dictionarybi = Dictionary(documents=textsbi)
corpusbi = [dictionarybi.doc2bow(chapter_tokens) for chapter_tokens in textsbi]

In [None]:
# Fit a 10-topic LDA model on the bigram-processed corpus.
# random_state pins the random initialisation, so re-running the notebook reproduces the
# same topics; without it every run yields different ones.
lda_model_bi = LdaModel(corpus=corpusbi, num_topics=10, id2word=dictionarybi, random_state=42)
lda_model_bi.show_topics()

In [27]:
# Interactive pyLDAvis view of the bigram-based topic model
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(topic_model=lda_model_bi, corpus=corpusbi, dictionary=dictionarybi)

In [28]:
# mini-conclusion: there isn't much new information; the topics are almost identical to one another, each with a high proportion of 'mother' and 'libido'.

# A bit deeper with 'mother'

In [29]:
# In the topic modeling section above the word 'mother' carried a lot of weight, so it
# probably has significant connections to different aspects of the text.

# Examine the contexts in which the word 'mother' appears: one or more 'mother' tokens,
# surrounded by any run of tokens whose text matches \w+.
matcher = Matcher(nlp.vocab)
# Raw strings for the regexes: '\w' is not a valid escape in a normal string literal and
# triggers an invalid-escape-sequence warning when the cell is compiled.
pattern = [
    {'TEXT': {'REGEX': r'\w+'}, 'OP': '*'},
    {'TEXT': 'mother', "OP": "+"}, 
    {'TEXT': {'REGEX': r'\w+'}, 'OP': '*'}
    ]
# greedy="LONGEST" asks the matcher for the longest of overlapping matches
matcher.add("matching", [pattern], greedy="LONGEST")
matches = matcher(doc)
# each match is a (match_id, start, end) tuple; order them by start token index
matches.sort(key = lambda x: x[1])

# show the first ten contexts together with their raw match tuples
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

(1221037237276548748, 639, 646) that the mother can be the all
(1221037237276548748, 1724, 1733) We know the dream of the mother of Augustus
(1221037237276548748, 6526, 6534) is the mother of the modern scientific attitude
(1221037237276548748, 9278, 9291) as in Grimm’s Fairy Tales which the mother tells to her children
(1221037237276548748, 15419, 15431) It seemed to me as if my mother’s voice wakened me
(1221037237276548748, 18340, 18347) read aloud to me by my mother
(1221037237276548748, 18702, 18706) mother of uncounted peoples
(1221037237276548748, 21613, 21630) it seems to be the influence of the mother transference which creates the attributes of the divinity
(1221037237276548748, 30151, 30172) At the very first and in foremost position it was father and mother who were the objects of the childish love
(1221037237276548748, 30212, 30219) mother imago is organized into a system


In [30]:
# The contexts above show 'mother' used with different meanings: the mother of a human
# child, mother earth, mothers in symbolism and mothers in myths.

# Look for names of mothers: 'mother' followed by a capitalised word
pattern = r"mother [A-Z]\w+"
matches = re.compile(pattern).finditer(text)
for match in matches:
    print(match)

<re.Match object; span=(190660, 190670), match='mother Eve'>
<re.Match object; span=(317947, 317959), match='mother Earth'>
<re.Match object; span=(416042, 416053), match='mother Isis'>
<re.Match object; span=(418757, 418768), match='mother Isis'>
<re.Match object; span=(570407, 570421), match='mother Wenonah'>
<re.Match object; span=(713652, 713665), match='mother Tiâmat'>


In [31]:
# mini-conclusion: the multiplicity of interpretations of the word may be one of the factors that contribute to the high frequency of 'mother'.