In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
with open('CoRef-TheDaVinciCode.txt', 'r') as f:
    data = f.read()
    data = data.replace("\n", " ")
    data = data.replace("\t", " ")
    data = data.replace("\r", " ")
    data = data.replace("  ", " ")
    data = data.replace("   ", " ")
    data = data.replace("    ", " ")
    data = data.replace("     ", " ")
    ' '.join(data.split())
    data = re.sub('/\s+/', ' ', data)
f.close()

In [3]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [5]:
# Remove new line characters
data = re.sub('\s+', ' ', data)

# Remove distracting single quotes
data = re.sub("\'", "", data)

In [7]:
def sent_to_words(sentences):
    yield(gensim.utils.simple_preprocess(str(sentences), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])



In [8]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])



In [9]:
nlp = spacy.load("en_core_web_sm")

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [13]:
from nltk.stem import WordNetLemmatizer
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
data_words_bigrams

[['chapter',
  'da_vinci',
  'code',
  'dan',
  'brown',
  'prologue',
  'louvre',
  'museum',
  'paris',
  'renowned_curator',
  'jacques_sauniere',
  'staggered',
  'vaulted_archway',
  'louvre',
  'museums',
  'grand_gallery',
  'renowned_curator',
  'jacques_sauniere',
  'lunged',
  'nearest',
  'painting',
  'could',
  'see',
  'caravaggio',
  'grabbing',
  'gilded',
  'frame',
  'renowned_curator',
  'jacques_sauniere',
  'heaved',
  'nearest',
  'painting',
  'could',
  'see',
  'caravaggio',
  'toward',
  'renowned_curator',
  'jacques_sauniere',
  'nearest',
  'painting',
  'could',
  'see',
  'caravaggio',
  'tore',
  'wall',
  'renowned_curator',
  'jacques_sauniere',
  'collapsed',
  'backward',
  'heap',
  'beneath',
  'canvas',
  'renowned_curator',
  'jacques_sauniere',
  'anticipated',
  'thundering',
  'iron',
  'gate',
  'fell',
  'nearby',
  'barricading',
  'entrance',
  'suite',
  'parquet',
  'floor',
  'shook',
  'far',
  'alarm',
  'began',
  'ring',
  'renowned

In [24]:
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
data_lemmatized = [lemmatizer.lemmatize(x) for y in data_words_bigrams for x in y]
# lemmatize data_words_bigrams
# data_lemmatized = lemmatizer.lemmatize(data_words_bigrams)

print(data_lemmatized)



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kristianvankuijk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
data_lemmatized = [data_lemmatized]
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 3), (1, 1), (2, 4), (3, 1), (4, 7), (5, 1), (6, 1), (7, 1), (8, 7), (9, 2), (10, 14), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 4), (18, 1), (19, 5), (20, 1), (21, 4), (22, 2), (23, 6), (24, 1), (25, 2), (26, 1), (27, 1), (28, 3), (29, 2), (30, 5), (31, 2), (32, 1), (33, 3), (34, 1), (35, 1), (36, 13), (37, 1), (38, 6), (39, 6), (40, 4), (41, 15), (42, 4), (43, 2), (44, 2), (45, 6), (46, 1), (47, 2), (48, 7), (49, 1), (50, 20), (51, 1), (52, 6), (53, 1), (54, 6), (55, 1), (56, 1), (57, 2), (58, 1), (59, 1), (60, 5), (61, 1), (62, 12), (63, 6), (64, 2), (65, 1), (66, 1), (67, 2), (68, 18), (69, 24), (70, 30), (71, 1), (72, 1), (73, 3), (74, 4), (75, 4), (76, 1), (77, 5), (78, 4), (79, 3), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 4), (87, 1), (88, 121), (89, 20), (90, 1), (91, 2), (92, 16), (93, 3), (94, 8), (95, 3), (96, 3), (97, 1), (98, 56), (99, 2), (100, 2), (101, 4), (102, 13), (103, 20), (104, 6), (105, 4), (106, 1), (107, 1), (108, 2), (10

In [30]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('aah', 3),
  ('abandon', 1),
  ('abandoned', 4),
  ('abbe', 1),
  ('abbey', 7),
  ('abbreviated', 1),
  ('abdomen', 1),
  ('abided', 1),
  ('ability', 7),
  ('ablaze', 2),
  ('able', 14),
  ('aboard', 1),
  ('abolish', 1),
  ('abounded', 1),
  ('abracadabra', 1),
  ('abreast', 2),
  ('abrupt', 1),
  ('abruptly', 4),
  ('absenle', 1),
  ('absent', 5),
  ('absente', 1),
  ('absently', 4),
  ('absolute', 2),
  ('absolutely', 6),
  ('absolution', 1),
  ('absorb', 2),
  ('abstinence', 1),
  ('abstractedly', 1),
  ('absurd', 3),
  ('absurdity', 2),
  ('absurdly', 5),
  ('abu', 2),
  ('abundance', 1),
  ('abuse', 3),
  ('abused', 1),
  ('abyss', 1),
  ('academic', 13),
  ('accelerate', 1),
  ('accelerated', 6),
  ('accelerating', 6),
  ('accelerator', 4),
  ('accent', 15),
  ('accented', 4),
  ('accentuated', 2),
  ('accentuating', 2),
  ('accept', 6),
  ('acceptable', 1),
  ('acceptance', 2),
  ('accepted', 7),
  ('accepting', 1),
  ('access', 20),
  ('accessed', 1),
  ('accessible', 6),


In [31]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [32]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.002*"langdon" + 0.001*"sophie" + 0.001*"robert" + 0.001*"silas" + '
  '0.001*"langdons" + 0.001*"said" + 0.001*"neveu" + 0.000*"one" + '
  '0.000*"teabings" + 0.000*"sophies"'),
 (1,
  '0.002*"langdon" + 0.001*"sophie" + 0.001*"robert" + 0.001*"silas" + '
  '0.000*"grandfather" + 0.000*"teabing" + 0.000*"one" + 0.000*"langdons" + '
  '0.000*"sophies" + 0.000*"said"'),
 (2,
  '0.003*"langdon" + 0.002*"sophie" + 0.001*"robert" + 0.001*"silas" + '
  '0.001*"teabings" + 0.001*"said" + 0.001*"grandfather" + 0.001*"langdons" + '
  '0.001*"teabing" + 0.001*"sophies"'),
 (3,
  '0.003*"langdon" + 0.002*"sophie" + 0.001*"robert" + 0.001*"langdons" + '
  '0.001*"teabing" + 0.001*"grandfather" + 0.001*"silas" + 0.001*"sophies" + '
  '0.001*"said" + 0.001*"teabings"'),
 (4,
  '0.004*"langdon" + 0.003*"sophie" + 0.001*"robert" + 0.001*"sophies" + '
  '0.001*"silas" + 0.001*"langdons" + 0.001*"neveu" + 0.001*"teabings" + '
  '0.001*"grandfather" + 0.001*"one"'),
 (5,
  '0.001*"sophie" + 0.0

In [34]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis