# Years Topic Modelling Tests

This is a basic attempt to do some topic modelling for *The Years* and perhaps see where it leads.

In [17]:
import sys
import os

# import spaCy
# import spacy
# from spacy import displacy

# import gensim
from gensim import corpora, models, similarities, downloader
from gensim.utils import simple_preprocess

# import numpy
import numpy as np

# import pandas
import pandas as pd

# import nltk
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# import pyldavis
import pyLDAvis
import pyLDAvis.gensim_models

In [41]:
# corpus setup

corpus = [
  'data/1880.txt',
  'data/1891.txt',
  'data/1907.txt',
  'data/1908.txt',
  'data/1910.txt',
  'data/1911.txt',
  'data/1913.txt',
  'data/1914.txt',
  'data/1917.txt',
  'data/1918.txt',
  'data/present.txt'
]

# setting nltk resources and stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
stopwords_set = set(stopwords.words('english'))

# tokenizing the corpus
tokenized_corpus = [simple_preprocess(open(file).read()) for file in corpus]
#print(tokenized_corpus)

# removing stopwords and proper nouns
corpus_no_stopwords = [[word for word in doc if word not in stopwords_set] for doc in tokenized_corpus]

def remove_proper_nouns(doc):
  tagged = pos_tag(doc)
  #print(tagged)
  return [word for word, tag in tagged if tag != 'NNP']

filtered_corpus = [remove_proper_nouns(doc) for doc in corpus_no_stopwords]
#print(filtered_corpus)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/joshua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/joshua/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('autumn', 'NN'), ('wind', 'NN'), ('blew', 'VBD'), ('england', 'NN'), ('twitched', 'VBN'), ('leaves', 'VBZ'), ('trees', 'NNS'), ('fluttered', 'VBD'), ('spotted', 'VBN'), ('red', 'JJ'), ('yellow', 'JJ'), ('sent', 'VBD'), ('floating', 'VBG'), ('flaunting', 'VBG'), ('wide', 'JJ'), ('curves', 'NNS'), ('settled', 'VBD'), ('towns', 'NNS'), ('coming', 'VBG'), ('gusts', 'NNS'), ('round', 'VBD'), ('corners', 'NNS'), ('wind', 'VBP'), ('blew', 'JJ'), ('hat', 'RB'), ('lifted', 'VBD'), ('veil', 'JJ'), ('high', 'JJ'), ('woman', 'NN'), ('head', 'VBP'), ('money', 'NN'), ('brisk', 'JJ'), ('circulation', 'NN'), ('streets', 'NNS'), ('crowded', 'VBD'), ('upon', 'IN'), ('sloping', 'VBG'), ('desks', 'NNS'), ('offices', 'NNS'), ('near', 'IN'), ('st', 'NN'), ('paul', 'NN'), ('clerks', 'NNS'), ('paused', 'VBD'), ('pens', 'NNS'), ('ruled', 'VBN'), ('page', 'NN'), ('difficult', 'JJ'), ('work', 'NN'), ('holidays', 'NNS'), ('margate', 'VBP'), ('eastbourne', 'JJ'), ('brighton', 'NN'), ('bronzed', 'VBN'), ('tanned'

## LDA test run

In [42]:
# making a big Bag of Words
dictionary = corpora.Dictionary(filtered_corpus)
corpus_bow = [dictionary.doc2bow(doc) for doc in filtered_corpus]

# training LDA model
lda_model = models.LdaModel(corpus_bow, num_topics=10, id2word=dictionary, passes=15)
topics = lda_model.print_topics(num_words=5)

# visualizing the model
ldavis = pyLDAvis.gensim_models.prepare(lda_model, corpus_bow, dictionary)
pyLDAvis.display(ldavis)

## Other approaches?