# Years Topic Modelling Tests

This is a basic attempt at topic modelling *The Years*, to see where it leads. I'd really like to see how the published text differs from its pre-publication states, but let's take it one step at a time.

### Imports and Whatnot

In [25]:
import sys
import os
from pprint import pprint as pp

# import spaCy
# import spacy
# from spacy import displacy

# import gensim
from gensim import corpora, models, similarities, downloader
from gensim.utils import simple_preprocess

# import numpy
import numpy as np

# import pandas
import pandas as pd

# import BERTopic
from bertopic import BERTopic

# import nltk
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# import pyldavis
import pyLDAvis
import pyLDAvis.gensim_models

import nbformat

### Setting up corpus and cleaning data 

In [39]:
# Paths of the text files that make up the corpus, built from the file stems
# (ten year-named files followed by 'present'), all under data/.
corpus = [
  f'data/{stem}.txt'
  for stem in (
    '1880', '1891', '1907', '1908', '1910', '1911',
    '1913', '1914', '1917', '1918', 'present',
  )
]

# download the NLTK data packages (same three, same order), then build the
# English stopword set used when filtering the tokenized documents
for nltk_resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger'):
  nltk.download(nltk_resource)

english_stopword_list = stopwords.words('english')
default_stopwords_set = set(english_stopword_list)
# custom_stopwords = ["pargiter", "abel", "eleanor", "sara", "maggie", "renny", "north", "rose", "delia", "martin", "crosby", "milly", "kitty", "lasswade", "peggy", "said"]
# combined_stopwords = default_stopwords_set.union(custom_stopwords)

# tokenizing the corpus
def _read_text(path):
  """Return the whole contents of the text file at *path*.

  The file is opened in a ``with`` block so the handle is closed as soon as
  it has been read, and decoded explicitly as UTF-8 instead of relying on
  the platform default encoding.
  NOTE(review): assumes the data files are UTF-8 encoded -- confirm.
  """
  with open(path, encoding='utf-8') as text_file:
    return text_file.read()

# NOTE(review): simple_preprocess lowercases every token, and the proper-noun
# filter applied afterwards tags these lowercased tokens -- confirm the tagger
# still recognises names once their capitalisation is gone.
tokenized_corpus = [simple_preprocess(_read_text(path)) for path in corpus]
#print(tokenized_corpus)

# removing stopwords (proper nouns are filtered in the following step)
corpus_no_stopwords = [[word for word in doc if word not in default_stopwords_set] for doc in tokenized_corpus]

def remove_proper_nouns(doc, *, tagger=None):
  """Return the tokens of *doc* that are not tagged as proper nouns.

  Args:
    doc: list of token strings for one document.
    tagger: optional callable that takes the token list and returns
      (token, tag) pairs using Penn Treebank tags. Defaults to nltk's
      ``pos_tag``.

  Returns:
    A new list holding the tokens in their original order, without those
    tagged ``NNP`` (singular proper noun) or ``NNPS`` (plural proper noun).
  """
  if tagger is None:
    tagger = pos_tag
  # Filtering on 'NNP' alone lets plural proper nouns ('NNPS') slip through,
  # so both proper-noun tags are excluded.
  proper_noun_tags = ('NNP', 'NNPS')
  return [word for word, tag in tagger(doc) if tag not in proper_noun_tags]

# run the proper-noun filter over every stopword-free document, keeping document order
filtered_corpus = list(map(remove_proper_nouns, corpus_no_stopwords))
#print(filtered_corpus)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/joshua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/joshua/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## LDA test run

In [40]:
# making a big Bag of Words
# The Dictionary is built from the filtered token lists; doc2bow then converts
# each document into its bag-of-words representation against that dictionary.
dictionary = corpora.Dictionary(filtered_corpus)
corpus_bow = [dictionary.doc2bow(doc) for doc in filtered_corpus]

# training LDA model
# random_state seeds the model's random number generator so that re-running
# this cell is repeatable instead of producing different topics each time.
# NOTE(review): num_topics=50 is large for a corpus of 11 files -- confirm
# this is intended.
lda_model = models.LdaModel(
  corpus_bow,
  num_topics=50,
  id2word=dictionary,
  passes=15,
  random_state=42,
)
# NOTE(review): print_topics has its own num_topics argument (documented
# default 20), so this presumably lists only a subset of the 50 topics.
topics = lda_model.print_topics(num_words=5)

# visualizing the model
# display() stays the last expression so the notebook renders the result.
ldavis = pyLDAvis.gensim_models.prepare(lda_model, corpus_bow, dictionary)
pyLDAvis.display(ldavis)

## BERTopic (is there an ERNIETopic?)

In [41]:
# BERTopic is fed strings here, so each document's tokens are joined back
# together with single spaces
filtered_corpus_to_string = list(map(' '.join, filtered_corpus))
print(filtered_corpus_to_string)

# fit BERTopic on the joined documents; `topics` is rebound here and no longer
# refers to the LDA topic listing
# NOTE(review): the recorded output of this cell shows a single row, topic -1
# with Count 11, i.e. all eleven documents in BERTopic's outlier topic --
# consider fitting on smaller passages rather than whole files; confirm.
topic_model = BERTopic(calculate_probabilities=True, min_topic_size=10)
topics, probabilities = topic_model.fit_transform(filtered_corpus_to_string)

# kept as the final expression so the notebook shows the topic summary table
topic_model.get_topic_info()




Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11,-1_thought_looked_little_one,"[thought, looked, little, one, like, hand, loo...",[brilliant spring day radiant even air seemed ...
