# Topic Analysis - pyLDAvis

## Importing Libraries

In [1]:
import os
import re

import gensim
import nltk
import numpy as np
import pandas as pd
import pyLDAvis.gensim_models
import spacy
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [2]:
# NOTE(review): this runs after the imports cell above; if any of those libraries
# already imported TensorFlow, the setting may come too late to take effect — confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Suppress TensorFlow warnings

## Data Preprocessing

### Importing the Dataset

In [3]:
# Notebooks do not define __file__, so the "Books" folder is resolved against the
# kernel's working directory (the folder the notebook is run from).
current_dir = os.path.realpath(os.getcwd())
root = os.path.join(current_dir, "Books")

data = []

# Walk Books/<author>/<book>/<chapter>.txt and collect one record per chapter.
# Entries are visited in sorted order so the row order is the same on every run,
# and anything that is not a folder / .txt file (e.g. .DS_Store) is skipped
# instead of crashing os.listdir or being read as a chapter.
for author in sorted(os.listdir(root)):
    author_path = os.path.join(root, author)
    if not os.path.isdir(author_path):
        continue

    for book in sorted(os.listdir(author_path)):
        book_path = os.path.join(author_path, book)
        if not os.path.isdir(book_path):
            continue

        for chapter in sorted(os.listdir(book_path)):
            # Split off the extension rather than assuming it is exactly 4 characters
            chapter_name, extension = os.path.splitext(chapter)
            if extension.lower() != ".txt":
                continue

            chapter_path = os.path.join(book_path, chapter)

            with open(chapter_path, 'r', encoding='utf-8') as file:
                text = file.read()

            data.append({
                'Author': author,
                'Book': book,
                'Chapter': chapter_name,
                'Text': text
            })

# Explicit columns keep the expected schema even when no chapters were found
dataset = pd.DataFrame(data, columns=['Author', 'Book', 'Chapter', 'Text'])

### Splitting and Cleaning

In [4]:
# WordNet lemmatizer applied to every kept token in preprocess_text
lemmatizer = WordNetLemmatizer()
# spaCy English pipeline; preserve_names uses its part-of-speech tags to spot proper nouns
nlp = spacy.load("en_core_web_sm")

def preserve_names(text):
    """Tokenise ``text`` with spaCy, lower-casing everything except proper nouns.

    Tokens tagged ``PROPN`` keep their original casing so that names stay
    recognisable; every other token is lower-cased.

    Returns:
        A list of token strings in document order.
    """
    return [
        token.text if token.pos_ == "PROPN" else token.text.lower()
        for token in nlp(text)
    ]

def preprocess_text(text, all_stopwords):
    """Clean, tokenise, stopword-filter and lemmatise a document.

    Args:
        text: Either a raw string, which is cleaned and tokenised via
            ``preserve_names``, or an already tokenised iterable of words,
            which skips cleaning and goes straight to filtering.
        all_stopwords: Iterable of words to drop. Matching is case-sensitive
            and happens before lemmatisation.

    Returns:
        A list of lemmatised tokens with the stopwords removed.
    """
    if isinstance(text, str):
        # Clean text
        text = re.sub(r'\n', " ", text)  # Newlines
        text = re.sub(r'[^a-zA-Z\s]', " ", text)  # Punctuation and special characters
        text = re.sub(r'\s+', ' ', text).strip()  # Extra spaces
        words = preserve_names(text)
    else:
        words = text

    # Build the lookup once: membership in a set is O(1), whereas testing every
    # token against the stopword list rescans the whole list each time.
    stopword_set = frozenset(all_stopwords)

    # Apply lemmatizer to every token that survives the stopword filter
    return [lemmatizer.lemmatize(word) for word in words if word not in stopword_set]

In [5]:
# Fetch the NLTK resources needed for lemmatising and stopword filtering
nltk.download('wordnet', quiet = True)
nltk.download('stopwords', quiet = True)

# English stopwords, minus 'not' so that it is kept in the text
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

# Run the cleaning pipeline over every chapter, in row order
processed_text = [
    preprocess_text(chapter_text, all_stopwords)
    for chapter_text in dataset["Text"]
]

# Overwrite the raw chapter strings with their processed token lists
dataset['Text'] = processed_text

In [6]:
# Preview the first rows of the dataset
dataset.head()

Unnamed: 0,Author,Book,Chapter,Text
0,Brandon Sanderson,The Way of Kings,1,"[killed, Bastards, killed, sun, still, hot, di..."
1,Brandon Sanderson,The Way of Kings,10,"[nine, year, AGO, Kal, stumbled, surgery, room..."
2,Brandon Sanderson,The Way of Kings,2,"[ten, order, loved, forsaken, u, Almighty, Sha..."
3,Brandon Sanderson,The Way of Kings,3,"[man, stood, cliffside, watched, homeland, fal..."
4,Brandon Sanderson,The Way of Kings,4,"[dying, Healer, take, blood, beside, head, lin..."


## Encoding and Tokenisation

### Encoding

In [7]:
# Build the vocabulary: each distinct token across all chapters gets an integer ID
dictionary = corpora.Dictionary(dataset["Text"])

# Encode every chapter as a bag-of-words vector using that vocabulary
corpus = list(map(dictionary.doc2bow, dataset["Text"]))

## Creating and Training Model

In [8]:
num_topics = 12  # Adjust based on dataset size

# LDA training is stochastic: without a fixed random_state the topics (and every
# number derived from them further down) change on each Restart & Run All.
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15, random_state=42)

# Print top words in each topic
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.025*"said" + 0.011*"Stephen" + 0.011*"Mr" + 0.006*"Mulligan" + 0.005*"sir"')
(1, '0.011*"said" + 0.006*"Mr" + 0.005*"one" + 0.005*"Father" + 0.005*"not"')
(2, '0.012*"not" + 0.009*"said" + 0.009*"Kaladin" + 0.008*"one" + 0.005*"could"')
(3, '0.008*"said" + 0.007*"Perrin" + 0.007*"like" + 0.005*"one" + 0.004*"Mr"')
(4, '0.011*"said" + 0.008*"Perrin" + 0.006*"Kal" + 0.005*"not" + 0.005*"Slayer"')
(5, '0.000*"said" + 0.000*"not" + 0.000*"one" + 0.000*"would" + 0.000*"like"')
(6, '0.010*"not" + 0.007*"one" + 0.007*"said" + 0.005*"like" + 0.005*"would"')
(7, '0.011*"said" + 0.010*"not" + 0.007*"man" + 0.006*"Utterson" + 0.006*"one"')
(8, '0.016*"Perrin" + 0.013*"wolf" + 0.013*"Hopper" + 0.005*"sent" + 0.005*"dream"')
(9, '0.029*"Perrin" + 0.016*"wolf" + 0.011*"Hopper" + 0.009*"Elayne" + 0.007*"Ituralde"')
(10, '0.015*"not" + 0.010*"one" + 0.010*"said" + 0.009*"would" + 0.007*"like"')
(11, '0.010*"not" + 0.008*"said" + 0.007*"Mr" + 0.006*"one" + 0.005*"would"')


In [9]:
def get_topic_distribution(text):
    """Return the LDA topic mixture for a single document.

    ``text`` is run through ``preprocess_text`` with the notebook's stopword
    list, converted to a bag-of-words vector with ``dictionary`` and then
    scored by ``lda_model``.

    Returns:
        The result of ``lda_model.get_document_topics`` for that vector.
    """
    tokens = preprocess_text(text, all_stopwords)
    return lda_model.get_document_topics(dictionary.doc2bow(tokens))

# Attach each chapter's topic mixture as a new column and preview it next to the author
dataset["topic_distribution"] = dataset["Text"].map(get_topic_distribution)
print(dataset.loc[:, ["Author", "topic_distribution"]].head())

              Author                   topic_distribution
0  Brandon Sanderson                     [(7, 0.9996418)]
1  Brandon Sanderson  [(4, 0.9804581), (10, 0.018887423)]
2  Brandon Sanderson                     [(2, 0.9989753)]
3  Brandon Sanderson                    [(10, 0.9996758)]
4  Brandon Sanderson                    [(2, 0.99958545)]


In [10]:
# Compute the c_v coherence score of the trained topics against the tokenised chapters.
# CoherenceModel is imported in the imports cell at the top of the notebook.
coherence_model_lda = CoherenceModel(model=lda_model, texts=dataset['Text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f"Coherence Score: {coherence_lda}")

Coherence Score: 0.28762977194970535


In [11]:
# Enable pyLDAvis notebook mode before building the visualisation
pyLDAvis.enable_notebook()

# Prepare the visualisation from the trained LDA model, the bag-of-words corpus
# and the dictionary built earlier in this notebook, then display it
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)