## Load the data

In [139]:
# Load text data from output.txt
with open('Unstructured_Corpora.txt', 'r') as f:    
    text_data = [line.rstrip() for line in f.readlines()]

# Pre-Processing

## Punctuation and Tokenization

In [140]:
import re

# Define a regular expression pattern to match punctuation
punct_pattern = r'[^\w\s]|_'

# Define a list to store the tokenized documents
tokens = []

# Tokenize each document
for doc in text_data:
    # Remove punctuation using re.sub
    doc = re.sub(punct_pattern, '', doc)
    
    # Tokenize the document using word_tokenize from NLTK
    doc_tokens = word_tokenize(doc)
    
    # Append the tokens to the list
    tokens.append(doc_tokens)


## Stop word removal

In [141]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

filtered_docs = [[token for token in doc if token not in stop_words] for doc in tokens]

## Stemming

In [142]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

stemmed_docs = [[stemmer.stem(token) for token in doc] for doc in filtered_docs]


## Bag of words

In [143]:
import gensim

# Create a dictionary from the stemmed documents
dictionary = gensim.corpora.Dictionary(stemmed_docs)

# Create a gensim corpus from the stemmed documents
corpus = [dictionary.doc2bow(doc) for doc in stemmed_docs]

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:built Dictionary(126747 unique tokens: ['15', '2', '2023', '2023a', '2023explor']...) from 6055 documents (total 3940529 corpus positions)
INFO:gensim.utils:Dictionary lifecycle event {'msg': "built Dictionary(126747 unique tokens: ['15', '2', '2023', '2023a', '2023explor']...) from 6055 documents (total 3940529 corpus positions)", 'datetime': '2023-05-05T13:52:45.922248', 'gensim': '4.1.2', 'python': '3.9.15 (main, Nov 24 2022, 14:31:59) \n[GCC 11.2.0]', 'platform': 'Linux-5.13.0-1025-aws-x86_64-with-glibc2.31', 'event': 'created'}


# LDA

In [None]:
%%capture
# Train an LDA model on the corpus
num_topics = 10
lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)


In [None]:
%%capture
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Convert the gensim LDA model to a format that pyLDAvis can use
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

In [146]:
pyLDAvis.display(vis_data)