## Load the data

In [131]:
# Load text data from output.txt
with open('Unstructured_Corpora.txt', 'r') as f:    
    text_data = [line.rstrip() for line in f.readlines()]

# Pre-Processing

## Punctuation and Tokenization

In [132]:
import re

# Define a regular expression pattern to match punctuation
punct_pattern = r'[^\w\s]|_'

# Define a list to store the tokenized documents
tokens = []

# Tokenize each document
for doc in text_data:
    # Remove punctuation using re.sub
    doc = re.sub(punct_pattern, '', doc)
    
    # Tokenize the document using word_tokenize from NLTK
    doc_tokens = word_tokenize(doc)
    
    # Append the tokens to the list
    tokens.append(doc_tokens)


## Stop word removal

In [133]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

filtered_docs = [[token for token in doc if token not in stop_words] for doc in tokens]

## Stemming

In [134]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

stemmed_docs = [[stemmer.stem(token) for token in doc] for doc in filtered_docs]


## Bag of words

In [135]:
import gensim

# Create a dictionary from the stemmed documents
dictionary = gensim.corpora.Dictionary(stemmed_docs)

# Create a gensim corpus from the stemmed documents
corpus = [dictionary.doc2bow(doc) for doc in stemmed_docs]

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:built Dictionary(126747 unique tokens: ['15', '2', '2023', '2023a', '2023explor']...) from 6055 documents (total 3940529 corpus positions)
INFO:gensim.utils:Dictionary lifecycle event {'msg': "built Dictionary(126747 unique tokens: ['15', '2', '2023', '2023a', '2023explor']...) from 6055 documents (total 3940529 corpus positions)", 'datetime': '2023-05-04T13:23:33.649981', 'gensim': '4.1.2', 'python': '3.9.15 (main, Nov 24 2022, 14:31:59) \n[GCC 11.2.0]', 'platform': 'Linux-5.13.0-1025-aws-x86_64-with-glibc2.31', 'event': 'created'}


# LDA

In [136]:
%%capture
# Train an LDA model on the corpus
num_topics = 10
lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)


INFO:gensim.models.ldamodel:using symmetric alpha at 0.1
INFO:gensim.models.ldamodel:using symmetric eta at 0.1
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamodel:running online (multi-pass) LDA training, 10 topics, 10 passes over the supplied corpus of 6055 documents, updating model once every 2000 documents, evaluating perplexity every 6055 documents, iterating 50x with a convergence threshold of 0.001000
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #2000/6055
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 6055 documents
INFO:gensim.models.ldamodel:topic #8 (0.100): 0.011*"use" + 0.009*"page" + 0.008*"set" + 0.008*"improv" + 0.008*"cooki" + 0.008*"agenc" + 0.007*"addit" + 0.007*"search" + 0.007*"help" + 0.007*"govuk"
INFO:gensim.models.ldamodel:topic #1 (0.100): 0.012*"govuk" + 0.011*"use" + 0.010*"page" + 0.009*"improv" + 0.008*"addit" + 0.008*"help" + 0.007*"search" + 0.007*"cooki" + 0.006*"gov

INFO:gensim.models.ldamodel:topic #1 (0.100): 0.013*"uk" + 0.011*"govuk" + 0.010*"use" + 0.010*"cooki" + 0.009*"govern" + 0.009*"page" + 0.008*"help" + 0.008*"addit" + 0.008*"set" + 0.008*"support"
INFO:gensim.models.ldamodel:topic #6 (0.100): 0.012*"govern" + 0.012*"cooki" + 0.012*"use" + 0.011*"govuk" + 0.010*"new" + 0.009*"set" + 0.009*"help" + 0.009*"page" + 0.008*"search" + 0.008*"us"
INFO:gensim.models.ldamodel:topic #7 (0.100): 0.014*"health" + 0.010*"new" + 0.009*"govern" + 0.008*"use" + 0.008*"cooki" + 0.007*"govuk" + 0.007*"help" + 0.007*"page" + 0.007*"set" + 0.006*"improv"
INFO:gensim.models.ldamodel:topic #4 (0.100): 0.016*"cooki" + 0.016*"govuk" + 0.015*"use" + 0.012*"page" + 0.012*"addit" + 0.012*"set" + 0.011*"search" + 0.010*"improv" + 0.010*"help" + 0.009*"govern"
INFO:gensim.models.ldamodel:topic diff=0.259068, rho=0.445989
INFO:gensim.models.ldamodel:-7.828 per-word bound, 227.3 perplexity estimate based on a held-out corpus of 55 documents with 35581 words
INFO:gen

INFO:gensim.models.ldamodel:topic #8 (0.100): 0.008*"i" + 0.008*"climat" + 0.008*"water" + 0.008*"environ" + 0.007*"need" + 0.006*"chang" + 0.006*"use" + 0.005*"also" + 0.005*"work" + 0.005*"govern"
INFO:gensim.models.ldamodel:topic diff=0.218794, rho=0.377224
INFO:gensim.models.ldamodel:PROGRESS: pass 3, at document #4000/6055
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 6055 documents
INFO:gensim.models.ldamodel:topic #8 (0.100): 0.008*"climat" + 0.008*"water" + 0.008*"i" + 0.007*"environ" + 0.007*"need" + 0.007*"chang" + 0.006*"use" + 0.005*"work" + 0.005*"also" + 0.005*"govern"
INFO:gensim.models.ldamodel:topic #6 (0.100): 0.013*"use" + 0.012*"govuk" + 0.012*"govern" + 0.012*"cooki" + 0.011*"new" + 0.010*"page" + 0.010*"set" + 0.010*"help" + 0.009*"us" + 0.009*"search"
INFO:gensim.models.ldamodel:topic #7 (0.100): 0.019*"health" + 0.009*"regul" + 0.009*"new" + 0.008*"use" + 0.008*"air" + 0.008*"medic" + 0.007*"help" + 0.007*"cooki" + 0.006*"govern

INFO:gensim.models.ldamodel:topic #2 (0.100): 0.013*"new" + 0.010*"improv" + 0.010*"open" + 0.010*"use" + 0.009*"local" + 0.008*"project" + 0.008*"england" + 0.008*"page" + 0.007*"govuk" + 0.007*"help"
INFO:gensim.models.ldamodel:topic #4 (0.100): 0.017*"cooki" + 0.017*"govuk" + 0.016*"use" + 0.013*"page" + 0.013*"set" + 0.012*"search" + 0.012*"addit" + 0.010*"improv" + 0.010*"help" + 0.009*"govern"
INFO:gensim.models.ldamodel:topic #7 (0.100): 0.013*"health" + 0.011*"regul" + 0.010*"new" + 0.009*"medic" + 0.008*"use" + 0.008*"devic" + 0.007*"air" + 0.006*"help" + 0.006*"govuk" + 0.006*"cooki"
INFO:gensim.models.ldamodel:topic #9 (0.100): 0.024*"railway" + 0.023*"rail" + 0.019*"hs2" + 0.018*"speed" + 0.010*"erg" + 0.008*"line" + 0.008*"phase" + 0.008*"billion" + 0.007*"librari" + 0.006*"suiss"
INFO:gensim.models.ldamodel:topic diff=0.211462, rho=0.352947
INFO:gensim.models.ldamodel:PROGRESS: pass 5, at document #2000/6055
INFO:gensim.models.ldamodel:merging changes from 2000 documents 

INFO:gensim.models.ldamodel:topic diff=0.093474, rho=0.315794
INFO:gensim.models.ldamodel:PROGRESS: pass 6, at document #6000/6055
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 6055 documents
INFO:gensim.models.ldamodel:topic #9 (0.100): 0.030*"hs2" + 0.019*"rail" + 0.014*"railway" + 0.013*"librari" + 0.012*"phase" + 0.012*"speed" + 0.006*"dementia" + 0.006*"freight" + 0.005*"ltd" + 0.005*"billion"
INFO:gensim.models.ldamodel:topic #5 (0.100): 0.021*"natur" + 0.009*"england" + 0.009*"help" + 0.009*"tree" + 0.008*"wildlif" + 0.008*"plant" + 0.007*"new" + 0.007*"improv" + 0.007*"habitat" + 0.007*"govuk"
INFO:gensim.models.ldamodel:topic #8 (0.100): 0.008*"climat" + 0.007*"environ" + 0.007*"i" + 0.007*"chang" + 0.006*"need" + 0.006*"govern" + 0.006*"work" + 0.005*"also" + 0.005*"year" + 0.005*"water"
INFO:gensim.models.ldamodel:topic #3 (0.100): 0.028*"flood" + 0.011*"agenc" + 0.011*"use" + 0.011*"govuk" + 0.011*"cooki" + 0.010*"page" + 0.009*"help" + 0.0

INFO:gensim.models.ldamodel:topic #9 (0.100): 0.019*"rail" + 0.018*"hs2" + 0.018*"librari" + 0.017*"railway" + 0.013*"speed" + 0.009*"billion" + 0.009*"line" + 0.007*"phase" + 0.007*"erg" + 0.006*"dementia"
INFO:gensim.models.ldamodel:topic #1 (0.100): 0.014*"uk" + 0.012*"govuk" + 0.012*"cooki" + 0.011*"use" + 0.010*"govern" + 0.010*"page" + 0.009*"set" + 0.009*"addit" + 0.009*"help" + 0.008*"support"
INFO:gensim.models.ldamodel:topic #2 (0.100): 0.013*"new" + 0.010*"improv" + 0.010*"open" + 0.010*"local" + 0.009*"project" + 0.009*"use" + 0.008*"england" + 0.007*"page" + 0.007*"help" + 0.007*"govuk"
INFO:gensim.models.ldamodel:topic diff=0.153717, rho=0.288345
INFO:gensim.models.ldamodel:PROGRESS: pass 8, at document #4000/6055
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 6055 documents
INFO:gensim.models.ldamodel:topic #6 (0.100): 0.013*"use" + 0.013*"govuk" + 0.013*"cooki" + 0.012*"govern" + 0.011*"new" + 0.011*"page" + 0.010*"set" + 0.010*"help" + 

INFO:gensim.models.ldamodel:-7.157 per-word bound, 142.7 perplexity estimate based on a held-out corpus of 55 documents with 35581 words
INFO:gensim.models.ldamodel:PROGRESS: pass 9, at document #6055/6055
INFO:gensim.models.ldamodel:merging changes from 55 documents into a model of 6055 documents
INFO:gensim.models.ldamodel:topic #1 (0.100): 0.014*"uk" + 0.012*"govuk" + 0.012*"cooki" + 0.011*"use" + 0.010*"govern" + 0.010*"page" + 0.009*"set" + 0.009*"addit" + 0.009*"help" + 0.008*"search"
INFO:gensim.models.ldamodel:topic #5 (0.100): 0.018*"natur" + 0.009*"help" + 0.009*"forest" + 0.008*"england" + 0.008*"tree" + 0.007*"wildlif" + 0.007*"woodland" + 0.007*"plant" + 0.007*"govuk" + 0.007*"improv"
INFO:gensim.models.ldamodel:topic #8 (0.100): 0.007*"climat" + 0.007*"i" + 0.007*"environ" + 0.007*"chang" + 0.006*"govern" + 0.006*"need" + 0.006*"work" + 0.006*"also" + 0.005*"year" + 0.005*"make"
INFO:gensim.models.ldamodel:topic #0 (0.100): 0.017*"busi" + 0.012*"innov" + 0.011*"support" +

In [137]:
%%capture
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Convert the gensim LDA model to a format that pyLDAvis can use
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

  -12.15087364]
 [ -5.88574248  -7.36145454  -6.20580701 ...  -6.47788808  -6.11558862
  -12.14815904]
 [ -8.63291732  -7.96308229  -9.70172735 ...  -9.28670046 -14.35622578
  -12.15100739]
 ...
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.1510877 ]
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.1510877 ]
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.15087364]
 [ -5.88574248  -7.36145454  -6.20580701 ...  -6.47788808  -6.11558862
  -12.14815904]
 [ -8.63291732  -7.96308229  -9.70172735 ...  -9.28670046 -14.35622578
  -12.15100739]
 ...
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.1510877 ]
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.1510877 ]
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.15087364]
 [ -5.88574248  -7.36145454  -6.20580701 ...  -6.47788808  -6.11558862
  -12.14815904]
 [ -8.63291732  -

  -12.15087364]
 [ -5.88574248  -7.36145454  -6.20580701 ...  -6.47788808  -6.11558862
  -12.14815904]
 [ -8.63291732  -7.96308229  -9.70172735 ...  -9.28670046 -14.35622578
  -12.15100739]
 ...
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.1510877 ]
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.1510877 ]
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -3.96585057]
 [ 0.3375322  -1.13817986  0.01746767 ... -0.25461341  0.10768606
  -5.92488436]
 [ 0.27563538  0.94547041 -0.79317464 ... -0.37814775 -5.44767307
  -3.24245469]
 ...
 [-0.77324685 -0.01140168  0.08729064 ...  0.55605563  0.86004653
   3.06805044]
 [-0.77324685 -0.01140168  0.08729064 ...  0.55605563  0.86004653
   3.06805044]
 [-0.77324685 -0.01140168  0.08729064 ...  0.55605563  0.86004653
  -12.15087364]
 [ -5.88574248  -7.36145454  -6.20580701 ...  -6.47788808  -6.11558862
  -12.14815904]
 [ -8.63291732  -7.96308229  -9.70172735 ...  -

  -12.15087364]
 [ -5.88574248  -7.36145454  -6.20580701 ...  -6.47788808  -6.11558862
  -12.14815904]
 [ -8.63291732  -7.96308229  -9.70172735 ...  -9.28670046 -14.35622578
  -12.15100739]
 ...
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.1510877 ]
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.1510877 ]
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -3.96585057]
 [ 0.3375322  -1.13817986  0.01746767 ... -0.25461341  0.10768606
  -5.92488436]
 [ 0.27563538  0.94547041 -0.79317464 ... -0.37814775 -5.44767307
  -3.24245469]
 ...
 [-0.77324685 -0.01140168  0.08729064 ...  0.55605563  0.86004653
   3.06805044]
 [-0.77324685 -0.01140168  0.08729064 ...  0.55605563  0.86004653
   3.06805044]
 [-0.77324685 -0.01140168  0.08729064 ...  0.55605563  0.86004653
  -12.15087364]
 [ -5.88574248  -7.36145454  -6.20580701 ...  -6.47788808  -6.11558862
  -12.14815904]
 [ -8.63291732  -7.96308229  -9.70172735 ...  -

  -12.15087364]
 [ -5.88574248  -7.36145454  -6.20580701 ...  -6.47788808  -6.11558862
  -12.14815904]
 [ -8.63291732  -7.96308229  -9.70172735 ...  -9.28670046 -14.35622578
  -12.15100739]
 ...
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.1510877 ]
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.1510877 ]
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -3.96585057]
 [ 0.3375322  -1.13817986  0.01746767 ... -0.25461341  0.10768606
  -5.92488436]
 [ 0.27563538  0.94547041 -0.79317464 ... -0.37814775 -5.44767307
  -3.24245469]
 ...
 [-0.77324685 -0.01140168  0.08729064 ...  0.55605563  0.86004653
   3.06805044]
 [-0.77324685 -0.01140168  0.08729064 ...  0.55605563  0.86004653
   3.06805044]
 [-0.77324685 -0.01140168  0.08729064 ...  0.55605563  0.86004653
  -12.15087364]
 [ -5.88574248  -7.36145454  -6.20580701 ...  -6.47788808  -6.11558862
  -12.14815904]
 [ -8.63291732  -7.96308229  -9.70172735 ...  -

  -12.15087364]
 [ -5.88574248  -7.36145454  -6.20580701 ...  -6.47788808  -6.11558862
  -12.14815904]
 [ -8.63291732  -7.96308229  -9.70172735 ...  -9.28670046 -14.35622578
  -12.15100739]
 ...
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.1510877 ]
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -12.1510877 ]
 [-15.992385   -15.23053983 -15.1318475  ... -14.66308252 -14.35909161
  -3.96585057]
 [ 0.3375322  -1.13817986  0.01746767 ... -0.25461341  0.10768606
  -5.92488436]
 [ 0.27563538  0.94547041 -0.79317464 ... -0.37814775 -5.44767307
  -3.24245469]
 ...
 [-0.77324685 -0.01140168  0.08729064 ...  0.55605563  0.86004653
   3.06805044]
 [-0.77324685 -0.01140168  0.08729064 ...  0.55605563  0.86004653
   3.06805044]
 [-0.77324685 -0.01140168  0.08729064 ...  0.55605563  0.86004653


In [138]:
pyLDAvis.display(vis_data)