In [122]:
%%capture
!pip install lda
!pip install NLTK
!pip install pyLDAvis

## Load the data

In [123]:
# Load text data from output.txt
with open('output.txt', 'r') as f:    
    text_data = [line.rstrip() for line in f.readlines()]

# Pre-Processing

## Punctuation and Tokenization

In [124]:
import re

# Define a regular expression pattern to match punctuation
punct_pattern = r'[^\w\s]|_'

# Define a list to store the tokenized documents
tokens = []

# Tokenize each document
for doc in text_data:
    # Remove punctuation using re.sub
    doc = re.sub(punct_pattern, '', doc)
    
    # Tokenize the document using word_tokenize from NLTK
    doc_tokens = word_tokenize(doc)
    
    # Append the tokens to the list
    tokens.append(doc_tokens)


## Stop word removal

In [125]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

filtered_docs = [[token for token in doc if token not in stop_words] for doc in tokens]

## Stemming

In [126]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

stemmed_docs = [[stemmer.stem(token) for token in doc] for doc in filtered_docs]


## Bag of words

In [127]:
import gensim

# Create a dictionary from the stemmed documents
dictionary = gensim.corpora.Dictionary(stemmed_docs)

# Create a gensim corpus from the stemmed documents
corpus = [dictionary.doc2bow(doc) for doc in stemmed_docs]

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:built Dictionary(123937 unique tokens: ['15', '2', '2023', '2023a', '2023explor']...) from 6340 documents (total 3820146 corpus positions)
INFO:gensim.utils:Dictionary lifecycle event {'msg': "built Dictionary(123937 unique tokens: ['15', '2', '2023', '2023a', '2023explor']...) from 6340 documents (total 3820146 corpus positions)", 'datetime': '2023-05-03T11:22:46.265557', 'gensim': '4.1.2', 'python': '3.9.15 (main, Nov 24 2022, 14:31:59) \n[GCC 11.2.0]', 'platform': 'Linux-5.13.0-1025-aws-x86_64-with-glibc2.31', 'event': 'created'}


In [128]:
%%capture
# Train an LDA model on the corpus
num_topics = 10
lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)


INFO:gensim.models.ldamodel:using symmetric alpha at 0.1
INFO:gensim.models.ldamodel:using symmetric eta at 0.1
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamodel:running online (multi-pass) LDA training, 10 topics, 10 passes over the supplied corpus of 6340 documents, updating model once every 2000 documents, evaluating perplexity every 6340 documents, iterating 50x with a convergence threshold of 0.001000
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #2000/6340
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 6340 documents
INFO:gensim.models.ldamodel:topic #2 (0.100): 0.013*"use" + 0.009*"govuk" + 0.009*"page" + 0.009*"new" + 0.008*"cooki" + 0.008*"help" + 0.008*"environ" + 0.008*"addit" + 0.008*"search" + 0.008*"improv"
INFO:gensim.models.ldamodel:topic #7 (0.100): 0.010*"cooki" + 0.009*"page" + 0.009*"improv" + 0.009*"use" + 0.009*"environ" + 0.007*"govuk" + 0.007*"addit" + 0.006*"flood" + 0.006*

INFO:gensim.models.ldamodel:topic #0 (0.100): 0.008*"i" + 0.007*"govern" + 0.007*"chang" + 0.007*"climat" + 0.007*"use" + 0.006*"help" + 0.006*"food" + 0.006*"need" + 0.005*"year" + 0.005*"busi"
INFO:gensim.models.ldamodel:topic #7 (0.100): 0.006*"plastic" + 0.003*"selfdriv" + 0.003*"rover" + 0.003*"mar" + 0.003*"environ" + 0.003*"cooki" + 0.002*"use" + 0.002*"page" + 0.002*"improv" + 0.002*"govuk"
INFO:gensim.models.ldamodel:topic #2 (0.100): 0.010*"use" + 0.009*"new" + 0.009*"improv" + 0.009*"help" + 0.008*"page" + 0.008*"water" + 0.008*"govuk" + 0.007*"environ" + 0.007*"fish" + 0.007*"cooki"
INFO:gensim.models.ldamodel:topic #9 (0.100): 0.017*"govuk" + 0.015*"cooki" + 0.014*"use" + 0.013*"flood" + 0.012*"addit" + 0.012*"page" + 0.012*"search" + 0.011*"set" + 0.010*"help" + 0.010*"improv"
INFO:gensim.models.ldamodel:topic diff=0.242066, rho=0.439799
INFO:gensim.models.ldamodel:-8.001 per-word bound, 256.3 perplexity estimate based on a held-out corpus of 340 documents with 217938 wor

INFO:gensim.models.ldamodel:topic #0 (0.100): 0.010*"i" + 0.007*"food" + 0.007*"govern" + 0.006*"need" + 0.006*"chang" + 0.006*"year" + 0.006*"work" + 0.005*"help" + 0.005*"make" + 0.005*"climat"
INFO:gensim.models.ldamodel:topic #2 (0.100): 0.010*"water" + 0.010*"use" + 0.009*"improv" + 0.009*"environ" + 0.009*"help" + 0.008*"new" + 0.008*"fish" + 0.008*"page" + 0.008*"govuk" + 0.007*"natur"
INFO:gensim.models.ldamodel:topic diff=0.205067, rho=0.373457
INFO:gensim.models.ldamodel:PROGRESS: pass 3, at document #4000/6340
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 6340 documents
INFO:gensim.models.ldamodel:topic #5 (0.100): 0.016*"govuk" + 0.014*"use" + 0.013*"cooki" + 0.013*"page" + 0.013*"govern" + 0.012*"addit" + 0.012*"search" + 0.010*"help" + 0.009*"improv" + 0.009*"us"
INFO:gensim.models.ldamodel:topic #1 (0.100): 0.020*"wast" + 0.011*"environ" + 0.011*"use" + 0.011*"agenc" + 0.009*"govuk" + 0.009*"page" + 0.008*"cooki" + 0.007*"illeg" + 0.007*

INFO:gensim.models.ldamodel:merging changes from 340 documents into a model of 6340 documents
INFO:gensim.models.ldamodel:topic #2 (0.100): 0.009*"use" + 0.009*"improv" + 0.009*"water" + 0.009*"help" + 0.009*"new" + 0.008*"england" + 0.008*"natur" + 0.008*"page" + 0.007*"govuk" + 0.007*"environ"
INFO:gensim.models.ldamodel:topic #9 (0.100): 0.018*"flood" + 0.014*"govuk" + 0.013*"use" + 0.012*"cooki" + 0.011*"page" + 0.011*"addit" + 0.010*"search" + 0.010*"help" + 0.009*"improv" + 0.009*"new"
INFO:gensim.models.ldamodel:topic #0 (0.100): 0.011*"i" + 0.008*"food" + 0.007*"govern" + 0.006*"year" + 0.006*"need" + 0.006*"work" + 0.006*"busi" + 0.005*"chang" + 0.005*"help" + 0.005*"make"
INFO:gensim.models.ldamodel:topic #8 (0.100): 0.011*"use" + 0.010*"govuk" + 0.009*"new" + 0.009*"help" + 0.008*"page" + 0.008*"cooki" + 0.008*"support" + 0.008*"govern" + 0.008*"improv" + 0.007*"addit"
INFO:gensim.models.ldamodel:topic #3 (0.100): 0.020*"uk" + 0.010*"govern" + 0.008*"govuk" + 0.008*"use" + 0

INFO:gensim.models.ldamodel:topic #7 (0.100): 0.006*"de" + 0.004*"la" + 0.003*"mar" + 0.003*"rover" + 0.003*"sikh" + 0.002*"interreg" + 0.002*"brest" + 0.002*"douarnenez" + 0.002*"recherch" + 0.002*"plastic"
INFO:gensim.models.ldamodel:topic #5 (0.100): 0.019*"govuk" + 0.017*"cooki" + 0.016*"use" + 0.014*"page" + 0.014*"search" + 0.014*"addit" + 0.013*"govern" + 0.011*"set" + 0.011*"help" + 0.010*"improv"
INFO:gensim.models.ldamodel:topic diff=0.082082, rho=0.313574
INFO:gensim.models.ldamodel:PROGRESS: pass 6, at document #6000/6340
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 6340 documents
INFO:gensim.models.ldamodel:topic #2 (0.100): 0.011*"water" + 0.009*"improv" + 0.009*"use" + 0.009*"natur" + 0.009*"help" + 0.009*"environ" + 0.009*"new" + 0.008*"fish" + 0.008*"england" + 0.007*"page"
INFO:gensim.models.ldamodel:topic #8 (0.100): 0.011*"use" + 0.009*"govuk" + 0.009*"new" + 0.008*"help" + 0.008*"support" + 0.008*"page" + 0.008*"govern" + 0.008*"c

INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 6340 documents
INFO:gensim.models.ldamodel:topic #4 (0.100): 0.027*"right" + 0.020*"plastic" + 0.019*"human" + 0.017*"un" + 0.011*"peac" + 0.011*"ambassador" + 0.011*"we" + 0.011*"secur" + 0.010*"unit" + 0.009*"continu"
INFO:gensim.models.ldamodel:topic #7 (0.100): 0.009*"de" + 0.006*"la" + 0.003*"plastic" + 0.003*"interreg" + 0.003*"ppp" + 0.003*"recherch" + 0.003*"rover" + 0.002*"brest" + 0.002*"douarnenez" + 0.002*"françai"
INFO:gensim.models.ldamodel:topic #2 (0.100): 0.012*"water" + 0.010*"environ" + 0.010*"improv" + 0.009*"use" + 0.009*"natur" + 0.009*"help" + 0.008*"fish" + 0.008*"new" + 0.008*"england" + 0.007*"govuk"
INFO:gensim.models.ldamodel:topic #9 (0.100): 0.027*"flood" + 0.012*"use" + 0.012*"govuk" + 0.011*"agenc" + 0.010*"page" + 0.010*"cooki" + 0.009*"help" + 0.009*"environ" + 0.009*"addit" + 0.008*"improv"
INFO:gensim.models.ldamodel:topic #8 (0.100): 0.011*"use" + 0.009*"new" + 0.009*"go

INFO:gensim.models.ldamodel:topic #9 (0.100): 0.028*"flood" + 0.011*"use" + 0.011*"govuk" + 0.011*"agenc" + 0.010*"page" + 0.009*"cooki" + 0.009*"help" + 0.008*"environ" + 0.008*"addit" + 0.008*"new"
INFO:gensim.models.ldamodel:topic #2 (0.100): 0.012*"water" + 0.010*"natur" + 0.009*"improv" + 0.009*"environ" + 0.009*"use" + 0.009*"help" + 0.009*"new" + 0.008*"england" + 0.008*"fish" + 0.007*"page"
INFO:gensim.models.ldamodel:topic diff=0.076325, rho=0.275554
INFO:gensim.models.ldamodel:-7.854 per-word bound, 231.4 perplexity estimate based on a held-out corpus of 340 documents with 217938 words
INFO:gensim.models.ldamodel:PROGRESS: pass 9, at document #6340/6340
INFO:gensim.models.ldamodel:merging changes from 340 documents into a model of 6340 documents
INFO:gensim.models.ldamodel:topic #4 (0.100): 0.028*"right" + 0.019*"human" + 0.017*"un" + 0.014*"ambassador" + 0.014*"plastic" + 0.012*"secur" + 0.012*"peac" + 0.012*"we" + 0.010*"unit" + 0.010*"continu"
INFO:gensim.models.ldamodel:t

In [129]:
%%capture
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Convert the gensim LDA model to a format that pyLDAvis can use
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

  -11.78310733]
 [ -5.88535026  -5.98754061  -6.53859889 ... -12.94350594 -11.82586693
  -11.7829373 ]
 [ -9.6083178   -8.52800827  -8.3673583  ... -13.261084   -11.8259431
  -11.78309779]
 ...
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -11.78312544]
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -11.78312544]
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -3.5855171 ]
 [ 0.32064701  0.21845665 -0.33260163 ... -6.73750868 -5.61986966
  -5.57694003]
 [-0.63649155  0.44381798  0.60446795 ... -4.28925775 -2.85411686
  -2.81127155]
 ...
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
   3.53730652]
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
   3.53730652]
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
  -11.78310733]
 [ -5.88535026  -5.98754061  -6.53859889 ... -12.94350594 -11.82586693
  -11.7829373 ]
 [ -9.6083178   -8.52800827  -8.3673583  ... -13.26

  -11.78310733]
 [ -5.88535026  -5.98754061  -6.53859889 ... -12.94350594 -11.82586693
  -11.7829373 ]
 [ -9.6083178   -8.52800827  -8.3673583  ... -13.261084   -11.8259431
  -11.78309779]
 ...
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -11.78312544]
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -11.78312544]
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -3.5855171 ]
 [ 0.32064701  0.21845665 -0.33260163 ... -6.73750868 -5.61986966
  -5.57694003]
 [-0.63649155  0.44381798  0.60446795 ... -4.28925775 -2.85411686
  -2.81127155]
 ...
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
   3.53730652]
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
   3.53730652]
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
  -11.78310733]
 [ -5.88535026  -5.98754061  -6.53859889 ... -12.94350594 -11.82586693
  -11.7829373 ]
 [ -9.6083178   -8.52800827  -8.3673583  ... -13.26

  -11.78310733]
 [ -5.88535026  -5.98754061  -6.53859889 ... -12.94350594 -11.82586693
  -11.7829373 ]
 [ -9.6083178   -8.52800827  -8.3673583  ... -13.261084   -11.8259431
  -11.78309779]
 ...
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -11.78312544]
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -11.78312544]
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -3.5855171 ]
 [ 0.32064701  0.21845665 -0.33260163 ... -6.73750868 -5.61986966
  -5.57694003]
 [-0.63649155  0.44381798  0.60446795 ... -4.28925775 -2.85411686
  -2.81127155]
 ...
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
   3.53730652]
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
   3.53730652]
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
  -11.78310733]
 [ -5.88535026  -5.98754061  -6.53859889 ... -12.94350594 -11.82586693
  -11.7829373 ]
 [ -9.6083178   -8.52800827  -8.3673583  ... -13.26

  -11.78310733]
 [ -5.88535026  -5.98754061  -6.53859889 ... -12.94350594 -11.82586693
  -11.7829373 ]
 [ -9.6083178   -8.52800827  -8.3673583  ... -13.261084   -11.8259431
  -11.78309779]
 ...
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -11.78312544]
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -11.78312544]
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -3.5855171 ]
 [ 0.32064701  0.21845665 -0.33260163 ... -6.73750868 -5.61986966
  -5.57694003]
 [-0.63649155  0.44381798  0.60446795 ... -4.28925775 -2.85411686
  -2.81127155]
 ...
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
   3.53730652]
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
   3.53730652]
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
  -11.78310733]
 [ -5.88535026  -5.98754061  -6.53859889 ... -12.94350594 -11.82586693
  -11.7829373 ]
 [ -9.6083178   -8.52800827  -8.3673583  ... -13.26

  -11.78310733]
 [ -5.88535026  -5.98754061  -6.53859889 ... -12.94350594 -11.82586693
  -11.7829373 ]
 [ -9.6083178   -8.52800827  -8.3673583  ... -13.261084   -11.8259431
  -11.78309779]
 ...
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -11.78312544]
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -11.78312544]
 [-16.24521357 -15.79260137 -15.20235736 ... -13.26312494 -11.82595156
  -3.5855171 ]
 [ 0.32064701  0.21845665 -0.33260163 ... -6.73750868 -5.61986966
  -5.57694003]
 [-0.63649155  0.44381798  0.60446795 ... -4.28925775 -2.85411686
  -2.81127155]
 ...
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
   3.53730652]
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804
   3.53730652]
 [-0.9247816  -0.4721694   0.1180746  ...  2.05730703  3.4944804


In [130]:
pyLDAvis.display(vis_data)