<a href="https://colab.research.google.com/github/KristianMiok/Parliamentary-Discourse/blob/main/UK_Tot_Vizz_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# To be run only once
if 0 == 1:
    !pip install gensim
    !pip install PyLDAvis
    !pip install spacy
    !python -m spacy download en_core_web_sm

In [None]:
# Mount Google Drive so the data and output paths under /content/gdrive resolve.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/gdrive


In [None]:
# Load the speech corpus from Google Drive into a DataFrame.
import pandas as pd
from sklearn.utils import shuffle

# Tab-separated, UTF-8 encoded text file.
CORPUS_PATH = '/content/gdrive/My Drive/Text Summarization/UK/Data/total_80.txt'
df = pd.read_csv(CORPUS_PATH, sep='\t', encoding="utf-8")

In [None]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(79997, 20)

In [None]:
# List the available columns; later cells use 'Speaker_type', 'Speaker_role' and 'V2'.
df.columns

Index(['ID', 'Title', 'From', 'To', 'House', 'Term', 'Session', 'Meeting',
       'Sitting', 'Agenda', 'Subcorpus', 'Speaker_role', 'Speaker_type',
       'Speaker_party', 'Speaker_party_name', 'Party_status', 'Speaker_name',
       'Speaker_gender', 'Speaker_birth', 'V2'],
      dtype='object')

In [None]:
# Distribution of speaker roles, inspected before filtering on "Regular" below.
df['Speaker_role'].value_counts()

Regular    79997
Name: Speaker_role, dtype: int64

In [None]:
# Keep only the rows whose speaker type is "MP".
is_mp = df['Speaker_type'] == "MP"
df1 = df.loc[is_mp]

In [None]:
# From the MP rows, keep only those with the "Regular" speaker role.
is_regular = df1['Speaker_role'] == "Regular"
df2 = df1.loc[is_regular]

In [None]:
# Row/column count after both filters.
df2.shape

(79997, 20)

In [None]:
# Take an independent copy: the cleaning cell adds a column to `papers`, and
# doing that on a frame obtained by filtering `df` triggers pandas'
# SettingWithCopyWarning and makes it ambiguous which object is modified.
papers = df2.copy()

In [None]:
# Cleaning: strip basic punctuation from the speech text and lowercase it.
# Load the regular expression library
import re

# Raw-string pattern. The previous non-raw '[,\.!?]' relied on the invalid
# string escape '\.'; inside a character class the dot needs no escaping, so
# this matches exactly the same characters: , . ! ?
PUNCTUATION_RE = re.compile(r'[,.!?]')

# Build the processed column with vectorized string operations.
# - fillna('')/astype(str) keep missing or non-string values in 'V2' from raising.
# - assign() returns a new frame, so no column is written onto a filtered slice.
#   Note that `papers` is rebound here; the frame it was derived from is not modified.
papers = papers.assign(
    paper_text_processed=(
        papers['V2']
        .fillna('')
        .astype(str)
        .str.replace(PUNCTUATION_RE, '', regex=True)
        .str.lower()
    )
)

# Print out the first rows of the processed text
papers['paper_text_processed'].head()

0    (urgent question): to ask the secretary of sta...
1    i thank the minister for that answer i presume...
2    my lords i declare my interests in the registe...
3    25 whether she plans to exclude international ...
4    i understand that people quite often want to b...
Name: paper_text_processed, dtype: object

In [None]:
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Every item of ``sentences`` is converted with ``str()`` and tokenized with
    ``deacc=True`` (accent marks are stripped from the tokens). One token list
    is yielded per input item, in input order.
    """
    for text in sentences:
        tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
        yield tokens

# Tokenize every cleaned speech: data_words holds one token list per row of `papers`.
data = papers['paper_text_processed'].tolist()
data_words = [tokens for tokens in sent_to_words(data)]

# Peek at the first 30 tokens of the first speech.
print(data_words[0][:30])

['urgent', 'question', 'to', 'ask', 'the', 'secretary', 'of', 'state', 'for', 'transport', 'if', 'he', 'will', 'make', 'statement', 'on', 'recent', 'changes', 'to', 'aviation', 'security']


In [None]:
# Phrase (collocation) models: `bigram` is trained on the token lists and
# `trigram` on the bigram-merged token lists.
PHRASE_THRESHOLD = 100  # higher threshold fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=PHRASE_THRESHOLD)
trigram = gensim.models.Phrases(bigram[data_words], threshold=PHRASE_THRESHOLD)

# Wrap each trained model in a Phraser, the faster way to get a sentence
# clubbed as a trigram/bigram.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [None]:
# NLTK English stop words, extended with corpus-specific terms.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Extra terms filtered out on top of the NLTK list. The order and the repeated
# entries are kept exactly as before, so len(stop_words) is unchanged.
custom_stop_words = [
    "say", "time", "want", "know", "friend", "come", "see", "right",
    "member", "today", "thing", "may", "year", "week", "put", "last",
    "make", "leave", "good", "day", "can", "speak", "great", "place",
    "thank", "hear", "must", "way", "go", "think", "debate", "lord",
    "member", "question", "plan", "could", "would", "business", "year", "people",
    "noble", "say", "government", "lord", "support", "country", "deal", "work",
    "pay", "many", "have", "give", "take", "make", "do", "get",
]

stop_words = stopwords.words('english')
stop_words.extend(custom_stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Display the combined stop-word list (NLTK entries followed by the custom additions).
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop every token that appears in the module-level ``stop_words`` list.

    Each item of ``texts`` is converted with ``str()`` and re-tokenized with
    ``simple_preprocess`` before filtering, exactly as before.

    NOTE(review): when an item is already a token list, ``str(doc)`` is the
    list's repr, so the tokens are recovered by re-tokenizing that repr. With
    gensim's default ``simple_preprocess`` length limits this may drop very
    short or very long tokens (e.g. long ``_``-joined phrases) — confirm this
    is intended.

    Returns:
        A list with one filtered token list per input item.
    """
    # Build the lookup set once per call: set membership is O(1), whereas the
    # previous code scanned the whole stop-word list for every single token.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phrase model to each document.

    Returns a list with one transformed document per item of ``texts``.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to each document.

    Both phrase models are read from module scope. Returns a list with one
    transformed document per item of ``texts``.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB')):
    """Lemmatize token lists with spaCy, keeping only selected parts of speech.

    Args:
        texts: iterable of token lists. Each list is joined with single spaces
            and processed by the module-level ``nlp`` pipeline.
        allowed_postags: coarse part-of-speech tags (``token.pos_`` values) to
            keep. See https://spacy.io/api/annotation for the tag set.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document,
        in input order.
    """
    keep = set(allowed_postags)
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the documents through the pipeline in batches instead
    # of paying the per-call overhead of nlp(...) for every single document.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep]
        for doc in nlp.pipe(joined_docs)
    ]

In [None]:
# First stop-word pass on the raw token lists (the preprocessing cell further
# down repeats this same assignment).
data_words_nostops = remove_stopwords(data_words)

In [None]:
# Confirm the container type returned by remove_stopwords.
type(data_words_nostops)

list

In [None]:
# Spot check: True if any document still contains the token "want" after filtering.
any("want" in w for w in data_words_nostops)

False

In [None]:
import spacy

# Preprocessing pipeline: stop-word removal -> bigram merging -> lemmatization.

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the small English spaCy model with the parser and NER components
# disabled (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Lemmatize, keeping only nouns, adjectives and verbs
KEPT_POS_TAGS = ['NOUN', 'ADJ', 'VERB']
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=KEPT_POS_TAGS)

# Peek at the first 30 lemmas of the first document.
print(data_lemmatized[0][:30])

['statement', 'recent', 'change', 'aviation', 'security']


In [None]:
# Second stop-word pass, this time on the lemmatized documents.
# NOTE(review): presumably needed because lemmatization can map inflected forms
# back onto base forms that are in the stop list — confirm.
data_lemmatized2 = remove_stopwords(data_lemmatized)

In [None]:
# Size of the stop-word list.
print(len(stop_words))
# Number of tokens removed by the second stop-word pass. The previous check
# compared len() of the two outer lists, i.e. document counts, which
# remove_stopwords never changes, so it always printed 0.
tokens_before = sum(len(doc) for doc in data_lemmatized)
tokens_after = sum(len(doc) for doc in data_lemmatized2)
print(tokens_before - tokens_after)

235
0


In [None]:
# Number of documents after the second stop-word pass.
len(data_lemmatized2)

79997

In [None]:
# Spot check for the token "speak" in data_lemmatized, i.e. the lemmas BEFORE
# the second stop-word pass.
# NOTE(review): to verify the filtered result, data_lemmatized2 would be the
# list to inspect — confirm which one was intended.
any("speak" in w for w in data_lemmatized)

True

In [None]:
import gensim.corpora as corpora

# Vocabulary built from the filtered, lemmatized documents
id2word = corpora.Dictionary(data_lemmatized2)

# Documents that feed the model
texts = data_lemmatized2

# Bag-of-words corpus: one doc2bow result per document
corpus = list(map(id2word.doc2bow, texts))

# View the first 30 entries of the first document
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


In [None]:
# Build LDA model
lda_settings = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)
lda_model = gensim.models.LdaMulticore(**lda_settings)

In [None]:
# Print the trained model's summary line.
print(lda_model)

LdaModel(num_terms=34481, num_topics=5, decay=0.5, chunksize=100)


In [None]:
# Pin pyLDAvis to 2.1.2; the visualization cell imports `pyLDAvis.gensim`.
# NOTE(review): newer pyLDAvis releases reportedly expose that module as
# `pyLDAvis.gensim_models` — confirm before unpinning.
!pip install pyLDAvis==2.1.2



In [None]:
import numpy as np
import tqdm
import pyLDAvis.gensim
import pickle 
import pyLDAvis
# Visualize the topics
# enable_notebook() is called before preparing the data so the result renders inline.
pyLDAvis.enable_notebook()
# sort_topics=False is passed explicitly.
# NOTE(review): presumably this keeps pyLDAvis topic order aligned with the
# model's own topic indices — confirm against the pyLDAvis documentation.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word,sort_topics=False)
# Last expression of the cell: displays the prepared visualization.
LDAvis_prepared

  from collections import Iterable


In [None]:
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric

# Top 30 words (with weights) for each of the 5 topics, as formatted strings.
lda_topics = lda_model.show_topics(num_topics=5,num_words=30)
lda_topics
# Disabled snippet: would turn each topic string into a plain word list using
# the gensim preprocessing filters imported above.
# topics = []
# filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]

# for topic in lda_topics:
#     print(topic)
#     topics.append(preprocess_string(topic[1], filters))

[(0,
  '0.011*"scheme" + 0.011*"need" + 0.009*"job" + 0.009*"money" + 0.008*"increase" + 0.008*"sector" + 0.008*"cost" + 0.007*"local" + 0.007*"new" + 0.007*"fund" + 0.006*"financial" + 0.006*"help" + 0.006*"public" + 0.006*"company" + 0.006*"benefit" + 0.006*"pandemic" + 0.006*"measure" + 0.006*"pension" + 0.006*"crisis" + 0.005*"economy" + 0.005*"high" + 0.005*"service" + 0.005*"small" + 0.005*"provide" + 0.005*"funding" + 0.005*"use" + 0.005*"rate" + 0.004*"transport" + 0.004*"worker" + 0.004*"look"'),
 (1,
  '0.014*"trade" + 0.010*"ensure" + 0.009*"important" + 0.009*"world" + 0.008*"need" + 0.008*"future" + 0.008*"new" + 0.008*"part" + 0.008*"continue" + 0.007*"opportunity" + 0.007*"include" + 0.006*"industry" + 0.006*"food" + 0.006*"agreement" + 0.006*"standard" + 0.005*"policy" + 0.005*"area" + 0.005*"international" + 0.005*"agree" + 0.005*"sector" + 0.005*"economic" + 0.005*"free" + 0.005*"market" + 0.005*"set" + 0.004*"nation" + 0.004*"environment" + 0.004*"commitment" + 0.004

In [None]:
# Export the prepared visualization as an HTML file on Google Drive.
LDAVIS_HTML_PATH = '/content/gdrive/My Drive/Text Summarization/UK/Html/UK_tot5.html'
pyLDAvis.save_html(LDAvis_prepared, LDAVIS_HTML_PATH)