In [2]:
# !python -m pip install corextopic
# !python -m pip install networkx

In [31]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as ss
from corextopic import corextopic as ct
from corextopic import vis_topic as vt # jupyter notebooks will complain matplotlib is being loaded twice
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

In [90]:
# Build the pre-COVID corpus: read the Hansard export line by line, keep the
# quoted text of each line, then lower-case, tokenize, remove stop words,
# lemmatize and drop short tokens. The result lives in
# initialDF_preCovid['text_tokens'] as one space-joined string per row.
newline = []

textstring = ''  # not used in this cell; kept in case another cell reads it

# The earlier version of this cell also opened 'newtextfile.txt' in write mode
# without ever writing to it, which truncated that file on every run. Only the
# input file is opened now.
with open('Hansard_preCovid.csv', 'r', encoding='utf-16-le') as inp:
    for line in inp:
        # Greedy match: everything between the first and the last double quote
        # on this line. findall returns a (possibly empty) list per line.
        # NOTE(review): a quoted field that spans several physical lines would
        # be split here -- confirm against the data.
        newline.append(re.findall(r'"(.*)"', line))

temp = pd.Series(newline)

initialDF_preCovid = pd.DataFrame({'Text': temp})

clearWords = stopwords.words('english')
new_Stopwords = ['in', 'the', 'said', 'like','must', 'many', 'also']
clearWords.extend(new_Stopwords)
# Set-based lookup: membership is tested once per token, so avoid scanning the list.
stopword_lookup = frozenset(clearWords)
lemma = WordNetLemmatizer()

# astype(str) turns each per-line list of matches into its string form.
# NOTE(review): that string form includes list brackets/quotes and escape
# sequences; joining the matches instead would be cleaner but changes the corpus.
initialDF_preCovid['Text'] = initialDF_preCovid['Text'].astype(str).str.lower()
# Remove the leading part of each text up to and including the first
# "speaker," (or, failing that, "chair,") form of address.
initialDF_preCovid['Text'] = initialDF_preCovid['Text'].str.replace(
    "|".join([r"^.*?speaker,", r"^.*?chair,"]), ' ', regex=True
)

initialDF_preCovid['text_tokens'] = initialDF_preCovid['Text'].apply(word_tokenize)
initialDF_preCovid['text_tokens'] = initialDF_preCovid['text_tokens'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stopword_lookup]
)
initialDF_preCovid['text_tokens'] = initialDF_preCovid['text_tokens'].apply(
    lambda tokens: [lemma.lemmatize(tok) for tok in tokens]
)
# Keep only lemmas longer than three characters and join them back into a
# single string, which is the input format CountVectorizer expects.
initialDF_preCovid['text_tokens'] = initialDF_preCovid['text_tokens'].apply(
    lambda tokens: ' '.join([tok for tok in tokens if len(tok) > 3])
)

The topic model assumes its input is a doc-word matrix, where rows are documents, columns are words, and each entry is a binary (present/absent) count. We'll vectorize our data, keep at most the 20,000 most frequent words, and store the result as a sparse matrix to save on memory usage. Note that we use binary count vectors as input to the CorEx topic model.

In [91]:
# Binary bag-of-words over the cleaned pre-COVID text: each entry records
# whether a word occurs in a document, capped at 20,000 vocabulary entries.
vectorizer = CountVectorizer(stop_words='english', max_features=20000, binary=True)
preCovid_texts = initialDF_preCovid['text_tokens'].astype(str)
doc_word = ss.csr_matrix(vectorizer.fit_transform(preCovid_texts))
doc_word.shape  # n_docs x m_words


(305, 7778)

In [92]:
# Get words that label the columns (needed to extract readable topics and make anchoring easier)
all_words = list(np.asarray(vectorizer.get_feature_names_out()))

# This cell overwrites doc_word with a column subset, so running it twice would
# index the already-filtered matrix with full-vocabulary positions. Fail with a
# clear message instead of an opaque indexing error.
assert doc_word.shape[1] == len(all_words), (
    "doc_word no longer matches the vectorizer vocabulary; "
    "re-run the vectorizer cell before this one"
)

# Drop purely numeric vocabulary entries; scan the vocabulary once and reuse
# the surviving column indices for both the matrix and the labels.
not_digit_inds = [ind for ind, word in enumerate(all_words) if not word.isdigit()]
doc_word = doc_word[:, not_digit_inds]
words = [all_words[ind] for ind in not_digit_inds]

doc_word.shape  # n_docs x m_words


(305, 7396)

In [43]:
# Train the unsupervised CorEx topic model with 45 topics
# (at most 1200 iterations, fixed seed).
topic_model = ct.Corex(
    n_hidden=45,
    words=words,
    max_iter=1200,
    verbose=False,
    seed=11,
)
topic_model.fit(doc_word, words=words);


In [89]:
# Print one line per pre-COVID topic: "<index>: word1,word2,...".
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Each topic entry is a 3-tuple; only its first field (the word) is printed.
    topic_words, _, _ = zip(*topic)
    print(f'{n}: {",".join(topic_words)}')

0: secretary,parliamentary,original,basis,comment,cent,remark,perspective,extensive,feedback
1: citizenship,resident,refugee,immigration,visa,application,status,improving,permit,recommendation
2: started,general,covered,exactly,caused,successive,discussed,transit,limited,steel
3: important,want,issue,canada,right,house,thing,member,debate,today
4: income,excluding,attendee,collected,output,summary,household,breach,bank,secret
5: support,community,opportunity,economic,life,people,work,help,come,social
6: mean,including,cost,provide,process,public,current,individual,benefit,government
7: event,january,breakdown,visitor,july,list,disappointed,assigned,recipient,opinion
8: witness,stated,brief,reference,recognized,oversight,wonder,task,dream,investigate
9: market,relationship,domestic,implementation,consultation,partnership,operation,force,statement,affair
10: address,funding,high,rate,toronto,young,common,addition,share,greater
11: riding,spending,heritage,culture,associated,energy,relati

In [59]:
# Look at topic 28 of the unsupervised pre-COVID model in more depth,
# requesting up to 60 of its words.
# NOTE(review): entries display as (word, number, number) tuples -- confirm the
# meaning of the two numeric fields against the corextopic documentation.
topic_model.get_topics(topic=28, n_words=60)

[('atlantic', 0.2118757731733022, 1.0),
 ('attract', 0.16952692055457777, 1.0),
 ('pilot', 0.16869213595442475, 1.0),
 ('immigrant', 0.14768737249103145, 1.0),
 ('newcomer', 0.1394224895690516, 1.0),
 ('economy', 0.13398757166665184, 1.0),
 ('skill', 0.1326815412570898, 1.0),
 ('succeed', 0.12517829234269348, 1.0),
 ('success', 0.12402910753850159, 1.0),
 ('strategy', 0.09858500971286509, 1.0),
 ('integrate', 0.09833859800300102, 1.0),
 ('successful', 0.08976984697373606, 1.0),
 ('skilled', 0.07629734250329366, 1.0),
 ('settle', 0.07416477287683357, 1.0),
 ('demographic', 0.0728065191219305, 1.0),
 ('update', 0.06813023204908782, 1.0),
 ('retain', 0.0648212125685644, 1.0),
 ('candidate', 0.06377351682295165, 1.0),
 ('talent', 0.05371150847705499, 1.0),
 ('driven', 0.04697691300293009, 1.0),
 ('reflects', 0.04697691300293009, 1.0),
 ('attracting', 0.04697691300293009, 1.0),
 ('stable', 0.04697606334236756, 1.0),
 ('changing', 0.04226155750538222, 1.0),
 ('broader', 0.041841002732485276,

In [95]:
# Anchored CorEx on the pre-COVID matrix: a single anchor group built from
# words about temporary / seasonal migrant workers.
anchor_words = [['worker', 'season', 'migrant', 'temporary']]

anchored_topic_modelt = ct.Corex(n_hidden=45, seed=10)
anchored_topic_modelt.fit(
    doc_word,
    words=words,
    anchors=anchor_words,
    anchor_strength=3,
)

# Print the topic at each anchor-group index (here only topic 0).
for n, _ in enumerate(anchor_words):
    topic_words, _, _ = zip(*anchored_topic_modelt.get_topics(topic=n))
    print(f'{n} ' + ','.join(topic_words))


0 important,worker,country,including,national,canada,issue,cost,temporary,member


In [97]:
# Show up to 100 words of topic 0 of the anchored pre-COVID model
# (the topic index that the anchor group above was printed under).
anchored_topic_modelt.get_topics(topic=0, n_words=100)

[('important', 0.3279770217456792, 1.0),
 ('worker', 0.3077435214220086, 1.0),
 ('country', 0.26203902728432243, 1.0),
 ('including', 0.25133926392305384, 1.0),
 ('national', 0.23786152180915576, 1.0),
 ('canada', 0.23007565540192806, 1.0),
 ('issue', 0.22830443586014124, 1.0),
 ('cost', 0.21909320644613098, 1.0),
 ('temporary', 0.21835984967742322, 1.0),
 ('member', 0.2163216584236027, 1.0),
 ('think', 0.21154574598563092, 1.0),
 ('house', 0.20963730869874195, 1.0),
 ('debate', 0.20793139471617786, 1.0),
 ('able', 0.20694796379720495, 1.0),
 ('canadian', 0.20536092176354057, 1.0),
 ('public', 0.20437173497488795, 1.0),
 ('reason', 0.2006563235347972, 1.0),
 ('benefit', 0.19084903190669306, 1.0),
 ('better', 0.19071994332578557, 1.0),
 ('term', 0.18559787299445824, 1.0),
 ('order', 0.1833953790058322, 1.0),
 ('access', 0.18339530930989992, 1.0),
 ('thing', 0.1823808387908236, 1.0),
 ('know', 0.1742817502271898, 1.0),
 ('business', 0.1734320319397817, 1.0),
 ('province', 0.1717248481927

## Mid-COVID

In [48]:
# Build the mid-COVID corpus: load the Hansard export, drop the metadata
# columns, then lower-case, tokenize, remove stop words, lemmatize and drop
# short tokens. The result lives in initialDF_midCovid['text_tokens'] as one
# space-joined string per row.
initialDF_midCovid = pd.read_csv('Hansard_mid_covid.csv', sep=',', encoding='utf-16-le')

# DataFrame.drop returns a new frame; the result has to be assigned, otherwise
# the metadata columns silently stay in place.
initialDF_midCovid = initialDF_midCovid.drop(
    ['Publication', 'First Name', 'Last Name', 'Constituency', 'Province', 'Date', 'Time', 'Page'],
    axis=1,
)

clearWords = stopwords.words('english')
new_Stopwords = ['in', 'the', 'said', 'like','must', 'many', 'also']
clearWords.extend(new_Stopwords)
# Set-based lookup: membership is tested once per token, so avoid scanning the list.
stopword_lookup = frozenset(clearWords)
lemma = WordNetLemmatizer()

initialDF_midCovid['Text'] = initialDF_midCovid['Text'].astype(str).str.lower()
# Remove the leading part of each text up to and including the first
# "speaker," (or, failing that, "chair,") form of address.
initialDF_midCovid['Text'] = initialDF_midCovid['Text'].str.replace(
    "|".join([r"^.*?speaker,", r"^.*?chair,"]), ' ', regex=True
)

initialDF_midCovid['text_tokens'] = initialDF_midCovid['Text'].apply(word_tokenize)
initialDF_midCovid['text_tokens'] = initialDF_midCovid['text_tokens'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stopword_lookup]
)
initialDF_midCovid['text_tokens'] = initialDF_midCovid['text_tokens'].apply(
    lambda tokens: [lemma.lemmatize(tok) for tok in tokens]
)
# Keep only lemmas longer than three characters and join them back into a
# single string, which is the input format CountVectorizer expects.
initialDF_midCovid['text_tokens'] = initialDF_midCovid['text_tokens'].apply(
    lambda tokens: ' '.join([tok for tok in tokens if len(tok) > 3])
)

In [49]:
# Binary bag-of-words over the cleaned mid-COVID text, capped at 20,000
# vocabulary entries.
# NOTE: this rebinds vectorizer, doc_word and words, which the pre-COVID cells
# above also use; re-run those cells before going back to the pre-COVID models.
vectorizer = CountVectorizer(stop_words='english', max_features=20000, binary=True)
doc_word = vectorizer.fit_transform(initialDF_midCovid['text_tokens'].astype(str))
doc_word = ss.csr_matrix(doc_word)

# Get words that label the columns (needed to extract readable topics and make anchoring easier)
all_words = list(np.asarray(vectorizer.get_feature_names_out()))
# Drop purely numeric vocabulary entries; scan the vocabulary once and reuse
# the surviving column indices for both the matrix and the labels.
not_digit_inds = [ind for ind, word in enumerate(all_words) if not word.isdigit()]
doc_word = doc_word[:, not_digit_inds]
words = [all_words[ind] for ind in not_digit_inds]

# Shown as the cell's last expression so the filtered shape is actually
# displayed (a bare expression in the middle of a cell produces no output).
doc_word.shape  # n_docs x m_words


In [75]:
# Unsupervised CorEx on the mid-COVID doc-word matrix: 60 topics,
# at most 1200 iterations, fixed seed.
midcovid_unsup = ct.Corex(
    n_hidden=60,
    words=words,
    max_iter=1200,
    verbose=False,
    seed=7,
)
midcovid_unsup.fit(doc_word, words=words);



In [None]:
# Print one line per mid-COVID topic: "<index>: word1,word2,...".
topics = midcovid_unsup.get_topics()
for n, topic in enumerate(topics):
    # Each topic entry is a 3-tuple; only its first field (the word) is printed.
    topic_words, _, _ = zip(*topic)
    print(f'{n}: {",".join(topic_words)}')

In [None]:
# Look at topic 55 of the unsupervised mid-COVID model in more depth,
# requesting up to 100 of its words.
midcovid_unsup.get_topics(topic=55, n_words=100)

In [87]:
# Anchored CorEx on the mid-COVID matrix with a single anchor group.
# NOTE(review): this group uses 'workers' (plural, no 'temporary'), while the
# pre-COVID cell anchors on 'worker'; the preprocessing lemmatizes tokens, so
# confirm that 'workers' is really present in `words`.
anchor_words = [['workers', 'season', 'migrant']]

anchored_midCovidt = ct.Corex(n_hidden=45, seed=10)
anchored_midCovidt.fit(
    doc_word,
    words=words,
    anchors=anchor_words,
    anchor_strength=3,
)

# Print the topic at each anchor-group index (here only topic 0).
for n, _ in enumerate(anchor_words):
    topic_words, _, _ = zip(*anchored_midCovidt.get_topics(topic=n))
    print(f'{n} ' + ','.join(topic_words))

0 migrant,workers,question,season,rising,proposes,floor,flexible,reminded,middle


In [88]:
# Show up to 100 words of topic 0 of the anchored mid-COVID model
# (the topic index that the anchor group above was printed under).
anchored_midCovidt.get_topics(topic =0, n_words=100)

[('migrant', 0.2557059239256563, 1.0),
 ('workers', 0.16562050347123664, 1.0),
 ('question', 0.09220944512256433, 1.0),
 ('season', 0.07535253315843152, 1.0),
 ('rising', 0.07113384842447092, 1.0),
 ('proposes', 0.07113384842447092, 1.0),
 ('floor', 0.07113384842447092, 1.0),
 ('flexible', 0.06664869458465815, 1.0),
 ('reminded', 0.058501999858209294, 1.0),
 ('middle', 0.058501999858209294, 1.0),
 ('inclusion', 0.058501999858209294, 1.0),
 ('experienced', 0.058501999858209294, 1.0),
 ('communities', 0.058501999858209294, 1.0),
 ('legislative', 0.058501999858209294, 1.0),
 ('behaviour', 0.058501999858209294, 1.0),
 ('abuse', 0.058501999858209294, 1.0),
 ('airline', 0.058501999858209294, 1.0),
 ('river', 0.04605090308477814, 1.0),
 ('achieve', 0.04605090308477814, 1.0),
 ('section', 0.04605090308477814, 1.0),
 ('dangerous', 0.04605090308477814, 1.0),
 ('born', 0.04605090308477814, 1.0),
 ('dying', 0.04605090308477814, 1.0),
 ('permanently', 0.04605090308477814, 1.0),
 ('trend', 0.0337738