In [4]:
import requests

from math import log
from statistics import mean

# import numpy as np
import pandas as pd

import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

import gensim

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hayden\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Hayden\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Hayden\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Import and Preprocess Data

In [None]:
ga = {}
ga[95] = pd.Series(open("95.txt", "r").read().splitlines())
ga[96] = pd.Series(open("96.txt", "r").read().splitlines())

df = {}
for year in ga:
    df[year] = ga[year].to_frame(name="doc")
    df[year]["year"] = year
    df[year]["par"] = df[year].index.values.tolist()

df_f = pd.concat([df_i for df_i in df.values()], ignore_index=True)
df_f


Unnamed: 0,doc,year,par
0,"Let me start by offering you, Mr President, my...",95,0
1,"This is an historic Council, the first fully-e...",95,1
2,In my first policy address to the Legislative ...,95,2
3,The programme I launched in 1992 was built on ...,95,3
4,The first concerns the economy. Hong Kong know...,95,4
...,...,...,...
240,"For me the frustration, the greatest in this j...",96,92
241,My anxiety is this â”€ and I want to stress th...,96,93
242,And what of that truth which more than anythin...,96,94
243,"Hong Kong, it seems to me, has always lived by...",96,95


In [82]:
nltk_sw = nltk.corpus.stopwords.words('english')

def get_wordnet_tag(tag):
    tag_map = {
        "J": nltk.corpus.wordnet.ADJ,
        "N": nltk.corpus.wordnet.NOUN,
        "V": nltk.corpus.wordnet.VERB,
        "R": nltk.corpus.wordnet.ADV
    }
    return tag_map.get(tag[0].upper(), nltk.corpus.wordnet.NOUN)

def get_tokens(text):
    tokens = nltk.RegexpTokenizer("[\w']+").tokenize(text)
    tokens = nltk.pos_tag(tokens)
    tokens = [nltk.stem.WordNetLemmatizer().lemmatize(word, get_wordnet_tag(tag)) for word, tag in tokens]
    tokens = [token.lower() for token in tokens]
    tokens = [word for word in tokens if word not in nltk_sw]
    return tokens

def get_corpus(docs):
    return docs.apply(get_tokens)

# docs must be a list of lists of words
def get_stopwords(corpus, tfidf=False):
    words = {}
    for i, doc in enumerate(corpus):
        for word in doc:
            words[word] = words.get(word, {})
            words[word][i] = (words[word].get(i, 0) + 1) if tfidf else 1

    for word in words:
        if tfidf:
            tf = sum(words[word].values())
            df = len(words[word].values())
            tfidf = tf / df
            words[word] = tfidf
        else:
            words[word] = len(words[word].values()) / len(corpus)
    
    s = pd.Series(words)
    return s[s > .80].sort_values()

def remove_corpus_stopwords(corpus):
    sw = get_stopwords(corpus)
    print("Corpus specific stopwords: ")
    print(sw)
    return corpus.apply(lambda tokens: [word for word in tokens if word not in sw])

docs = df_f["doc"]
corpus = remove_corpus_stopwords(get_corpus(docs))
dictionary = gensim.corpora.Dictionary(corpus)
word_freq = [dictionary.doc2bow(word) for word in corpus]

Corpus specific stopwords: 
Series([], dtype: float64)


In [99]:
corpus = remove_corpus_stopwords(get_corpus(docs))
dictionary = gensim.corpora.Dictionary(corpus)
word_freq = [dictionary.doc2bow(word) for word in corpus]
lda_model = gensim.models.ldamodel.LdaModel(corpus=word_freq,
                                            id2word=dictionary,
                                            num_topics=20,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha="auto",
                                            per_word_topics=True)

topics = []
doc_topic_probs = lda_model.get_document_topics(word_freq)
for index, topic_probs in enumerate(doc_topic_probs):
    topic = sorted(topic_probs, key= lambda x: -x[1])[0][0]
    topics.append(topic)

df_f["topic"] = topics

    

Corpus specific stopwords: 
Series([], dtype: float64)


In [100]:
df_f

Unnamed: 0,doc,year,par,topic
0,"Let me start by offering you, Mr President, my...",95,0,1
1,"This is an historic Council, the first fully-e...",95,1,1
2,In my first policy address to the Legislative ...,95,2,12
3,The programme I launched in 1992 was built on ...,95,3,6
4,The first concerns the economy. Hong Kong know...,95,4,0
...,...,...,...,...
240,"For me the frustration, the greatest in this j...",96,92,1
241,My anxiety is this â”€ and I want to stress th...,96,93,1
242,And what of that truth which more than anythin...,96,94,1
243,"Hong Kong, it seems to me, has always lived by...",96,95,9


In [121]:
import plotly.express as px
px_df = pd.DataFrame(columns=["year", "prop", "topic"])
for year in 95, 96:
    for topic in range(20):
        total = df_f[df_f["year"] == year]
        other = total[total["topic"] == topic]
        num = len(other)
        denom = len(total)
        prop = num / denom

        dummy = pd.DataFrame({"year": year, "prop": prop, "topic": topic}, index=[0])
        px_df = pd.concat([px_df, dummy], ignore_index=True)

px.bar(px_df, x='year', y='prop', color='topic')