# Topic modelling
## Unsupervised learning of topics in a text
### using Latent Dirichlet Allocation (via sklearn)
Topic modelling can be thought of as dimensionality reduction:  
Documents are represented as sets of topics  
Each topic has a weight

In [22]:
import re
import pandas as pd
import sklearn
import csv
import nltk
import string
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# use CountVectorizer to turn the docs into vectors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [3]:
# create the stemmer
# Shared English Snowball stemmer: get_stopwords and tokenize_and_stem both
# read this module-level name, so this cell must run before they are called.
stemmer = SnowballStemmer('english')

In [8]:
# helper functions
# Path of the stopword CSV; also used as the default argument of get_stopwords.
stopwords_file_path = "stopwords.csv"

def read_in_csv(csv_file):
    """Read a comma-separated file and return its rows.

    Args:
        csv_file: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list with one entry per row; each entry is the list of string
        fields parsed from that row.
    """
    # newline='' leaves newline handling to the csv module, so line breaks
    # embedded in quoted fields are not rewritten by the text layer.
    with open(csv_file, 'r', encoding='utf-8', newline='') as fp:
        reader = csv.reader(fp, delimiter=',', quotechar='"')
        return list(reader)

def get_stopwords(path=stopwords_file_path):
    """Load the stopword list and extend it with stemmed variants.

    Args:
        path: CSV file whose first column holds one stopword per row.
            Defaults to the module-level ``stopwords_file_path``.

    Returns:
        A list containing every stopword from the file followed by the
        stemmed form of each of them (duplicates are not removed).
    """
    rows = read_in_csv(path)
    # csv.reader yields an empty list for a blank line; skip those rows so a
    # stray or trailing empty line does not raise IndexError on row[0].
    raw_words = [row[0] for row in rows if row]
    stemmed_words = [stemmer.stem(word) for word in raw_words]
    return raw_words + stemmed_words

def tokenize_and_stem(sentence):
    """Tokenise ``sentence`` and return the stems of the tokens worth keeping.

    A token is dropped when it appears in the module-level ``stopwords``,
    when it is found in ``string.punctuation``, or when it contains no
    ASCII letter. The surviving tokens are stemmed with the shared stemmer
    and returned in their original order.
    """
    stems = []
    for token in nltk.word_tokenize(sentence):
        if token in stopwords:
            continue
        if token in string.punctuation:
            continue
        if not re.search('[a-zA-Z]', token):
            continue
        stems.append(stemmer.stem(token))
    return stems

In [18]:
# if you're on colab upload the data files
# The google.colab package only exists inside Colab. Elsewhere, skip the
# upload and rely on bbc-text.csv and stopwords.csv already being present
# in the working directory.
try:
    from google.colab import files
except ImportError:
    files = None

uploaded = files.upload() if files is not None else {}

Saving bbc-text.csv to bbc-text.csv
Saving stopwords.csv to stopwords.csv


In [19]:
# read in our data
# `stopwords` has to exist as a module-level name: tokenize_and_stem and
# create_count_vectorizer read it as a global instead of taking an argument.
stopwords = get_stopwords(stopwords_file_path)
bbc_dataset = "bbc-text.csv"

### We’ll use a public dataset from the BBC consisting of 2,225 articles  
Each article is labeled under one of 5 categories: business, entertainment, politics, sport or tech

In [10]:
# turn the documents into vectors
def create_count_vectorizer(documents):
    """Fit a bag-of-words vectorizer on ``documents``.

    The vectorizer is configured with the module-level ``stopwords`` list,
    the ``tokenize_and_stem`` tokenizer and a cap of 1500 features.

    Returns:
        A ``(vectorizer, matrix)`` tuple: the fitted CountVectorizer and the
        result of calling ``fit_transform`` on ``documents``.
    """
    vectorizer_options = {
        'stop_words': stopwords,
        'tokenizer': tokenize_and_stem,
        'max_features': 1500,
    }
    vec = CountVectorizer(**vectorizer_options)
    doc_term_matrix = vec.fit_transform(documents)
    return vec, doc_term_matrix

In [11]:
# remove unwanted characters (keep just words and spaces)
def clean_data(df):
    """Return a copy of ``df`` whose 'text' column keeps only words and spaces.

    Every character that is neither a word character nor whitespace is
    replaced by a single space, and every digit is then removed. The frame
    passed in is left untouched, so re-running the calling cell always starts
    from the same input.

    Args:
        df: DataFrame with a string column named 'text'.

    Returns:
        A new DataFrame with the cleaned 'text' column; all other columns
        are carried over unchanged.
    """
    def _strip_noise(text):
        # Punctuation becomes a space (not '') so adjacent words stay separated.
        without_punctuation = re.sub(r'[^\w\s]', ' ', text)
        return re.sub(r'\d', '', without_punctuation)

    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(text=df['text'].apply(_strip_noise))

In [12]:
# create the LDA model (note that usually num_topics is unknown)
def create_and_fit_lda(data, num_topics, random_state=42):
    """Create an LDA model with ``num_topics`` components and fit it to ``data``.

    Args:
        data: Document-term matrix to fit on (e.g. the output of
            ``create_count_vectorizer``).
        num_topics: Number of topics, passed to LDA as ``n_components``.
        random_state: Seed forwarded to LDA so that repeated runs give the
            same topics. Pass ``None`` to restore unseeded behaviour.

    Returns:
        The fitted LDA model.
    """
    # LDA fitting is stochastic; without a fixed seed the topic numbering and
    # top words change between runs, which makes the notebook irreproducible.
    lda = LDA(n_components=num_topics, n_jobs=-1, random_state=random_state)
    lda.fit(data)
    return lda

In [13]:
# identify & print the most common topic words
def get_most_common_words_for_topics(model, vectorizer, n_top_words):
    """Map each topic index to its ``n_top_words`` highest-weighted terms.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of term
            weights per topic).
        vectorizer: Fitted vectorizer providing the vocabulary through
            ``get_feature_names_out()`` (or the older ``get_feature_names()``).
        n_top_words: How many terms to keep per topic.

    Returns:
        A dict ``{topic_index: [term, ...]}`` with terms ordered from the
        largest weight to the smallest.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()
    word_dict = {}
    for topic_index, topic in enumerate(model.components_):
        # argsort is ascending, so slice backwards from the end to take the
        # n_top_words largest weights, heaviest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        word_dict[topic_index] = [words[i] for i in top_indices]
    return word_dict

def print_topic_words(word_dict):
    """Print every topic key followed by its tab-indented word list."""
    for topic_id, topic_terms in word_dict.items():
        print(f"Topic {topic_id}")
        print("\t", topic_terms)

In [20]:
# read in the data, clean it, get text
df = pd.read_csv(bbc_dataset)
df = clean_data(df)  # punctuation -> spaces, digits removed from the 'text' column
documents = df['text']

# set number of topics (note that usually this is unknown)
# 5 matches the number of labelled categories in this dataset.
number_topics = 5

In [23]:
# create vectorizer & model
# `data` is the document-term count matrix returned by CountVectorizer.fit_transform.
(vectorizer, data) = create_count_vectorizer(documents)
lda = create_and_fit_lda(data, number_topics)



In [24]:
# inspect the contents of the topics
# Show the 10 highest-weighted terms per topic (terms are stems, not full words).
topic_words = get_most_common_words_for_topics(lda, vectorizer, 10)
print_topic_words(topic_words)

Topic 0
	 ['use', 'peopl', 'game', 'mobil', 'technolog', 'phone', 'servic', 'music', 'new', 'user']
Topic 1
	 ['govern', 'countri', 'peopl', 'new', 'year', 'say', 'work', 'report', 'world', 'uk']
Topic 2
	 ['year', 'm', 'film', 'play', 'best', 'game', 'win', 'first', 'time', 'award']
Topic 3
	 ['labour', 'elect', 'parti', 'say', 'blair', 'govern', 'minist', 'tori', 'peopl', 'brown']
Topic 4
	 ['year', 'bn', 'compani', 'm', 'market', 'firm', 'sale', 'price', 'share', 'bank']




In [27]:
def test_new_example(lda, vect, example):
    vectorized = vect.transform([example])
    topic = lda.transform(vectorized)
    print(topic)
    return topic

In [26]:
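# BBC news article (sport) used as an unseen example.
# NOTE: the next cell reassigns new_example, so score this one before running it.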
new_example = """Gareth Southgate says England's situation is "more complicated than any other country" after announcing a 33-man provisional squad for Euro 2020.
The England manager must name a 26-man squad by 1 June.
Manchester United, Manchester City and Chelsea play European finals this week.
"There are 12 players still to play so we're always going to need additional players and added to that we have some injuries at different stages, that we have very little info about," he said.
"We felt more time will help us make better decisions. Our preference was to name the 26, but we have not got an ideal hand of cards - a lot of unknowns.
"Info and evidence are very important and we will have a lot more in the next seven days."
White and Godfrey among uncapped quartet in 33-man provisional England squad"""

In [29]:
# one of my blog posts (this overwrites the BBC article assigned to new_example in the previous cell)
new_example = """Last week I was honored and privileged to present at the ISKO UK meetup on the topic of ‘Searching, fast and slow’. This talk was a slightly updated version of the one I gave at Search Solutions 2020, in which I presented the case for a transformation of the systematic searching paradigm from the attributes on the left (which perpetuate ‘slow thinking’) to the attributes on the right (which facilitate ‘fast thinking’):
Procedural → Declarative
Static → Interactive
Monolithic → Executable
Strings → Objects
In that respect this talk aligns with the argument I presented at Search Solutions, but what is novel this time was the discussion afterwards: in particular, the suggestion that we could take the analogy further by exploring other software engineering concepts and practices that could inject further rigour, transparency and reproducibility into the systematic search/review process. What follows isn’t meant to be an exhaustive list, but rather an initial set, along with my thoughts on how they might potentially be applied (or have existing parallels) in the world of structured searching:
Concept	Definition	Systematic searching equivalent
Design patterns	General, reusable solutions to commonly occurring problems	Facet analysis schemas? E.g. PICO, SPICE, SPIDER, CIMO etc.
Continuous integration	The practice where members of a team integrate their work frequently into a shared master copy	Living systematic reviews? The review is updated frequently, and usually published as online-only 
Git	Tooling for version control: tracking changes in files, coordinating work among team members	Currently no direct equivalent, but search strategies would benefit from version control and auditability
Github	Internet hosting for development and version control	PRESS forum? Currently no direct equivalent, but search strategy development would benefit from open access, cloud-based, reproducible solutions
Containers	A way to package up code and its dependencies so the application runs reliably from one environment to another	Currently no direct equivalent, but search strategies would benefit from portability across databases
StackOverflow	A community question and answer site to share solutions and best practices	Expert searching mailing list? ISSG search filters resource? Currently no direct equivalent, but search strategy development would benefit from reusable solutions and knowledge sharing
Feel free to add to this list, they are just my initial thoughts. I should also give credit to the ISKO community for being the catalyst for this: I think it’s an intriguing thread, and one to which I will give further thought. In the meantime, my slides are attached below."""

In [30]:
# Score whichever text was most recently assigned to new_example.
test_new_example(lda, vectorizer, new_example)

[[0.68228456 0.31348018 0.00142233 0.00141268 0.00140025]]


array([[0.68228456, 0.31348018, 0.00142233, 0.00141268, 0.00140025]])

### Try different values of N (the number of topics, `number_topics`)