# 🗣️ Topic Modelling: Going Beyond Tokens

Applying keyword extraction techniques alongside topic modelling to give each topic a meaningful name.

In [16]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [17]:
# Load the paper titles. The file has no header row, so pandas labels the
# single column with the integer 0.

df = pd.read_csv("research_paper_titles.csv", header=None)

# preview the first ten rows
display(df.head(10))

Unnamed: 0,0
0,Innovation in Database Management: Computer Sc...
1,High performance prime field multiplication fo...
2,enchanted scissors: a scissor interface for su...
3,Detection of channel degradation attack by Int...
4,Pinning a Complex Network through the Betweenn...
5,Analysis and Design of Memoryless Interconnect...
6,Dynamic bluescreens.
7,A Quantitative Assured Forwarding Service.
8,Automatic sanitization of social network data ...
9,A &#916;&#931; IR-UWB radar with sub-mm rangin...


In [18]:
# Normalise the titles into column 1: lowercase the text, replace punctuation
# with spaces, collapse runs of spaces and trim both ends of the string.
#
# regex=True is passed explicitly: pandas >= 2.0 treats str.replace patterns as
# literal text by default, which would silently leave the titles uncleaned.
# Raw strings avoid Python's invalid-escape warnings for "\w" and "\s".
#
# NOTE(review): HTML entities such as "&#916;" are reduced to bare digit tokens
# ("916") by this step - consider decoding them (html.unescape) beforehand.

df[1] = (
    df[0]
    .str.lower()
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r' +', ' ', regex=True)
    .str.strip()
)

display(df.head(10))

Unnamed: 0,0,1
0,Innovation in Database Management: Computer Sc...,innovation in database management computer sci...
1,High performance prime field multiplication fo...,high performance prime field multiplication fo...
2,enchanted scissors: a scissor interface for su...,enchanted scissors a scissor interface for sup...
3,Detection of channel degradation attack by Int...,detection of channel degradation attack by int...
4,Pinning a Complex Network through the Betweenn...,pinning a complex network through the betweenn...
5,Analysis and Design of Memoryless Interconnect...,analysis and design of memoryless interconnect...
6,Dynamic bluescreens.,dynamic bluescreens
7,A Quantitative Assured Forwarding Service.,a quantitative assured forwarding service
8,Automatic sanitization of social network data ...,automatic sanitization of social network data ...
9,A &#916;&#931; IR-UWB radar with sub-mm rangin...,a 916 931 ir uwb radar with sub mm ranging cap...


In [19]:
# Split every cleaned title in column 1 into a list of word tokens,
# overwriting the column with the token lists.

df[1] = df[1].map(nltk.word_tokenize)

# preview the first ten rows
display(df.head(10))

Unnamed: 0,0,1
0,Innovation in Database Management: Computer Sc...,"[innovation, in, database, management, compute..."
1,High performance prime field multiplication fo...,"[high, performance, prime, field, multiplicati..."
2,enchanted scissors: a scissor interface for su...,"[enchanted, scissors, a, scissor, interface, f..."
3,Detection of channel degradation attack by Int...,"[detection, of, channel, degradation, attack, ..."
4,Pinning a Complex Network through the Betweenn...,"[pinning, a, complex, network, through, the, b..."
5,Analysis and Design of Memoryless Interconnect...,"[analysis, and, design, of, memoryless, interc..."
6,Dynamic bluescreens.,"[dynamic, bluescreens]"
7,A Quantitative Assured Forwarding Service.,"[a, quantitative, assured, forwarding, service]"
8,Automatic sanitization of social network data ...,"[automatic, sanitization, of, social, network,..."
9,A &#916;&#931; IR-UWB radar with sub-mm rangin...,"[a, 916, 931, ir, uwb, radar, with, sub, mm, r..."


In [20]:
# Start from NLTK's English stop-word list.

stop_words = stopwords.words('english')

# Extend it with single letters and additional filler terms. Duplicates (within
# this list or against NLTK's) are harmless for the membership test below.

stop_words.extend(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l','m','n','o','p','q','r','s','t', 'u', 'v', 'w', 'x', 'y', 'z', "about", "across", "after", "all", "also", "an", "and", "another", "added",
"any", "are", "as", "at", "basically", "be", "because", 'become', "been", "before", "being", "between","both", "but", "by","came","can","come","could","did","do","does","each","else","every","either","especially", "for","from","get","given","gets",
'give','gives',"got","goes","had","has","have","he","her","here","him","himself","his","how","if","in","into","is","it","its","just","lands","like","make","making", "made", "many","may","me","might","more","most","much","must","my","never","provide", 
"provides", "perhaps","no","now","of","on","only","or","other", "our","out","over","re","said","same","see","should","since","so","some","still","such","seeing", "see", "take","than","that","the","their","them","then","there",
"these","they","this","those","through","to","too","under","up","use","using","used", "underway", "very","want","was","way","we","well","were","what","when","where","which","while","whilst","who","will","with","would","you","your", 
'etc', 'via', 'eg']) 

# Remove stop words from every token list. Membership is tested against a set
# rather than the list so each lookup is constant-time instead of a linear
# scan; `stop_words` itself stays a list for any later use.

stop_word_set = set(stop_words)

df[1] = df[1].apply(lambda tokens: [token for token in tokens if token not in stop_word_set])

display(df.head(10))

Unnamed: 0,0,1
0,Innovation in Database Management: Computer Sc...,"[innovation, database, management, computer, s..."
1,High performance prime field multiplication fo...,"[high, performance, prime, field, multiplicati..."
2,enchanted scissors: a scissor interface for su...,"[enchanted, scissors, scissor, interface, supp..."
3,Detection of channel degradation attack by Int...,"[detection, channel, degradation, attack, inte..."
4,Pinning a Complex Network through the Betweenn...,"[pinning, complex, network, betweenness, centr..."
5,Analysis and Design of Memoryless Interconnect...,"[analysis, design, memoryless, interconnect, e..."
6,Dynamic bluescreens.,"[dynamic, bluescreens]"
7,A Quantitative Assured Forwarding Service.,"[quantitative, assured, forwarding, service]"
8,Automatic sanitization of social network data ...,"[automatic, sanitization, social, network, dat..."
9,A &#916;&#931; IR-UWB radar with sub-mm rangin...,"[916, 931, ir, uwb, radar, sub, mm, ranging, c..."


In [21]:
# Create the WordNet lemmatiser from nltk.

wordnet_lemmatizer = WordNetLemmatizer()

# Replace every token in column 1 with its lemma, keeping one list per title.

df[1] = df[1].map(lambda tokens: list(map(wordnet_lemmatizer.lemmatize, tokens)))

# preview the first ten rows
display(df.head(10))

Unnamed: 0,0,1
0,Innovation in Database Management: Computer Sc...,"[innovation, database, management, computer, s..."
1,High performance prime field multiplication fo...,"[high, performance, prime, field, multiplicati..."
2,enchanted scissors: a scissor interface for su...,"[enchanted, scissors, scissor, interface, supp..."
3,Detection of channel degradation attack by Int...,"[detection, channel, degradation, attack, inte..."
4,Pinning a Complex Network through the Betweenn...,"[pinning, complex, network, betweenness, centr..."
5,Analysis and Design of Memoryless Interconnect...,"[analysis, design, memoryless, interconnect, e..."
6,Dynamic bluescreens.,"[dynamic, bluescreens]"
7,A Quantitative Assured Forwarding Service.,"[quantitative, assured, forwarding, service]"
8,Automatic sanitization of social network data ...,"[automatic, sanitization, social, network, dat..."
9,A &#916;&#931; IR-UWB radar with sub-mm rangin...,"[916, 931, ir, uwb, radar, sub, mm, ranging, c..."


In [22]:
# Initialise the count vectorizer; ngram_range = (1, 2) makes it count both
# single words and two-word phrases.

vectorizer = CountVectorizer(analyzer = 'word', ngram_range = (1, 2))

# Join each title's token list back into one string for the vectorizer.
# Iterating the column directly avoids the per-row Series construction that
# DataFrame.iterrows() performs, and yields the same list of strings.

vectors = [", ".join(tokens) for tokens in df[1]]

# Learn the vocabulary and build the document-term count matrix.

vectorised = vectorizer.fit_transform(vectors)

print(vectorised)

  (0, 8018)	1
  (0, 4092)	1
  (0, 9224)	1
  (0, 3321)	1
  (0, 13901)	1
  (0, 5656)	1
  (0, 8020)	1
  (0, 4112)	1
  (0, 9227)	1
  (0, 3329)	1
  (0, 13903)	1
  (1, 7361)	1
  (1, 11595)	1
  (1, 12200)	1
  (1, 6240)	1
  (1, 10331)	1
  (1, 7062)	1
  (1, 7380)	1
  (1, 11621)	1
  (1, 12201)	1
  (1, 6247)	1
  (1, 10333)	1
  (2, 5582)	1
  (2, 13911)	1
  (2, 13909)	1
  :	:
  (2504, 4458)	1
  (2504, 16160)	1
  (2504, 4108)	1
  (2504, 6052)	1
  (2504, 6053)	1
  (2504, 8196)	1
  (2504, 4555)	1
  (2505, 6660)	1
  (2505, 2608)	1
  (2505, 16288)	1
  (2505, 13932)	1
  (2505, 10991)	1
  (2505, 1308)	1
  (2505, 2609)	1
  (2505, 10853)	1
  (2505, 16293)	1
  (2505, 10855)	1
  (2505, 13935)	1
  (2505, 10992)	1
  (2505, 6664)	1
  (2506, 12878)	1
  (2506, 16264)	1
  (2506, 4699)	1
  (2506, 16267)	1
  (2506, 12880)	1


In [23]:
# Initialise the LDA model.

lda_model = LatentDirichletAllocation(n_components = 10, # number of topics
                                  random_state = 10,          # random state
                                  evaluate_every = -1,      # compute perplexity every n iters, default: Don't
                                  n_jobs = -1,              # Use all available CPUs
                                 )

# Fit the model and get one row of topic weights per document.

lda_output = lda_model.fit_transform(vectorised)

# Column names: Topic1 .. TopicN (1-based).

topic_names = ["Topic" + str(i) for i in range(1, lda_model.n_components + 1)]

# Document-topic table; weights are rounded to 2 decimals for display only.

df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns = topic_names)

# Dominant topic (1-based) for each document. The argmax is taken over the
# unrounded weights: after rounding, two close topics can tie and argmax would
# then pick whichever comes first instead of the genuinely larger one.

dominant_topic = np.argmax(lda_output, axis=1) + 1
df_document_topic['Dominant_topic'] = dominant_topic

# Join to the original dataframe on the row index. Topic columns left over from
# a previous run of this cell are dropped first, so re-running it does not
# produce duplicated "_x"/"_y" columns.

df = df.drop(columns = list(df_document_topic.columns), errors = 'ignore')
df = pd.merge(df, df_document_topic, left_index = True, right_index = True, how = 'outer')
display(df.head(10))

Unnamed: 0,0,1,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Dominant_topic
0,Innovation in Database Management: Computer Sc...,"[innovation, database, management, computer, s...",0.92,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,1
1,High performance prime field multiplication fo...,"[high, performance, prime, field, multiplicati...",0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.92,0.01,0.01,8
2,enchanted scissors: a scissor interface for su...,"[enchanted, scissors, scissor, interface, supp...",0.01,0.94,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,2
3,Detection of channel degradation attack by Int...,"[detection, channel, degradation, attack, inte...",0.01,0.01,0.01,0.94,0.01,0.01,0.01,0.01,0.01,0.01,4
4,Pinning a Complex Network through the Betweenn...,"[pinning, complex, network, betweenness, centr...",0.01,0.01,0.01,0.01,0.01,0.92,0.01,0.01,0.01,0.01,6
5,Analysis and Design of Memoryless Interconnect...,"[analysis, design, memoryless, interconnect, e...",0.01,0.01,0.01,0.92,0.01,0.01,0.01,0.01,0.01,0.01,4
6,Dynamic bluescreens.,"[dynamic, bluescreens]",0.03,0.77,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,2
7,A Quantitative Assured Forwarding Service.,"[quantitative, assured, forwarding, service]",0.01,0.01,0.01,0.01,0.89,0.01,0.01,0.01,0.01,0.01,5
8,Automatic sanitization of social network data ...,"[automatic, sanitization, social, network, dat...",0.01,0.01,0.01,0.01,0.01,0.01,0.94,0.01,0.01,0.01,7
9,A &#916;&#931; IR-UWB radar with sub-mm rangin...,"[916, 931, ir, uwb, radar, sub, mm, ranging, c...",0.0,0.0,0.0,0.0,0.0,0.0,0.97,0.0,0.0,0.0,7


In [24]:
# Vocabulary learned by the vectorizer, indexed by feature column.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on older versions that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Cast to a str array so the dtype matches what the list-returning API gave.
keywords = np.array(feature_names, dtype=str)

# For every topic, collect the 20 terms with the largest weights.
topic_keywords = []

for topic_weights in lda_model.components_:
    # negate so that argsort's ascending order puts the largest weights first
    top_keyword_locs = (-topic_weights).argsort()[:20]
    topic_keywords.append(keywords.take(top_keyword_locs))

print(topic_keywords)

[array(['based', 'web', 'network', 'data', 'efficient', 'system', 'design',
       'power', 'converter', 'video', 'filter', 'semantic', 'application',
       'query', 'analysis', 'database', 'low', 'model', 'control', 'high'],
      dtype='<U32'), array(['network', 'data', 'based', 'system', 'analysis', 'high', 'power',
       'search', 'dynamic', 'wireless', 'aware', 'voltage', 'low',
       'level', 'time', 'current', 'linear', 'mode', 'approach',
       'algorithm'], dtype='<U32'), array(['network', 'based', 'design', 'application', 'model', 'wireless',
       'search', 'efficient', 'multi', 'video', 'time', 'sensor', 'data',
       'architecture', 'analysis', 'coding', 'high', 'query', 'cmos',
       'web'], dtype='<U32'), array(['network', 'ad', 'hoc', 'ad hoc', 'hoc network', 'data', 'based',
       'algorithm', 'wireless', 'system', 'design', 'multi', 'analysis',
       'query', 'low', 'routing', 'multiple', 'sensor', 'control', 'bit'],
      dtype='<U32'), array(['peer', 'netwo

In [25]:
# 