In [None]:
!pip install contextualized-topic-models==2.3.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contextualized-topic-models==2.3.0
  Downloading contextualized_topic_models-2.3.0-py2.py3-none-any.whl (35 kB)
Collecting gensim>=3.8.3
  Downloading gensim-4.2.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.5 MB/s 
Collecting ipywidgets==7.5.1
  Downloading ipywidgets-7.5.1-py2.py3-none-any.whl (121 kB)
[K     |████████████████████████████████| 121 kB 66.0 MB/s 
Collecting ipython==7.16.3
  Downloading ipython-7.16.3-py3-none-any.whl (783 kB)
[K     |████████████████████████████████| 783 kB 42.2 MB/s 
Collecting sentence-transformers>=1.1.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 563 kB/s 
Collecting jedi<=0.17.2,>=0.10
  Downloading jedi-0.17.2-py2.py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 49.2 MB/s 


In [None]:
%%capture
!pip install pyldavis

In [None]:
import re
import urllib
import gzip
import io
import csv
import random
from collections import defaultdict
from tqdm import tqdm
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
path_before_1990 = '/content/drive/My Drive/titles_before_1990.txt'
path_from_1990_to_2009 = '/content/drive/My Drive/titles_from_1990_to_2009.txt'
path_from_2010 = '/content/drive/My Drive/titles_from_2010.txt'

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# to download the data manually or get more information, go to: https://dblp.org/faq/How+can+I+download+the+whole+dblp+dataset.html
url = 'https://dblp.uni-trier.de/xml/dblp.xml.gz'
# num_titles = 500000  # the (max)number of titles to load 


def load_gzip_file(url):
    """Download a gzip-compressed file and return a decompressing file object.

    Args:
        url: Location of the gzip archive; any URL that
            ``urllib.request.urlopen`` accepts.

    Returns:
        A ``gzip.GzipFile`` backed by an in-memory copy of the download.
        Reading or iterating it yields the decompressed content as ``bytes``.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly instead of relying on another library doing so.
    import urllib.request

    # The whole payload is buffered in memory; release the connection as soon
    # as it has been read.
    with urllib.request.urlopen(url) as response:
        compressed_file = io.BytesIO(response.read())
    return gzip.GzipFile(fileobj=compressed_file)

def extract_titles(input_file, max_num=40000):
    """Extract (title, year) pairs of dblp papers and split them into 3 periods.

    The input is scanned line by line. A line containing ``<title>...</title>``
    starts a candidate; it is paired with the next line that contains
    ``<year>...</year>``. Titles with fewer than three space-separated words
    are ignored.

    Args:
        input_file: Iterable of ``bytes`` lines (UTF-8 encoded dblp XML).
        max_num: Maximum number of pairs collected per time period.

    Returns:
        Three lists of ``(title, year)`` tuples: papers before 1990, papers
        from 1990 to 2009, and papers from 2010 on. Each list holds at most
        ``max_num`` entries.
    """
    pairs_before_1990 = []
    pairs_from_1990_to_2009 = []
    pairs_from_2010 = []
    title = None
    got_title = False
    for line in tqdm(input_file):
        line_str = line.decode('utf-8')
        if got_title:
            # we have a title and check for the corresponding year
            year_result = re.search(r'<year>(.*)</year>', line_str)
            if year_result:
                # we also have the year and thus save the title-year pair
                year = int(year_result.group(1))
                if year < 1990:
                    bucket = pairs_before_1990
                elif year < 2010:
                    bucket = pairs_from_1990_to_2009
                else:
                    bucket = pairs_from_2010
                # enforce the per-period cap; periods that are already full
                # simply drop further papers while the others fill up
                if len(bucket) < max_num:
                    bucket.append((title, year))
                got_title = False
        else:
            # we have no title and search for title
            result = re.search(r'<title>(.*)</title>', line_str)
            if result:
                title = result.group(1)
                # only include titles with at least three words
                got_title = len(title.split(' ')) >= 3

        # stop reading as soon as every period has reached its cap
        if (len(pairs_before_1990) >= max_num
                and len(pairs_from_1990_to_2009) >= max_num
                and len(pairs_from_2010) >= max_num):
            break

    return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

def save_data(pairs, file_path):
    """Write ``pairs`` to ``file_path`` as CSV, one pair per row.

    Args:
        pairs: Iterable of rows (here ``(title, year)`` tuples).
        file_path: Destination path; an existing file is overwritten.
    """
    # newline='' lets the csv module control line endings itself, as the csv
    # documentation requires for files handed to csv.writer.
    with open(file_path, 'w', newline='') as fout:
        csv.writer(fout).writerows(pairs)

# Fetch the dblp dump, split its titles by period, and store one CSV per period.
in_file = load_gzip_file(url)
pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010 = extract_titles(in_file)
for period_pairs, period_path in (
    (pairs_before_1990, path_before_1990),
    (pairs_from_1990_to_2009, path_from_1990_to_2009),
    (pairs_from_2010, path_from_2010),
):
    save_data(period_pairs, period_path)

Mounted at /content/drive


14922037it [00:32, 457292.96it/s]


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Utility classes

In [None]:
# make class to preprocess text data
class PreprocessText:
  """
  DESCRIPTION: Class to normalise a list of texts (here: paper titles).
  Each text is lowercased, stripped of English stop words, lemmatized and
  finally stripped of punctuation, in that order.
  NOTE: This class also deletes stop words
  """
  # NLTK resources are created on first use and then reused for every title,
  # instead of being reloaded once per title.
  _stopwords = None
  _lemmatizer = None

  def preprocess(self, text):
    #This function is used to return the preprocessed text
    #text: list of strings; returns a new list of the same length
    text = [self.__lowercase(title) for title in text]
    text = [self.__remove_stopwords(title) for title in text]
    text = [self.__lemmatize(title) for title in text]
    text = [self.__deletePunctuation(title) for title in text]

    return text

  def __lowercase(self, text):
    #This function is used to convert the text into lowercase
    #for example: Hello,HELLO,hello all are converted into hello
    return text.lower()

  def __remove_stopwords(self, text):
    #This function is used to remove the stopwords from the text
    #stopwords are the words which are not important for the text
    #like a,an,the,etc
    #we are using the english stopwords of the nltk library
    #the list is loaded once and kept as a set for fast membership tests
    if self._stopwords is None:
      self._stopwords = frozenset(nltk.corpus.stopwords.words('english'))
    return ' '.join([word for word in text.split() if word not in self._stopwords])

  def __lemmatize(self, text):
    #This function is used to lemmatize the text
    #every whitespace-separated word is passed through the
    #wordnet lemmatizer of the nltk library
    #the lemmatizer is created once and reused
    if self._lemmatizer is None:
      self._lemmatizer = nltk.WordNetLemmatizer()
    return ' '.join([self._lemmatizer.lemmatize(word) for word in text.split()])

  def __deletePunctuation(self, text):
    #This function is used to remove the punctuations from the text
    #everything that is neither a word character nor whitespace is deleted;
    #digits and underscores count as word characters and are kept.
    #the characters are removed without inserting a space, so
    #"object-oriented" becomes "objectoriented"
    return re.sub(r'[^\w\s]','',text)

PP = PreprocessText()

In [None]:
class LDAModeler:
  def __init__(self, Prepro_object, path_to_file, num_lda_topics,
               num_features=10000, max_iter=5, num_top_words=12, random_state=42):
    """
    DESCRIPTION: Object that makes it simple to apply topic modeling.
    Reads the titles from a CSV file, preprocesses them, fits an LDA model
    on term counts and prints the top words of every topic.
    USE: To use this class, simply initialise it, no need to save it in a variable

    Prepro_object: object with a preprocess(list_of_str) method returning a list of str
    path_to_file: CSV file whose first column holds the titles
    num_lda_topics: number of LDA topics to fit
    num_features: maximum vocabulary size of the count vectorizer
    max_iter: number of LDA training iterations
    num_top_words: number of words printed per topic
    random_state: seed passed to the LDA model
    """
    #define attributes
    self.PP = Prepro_object
    self.path_to_file = path_to_file
    self.num_lda_topics = num_lda_topics
    self.num_top_words = num_top_words

    #make a list with all titles (first CSV column); blank rows are skipped
    #newline='' is what the csv module expects for files it reads
    with open(self.path_to_file, newline='') as fin:
        #note: the reader is unusable once the file is closed; the attribute
        #is only kept so that existing code accessing it does not break
        self.reader = csv.reader(fin)
        self.titles = [row[0] for row in self.reader if row]

    #preprocess all the titles
    self.titles = self.PP.preprocess(self.titles)

    print("------------------------------------")
    print("Preprocessed Titles:")
    print(self.titles[:10])
    print("------------------------------------")

    #setup for Latent Dirichlet Allocation. LDA is fitted on raw term counts
    #(CountVectorizer), not on tf-idf weights.
    self.num_features = num_features
    self.tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=self.num_features, stop_words='english')
    self.tf = self.tf_vectorizer.fit_transform(self.titles)
    self.tf_feature_names = self.tf_vectorizer.get_feature_names_out()

    #train model
    self.lda = LatentDirichletAllocation(
        n_components=self.num_lda_topics,
        max_iter=max_iter,
        learning_method='online',
        random_state=random_state,
    ).fit(self.tf)

    #report output
    print("\n")
    print("------------------------------------")
    print("Topics:")
    for topic_idx, topic in enumerate(self.lda.components_):
      #indices of the highest-weighted words, strongest first
      top_indices = topic.argsort()[::-1][:self.num_top_words]
      top_words = ' '.join(self.tf_feature_names[i] for i in top_indices)
      print(f'Topic {topic_idx}: {top_words}')
    print("------------------------------------")

# Task 1

## Before 1990

In [None]:
LDAModeler(PP, path_before_1990, 5)

#Topic 0 = network analysis
#Topic 1 = systems design
#Topic 2 = topics in AI
#Topic 3 = Data systems and theory
#Topic 4 = pattern recognition 

------------------------------------
Preprocessed Titles:
['object model capability distributed object management', 'distributed object management technology', 'muffin distributed database machine', 'algebraical optimization ftaexpressions', 'wissensrepraumlsentation und maschinelles lernen', 'algebraic characterization stuf', 'zur systemarchitektur von lilog', 'mengenorientierte auswertung von anfragen der logikprogrammiersprache prolog', 'definite resolution constraint language', 'dokumentation der syntax der liloggrammatik']
------------------------------------


------------------------------------
Topics:
Topic 0: algorithm problem networks logic parallel digital chemical computing von processing evaluation circuits
Topic 1: design information approach application programming systems logic graph implementation development adaptive multiple
Topic 2: computer control linear new method software time technique dynamic number optimal comment
Topic 3: systems analysis data theory networ

<__main__.LDAModeler at 0x7f95ee3c7520>

In [None]:
LDAModeler(PP, path_before_1990, 20)

#T0 = Digital Circuits
#T1 = incoherent
#T2 = natural language processing
#T3 = incoherent
#T4 = Data Systems and theory
#T5 = incoherent
#T6 = Theoretical Computer Science
#T7 = Computer Vision
#T8 = Empirical Software Engineering
#T9 = Software engineering
#T10 = Data Structures and Algorithms
#T11 = Discrete Mathematics in Computer Science
#T12 = Data Structures and Algorithms
#T13 = Pattern recognition
#T14 = Survey Analytics
#T15 = Optimization
#T16 = Computer Fundamentals
#T17 = Data Structures and Algorithms
#T18 = incoherent
#T19 = Database Systems

------------------------------------
Preprocessed Titles:
['object model capability distributed object management', 'distributed object management technology', 'muffin distributed database machine', 'algebraical optimization ftaexpressions', 'wissensrepraumlsentation und maschinelles lernen', 'algebraic characterization stuf', 'zur systemarchitektur von lilog', 'mengenorientierte auswertung von anfragen der logikprogrammiersprache prolog', 'definite resolution constraint language', 'dokumentation der syntax der liloggrammatik']
------------------------------------


------------------------------------
Topics:
Topic 0: optimal circuits switching synthesis search bound knowledge graphs comparison editor letter special
Topic 1: control problem approach dynamic detection error performance array applications code models propositional
Topic 2: language implementation use estimation generation test natural robot multiprocessor computational decomposition single
Topic 3: communication chemica

<__main__.LDAModeler at 0x7f95ee3c7dc0>

## Between 1990 and 2009


In [None]:
LDAModeler(PP, path_from_1990_to_2009, 5)

#T0 = Cyber Security
#T1 = Data Systems and Theory
#T2 = Regression
#T3 = Signal Processing
#T4 = Data Structures and Algorithms

------------------------------------
Preprocessed Titles:
['evaluation objectoriented dbms developments 1994 edition', 'darwin incremental migration legacy information system', 'integrating heterogeneous autonomous distributed application using dom prototype', 'integrating objectoriented application middleware relational databases', 'towards transaction management system dom', 'risc object model object system interoperation concept applications', 'metaobject protocol concept risc object model', 'object data language facility multimedia data types', 'object data model facility multimedia data types', 'experiment dispatching distributed object system']
------------------------------------


------------------------------------
Topics:
Topic 0: model robust detection evaluation parallel function graph communication class recognition logic applications
Topic 1: systems analysis design data information study performance scheme structure technology theory computing
Topic 2: control using met

<__main__.LDAModeler at 0x7f95edeac340>

In [None]:
LDAModeler(PP, path_from_1990_to_2009, 20)

#T0 = Machine Learning
#T1 = Data Structures and Algorithms
#T2 = Machine Learning on Medical Data
#T3 = Machine Learning on Medical Data
#T4 = Sustainability and Digitalisation
#T5 = incoherent
#T6 = incoherent
#T7 = Mobile Applications
#T8 = Explainable AI
#T9 = Machine Learning on Video Data 
#T10 = Distributed Systems
#T11 = incoherent
#T12 = incoherent
#T13 = incoherent
#T14 = incoherent
#T15 = Foundations of Data Science
#T16 = Web Development
#T17 = Signal Processing
#T18 = Self Driving Cars
#T19 = Network Analysis

------------------------------------
Preprocessed Titles:
['evaluation objectoriented dbms developments 1994 edition', 'darwin incremental migration legacy information system', 'integrating heterogeneous autonomous distributed application using dom prototype', 'integrating objectoriented application middleware relational databases', 'towards transaction management system dom', 'risc object model object system interoperation concept applications', 'metaobject protocol concept risc object model', 'object data language facility multimedia data types', 'object data model facility multimedia data types', 'experiment dispatching distributed object system']
------------------------------------


------------------------------------
Topics:
Topic 0: networks neural detection evaluation graphs issue sensor internet code representation modelling images
Topic 1: data structure development theory software computing prediction automatic functions engineering change role
Topic 2: identification proc

<__main__.LDAModeler at 0x7f95edeac310>

## From 2010 onwards

In [None]:
LDAModeler(PP, path_from_2010, 5)
#T0 = Efficient Machine Learning
#T1 = Deep Learning
#T2 = Machine Learning
#T3 = Optimization Algorithms
#T4 = Self-Supervised Learning

------------------------------------
Preprocessed Titles:
['spectre attacks exploiting speculative execution', 'computer science curriculum 2013', 'difference productivity impact across different computer science subareas', 'klaus tschira stiftung gemeinnuumltzige gmbh kt', 'catchment classification runoff behaviour selforganizing map som', 'analysis projected hydrological behavior catchment based signature index', 'ear shape biometric identification', 'multithreaded implementation cryptography cryptanalysis', 'privacypreserving authentication wireless access networks', 'private key cryptosystem']
------------------------------------


------------------------------------
Topics:
Topic 0: model network networks detection problem neural efficient wireless analysis mobile framework sensor
Topic 1: data estimation optimization performance hybrid social evaluation online graph approach analysis information
Topic 2: using based adaptive study new time classification prediction state case pr

<__main__.LDAModeler at 0x7f95edec9790>

In [None]:
LDAModeler(PP, path_from_2010, 20)

#T0 = Distributed Systems
#T1 = Theoretical Computer Science
#T2 = 3d Modeling
#T3 = Discrete Mathematics
#T4 = Communication Technology
#T5 = Virtual Reality
#T6 = Computing
#T7 = Real-Time Machine Learning
#T8 = Sensor Technology
#T9 = Machine Learning in Health Care
#T10 = Incoherent
#T11 = Bayesian Statistics
#T12 = Cyber Security
#T13 = Machine Learning in Finance
#T14 = Social Networks
#T15 = Incoherent
#T16 = Statistics
#T17 = Cloud Computing
#T18 = Computer Networks
#T19 = Computer Vision

------------------------------------
Preprocessed Titles:
['spectre attacks exploiting speculative execution', 'computer science curriculum 2013', 'difference productivity impact across different computer science subareas', 'klaus tschira stiftung gemeinnuumltzige gmbh kt', 'catchment classification runoff behaviour selforganizing map som', 'analysis projected hydrological behavior catchment based signature index', 'ear shape biometric identification', 'multithreaded implementation cryptography cryptanalysis', 'privacypreserving authentication wireless access networks', 'private key cryptosystem']
------------------------------------


------------------------------------
Topics:
Topic 0: learning distributed framework management computing distribution approach scheduling impact global architecture cooperative
Topic 1: solution function feature problems robot graphs solving reduction localization edge extraction connectivity
Topic 2: approach new online state 3d service decision proces

<__main__.LDAModeler at 0x7f95edec93a0>

# Task 2

In [None]:
class CTMModeler:
  """
  DESCRIPTION: Object that fits a contextualized topic model (CombinedTM) on
  the titles of a CSV file and prints the top words of every topic.
  USE: To use this class, simply initialise it; all work happens in __init__.
  """

  def __init__(self, Prepro_object, path_to_file, num_ctm_topics,
               embedding_model="all-mpnet-base-v2", contextual_size=768,
               num_epochs=10, num_top_words=12):
    """
    Prepro_object: object with a preprocess(list_of_str) method returning a list of str
    path_to_file: CSV file whose first column holds the titles
    num_ctm_topics: number of topics to fit
    embedding_model: sentence-transformers model used for the contextual embeddings
    contextual_size: embedding dimension of embedding_model (must match the model)
    num_epochs: number of training epochs
    num_top_words: number of words printed per topic
    """
    self.PP = Prepro_object
    self.path_to_file = path_to_file
    self.num_ctm_topics = num_ctm_topics
    self.num_top_words = num_top_words
    self.stopwords = list(nltk.corpus.stopwords.words("english"))

    #make a list with all titles (first CSV column); blank rows are skipped
    #newline='' is what the csv module expects for files it reads
    with open(self.path_to_file, newline='') as fin:
        #note: the reader is unusable once the file is closed; the attribute
        #is only kept so that existing code accessing it does not break
        self.reader = csv.reader(fin)
        self.raw_titles = [row[0] for row in self.reader if row]

    #preprocess all the titles (used for the bag-of-words side only)
    self.titles = self.PP.preprocess(self.raw_titles)

    print("------------------------------------")
    print("Preprocessed Titles:")
    print(self.titles[:10])
    print("------------------------------------")

    self.sp = WhiteSpacePreprocessingStopwords(self.titles, stopwords_list=self.stopwords)
    self.preprocessed_documents, _, self.vocab, self.retained_indices = self.sp.preprocess()

    #the contextual embedder must see the original text, not the stop-word
    #stripped and lemmatized version; retained_indices says which input
    #documents survived preprocessing, so map them back to the raw titles
    self.unpreprocessed_corpus = [self.raw_titles[i] for i in self.retained_indices]

    self.tp = TopicModelDataPreparation(embedding_model)

    self.training_dataset = self.tp.fit(text_for_contextual=self.unpreprocessed_corpus, text_for_bow=self.preprocessed_documents)

    self.ctm = CombinedTM(
        bow_size=len(self.tp.vocab),
        contextual_size=contextual_size,
        n_components=self.num_ctm_topics,
        num_epochs=num_epochs
        )

    self.ctm.fit(self.training_dataset) # run the model

    #report output
    print("\n")
    print("------------------------------------")
    print("Topics:")
    for topic_idx, topic in enumerate(self.ctm.get_topic_lists(self.num_top_words)):
      topic_str = " ".join(topic)
      print(f"Topic {topic_idx}: {topic_str}")
    print("------------------------------------")


## Before 1990

In [None]:
CTMModeler(PP, path_before_1990, 5)

#T0 = incoherent
#T1 = Machine Learning
#T2 = Computer Vision
#T3 = Theoretical Computer Science
#T4 = Software Systems

------------------------------------
Preprocessed Titles:
['object model capability distributed object management', 'distributed object management technology', 'muffin distributed database machine', 'algebraical optimization ftaexpressions', 'wissensrepraumlsentation und maschinelles lernen', 'algebraic characterization stuf', 'zur systemarchitektur von lilog', 'mengenorientierte auswertung von anfragen der logikprogrammiersprache prolog', 'definite resolution constraint language', 'dokumentation der syntax der liloggrammatik']
------------------------------------




Batches:   0%|          | 0/198 [00:00<?, ?it/s]

Epoch: [10/10]	 Seen Samples: [394540/394540]	Train Loss: 32.74007943722337	Time: 0:00:10.348136: : 10it [01:43, 10.40s/it]
Sampling: [20/20]: : 20it [02:54,  8.70s/it]



------------------------------------
Topics:
Topic 0: de zur mit proposed student von period session die copyright hard manufacturing
Topic 1: control linear systems problem note optimal programming time system model method analysis
Topic 2: parallel digital algorithm using pattern recognition sequential image binary memory automatic efficient
Topic 3: logic modal theorem propositional type completeness calculus extension set axiom functions recursive
Topic 4: information computer design data language system software network database structure research management
------------------------------------





<__main__.CTMModeler at 0x7fb8f1577d90>

## Between 1990 and 2009

In [None]:
CTMModeler(PP, path_from_1990_to_2009, 5)

#T0 = Computer Vision
#T1 = Web Development
#T2 = Computer Networks
#T3 = Self-driving Cars
#T4 = Foundations of Data Science

------------------------------------
Preprocessed Titles:
['evaluation objectoriented dbms developments 1994 edition', 'darwin incremental migration legacy information system', 'integrating heterogeneous autonomous distributed application using dom prototype', 'integrating objectoriented application middleware relational databases', 'towards transaction management system dom', 'risc object model object system interoperation concept applications', 'metaobject protocol concept risc object model', 'object data language facility multimedia data types', 'object data model facility multimedia data types', 'experiment dispatching distributed object system']
------------------------------------




Batches:   0%|          | 0/1631 [00:00<?, ?it/s]

Epoch: [10/10]	 Seen Samples: [3260080/3260080]	Train Loss: 37.76361802346791	Time: 0:01:25.529023: : 10it [14:02, 84.23s/it]
Sampling: [20/20]: : 20it [22:52, 68.61s/it]



------------------------------------
Topics:
Topic 0: using image data model analysis based recognition neural classification detection approach feature
Topic 1: information development web software issue research knowledge technology special computer case electronic
Topic 2: networks wireless control performance network adaptive scheme mobile protocol sensor communication channel
Topic 3: overlapping obstacle various spatially precision mutual nonstationary angle merging vibration nonuniform cross
Topic 4: problem nonlinear linear method solution equation problems algorithm class equations numerical optimization
------------------------------------





<__main__.CTMModeler at 0x7fb8f6a5dd00>

## From 2010 onwards

In [None]:
CTMModeler(PP, path_from_2010, 5)

#T0 = Incoherent
#T1 = Optimization
#T2 = Computer Networks
#T3 = Deep Learning
#T4 = Applications of AI

------------------------------------
Preprocessed Titles:
['spectre attacks exploiting speculative execution', 'computer science curriculum 2013', 'difference productivity impact across different computer science subareas', 'klaus tschira stiftung gemeinnuumltzige gmbh kt', 'catchment classification runoff behaviour selforganizing map som', 'analysis projected hydrological behavior catchment based signature index', 'ear shape biometric identification', 'multithreaded implementation cryptography cryptanalysis', 'privacypreserving authentication wireless access networks', 'private key cryptosystem']
------------------------------------




Batches:   0%|          | 0/4097 [00:00<?, ?it/s]

Epoch: [10/10]	 Seen Samples: [8192460/8192460]	Train Loss: 45.190276543454765	Time: 0:03:33.458686: : 10it [35:46, 214.64s/it]
Sampling: [20/20]: : 20it [56:36, 169.83s/it]




------------------------------------
Topics:
Topic 0: study research case social development technology review software special information issue use
Topic 1: control nonlinear problem linear system method fuzzy equation solution stochastic stability adaptive
Topic 2: networks wireless sensor energy scheme power communication allocation distributed scheduling efficient routing
Topic 3: image using detection classification learning based deep neural feature recognition network segmentation
Topic 4: quantization cascade cascaded noisy weakly optimizer nonstationary kmeans inversion covariance angle vessel
------------------------------------


<__main__.CTMModeler at 0x7fb8076fda00>