In [2]:
import pandas as pd
import numpy as np
import re
import gensim
# import stop_words

from gensim import corpora
from gensim import models
from gensim.corpora.dictionary import Dictionary

from gensim.parsing.preprocessing import STOPWORDS

from time import time

import string

import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)

# Reading the data

The data is stored in the following files, one per each framework programme (the links point to the original sources in the [European Union Open Data Portal](https://data.europa.eu/euodp/en/data/):
  * [`EU_raw_data/cordisfp4complete.csv`](http://cordis.europa.eu/data/cordisfp4complete.csv)
  * [`EU_raw_data/cordis-fp5projects.csv`](http://cordis.europa.eu/data/cordis-fp5projects.csv)
  * [`EU_raw_data/cordis-fp6projects.csv`](http://cordis.europa.eu/data/cordis-fp6projects.csv)
  * [`EU_raw_data/cordis-fp7projects.csv`](http://cordis.europa.eu/data/cordis-fp7projects.csv)
  * [`EU_raw_data/cordis-h2020projects.csv`](http://cordis.europa.eu/data/cordis-h2020projects.csv)
  
We read each one of them in turn. Note that we will be using the objectives column, which has several empty values, so we have to ensure that it is treated as string.

In [5]:
dataFP4 = pd.read_csv("EU_raw_data/cordisfp4complete.csv", sep=";", converters = {'Objectives': str})
dataFP4.head(2)

Unnamed: 0,rcn,title,Start Date,End Date,Duration,Status,Contract Number,Keywords,Date of Signature,Total Cost,...,General Information,Achievements,objective,Activity Area,Contract Type,Subject,Framework Programme,PGA,Coordinator Country,Contractor Country
0,29005,Spot IV-V?g?tation,01/04/1995,31/07/1997,,Completed,ENV4950001,,,,...,,,,Research and development work for potential fu...,CSC - Cost-sharing contracts,Environmental Protection; Forecasting; Meteoro...,Fourth Framework Programme,FP4-ENV 2C,FR,
1,30802,Formation and occurrence of nitrous acd in the...,01/02/1996,31/07/1998,,Completed,ENV4950055,,,,...,%L Nitrous acid is of particular importance in...,,%LTo understand the mechanisms leading to the ...,Tropospheric physics and chemistry,CSC - Cost-sharing contracts,Environmental Protection; Forecasting; Measure...,Fourth Framework Programme,FP4-ENV 2C,IT,CH; DE; DE; DE; DE; DK; FR; GB; GR


In [6]:
dataFP5 = pd.read_csv("EU_raw_data/cordis-fp5projects.csv", sep=";", converters = {'objective': str})
dataFP5.head(2)

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,64570,QLK5-CT-2001-00934,GEDIFLUX,,FP5-LIFE QUALITY,1.1.1.-5.,FP5-LIFE QUALITY,Genetic diversity in agriculture: temporal flu...,2001-10-01,2004-09-30,...,The overall objective of this project is to de...,3039812.0,1846635.0,,CSC,NIAB,UK,JOHN INNES CENTRE;INSTITUTE OF PLANT GENETICS ...,UK;DE;FR;NL,ECO;SEA;LIF;ENV;AGR
1,64192,QLK3-CT-2001-00278,NANOCELL,,FP5-LIFE QUALITY,1.1.1.-3.,FP5-LIFE QUALITY,Sensing and controlling single molecules by no...,2002-01-01,2004-12-31,...,This project concerns controlling and sensing ...,2633658.0,1853271.0,,CSC,GOETEBORG UNIVERSITY,SE,UNIVERSITY OF GLASGOW;FRAUNHOFER IAF;LGC LIMIT...,UK;DE;SE;CH,BIO;LIF;ENV;MED;WAS;ITT


In [7]:
dataFP6 = pd.read_csv("EU_raw_data/cordis-fp6projects.csv", sep=";", converters = {'objective': str})
dataFP6.head(2)

Unnamed: 0,rcn,reference,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,71920,4182,AMIGO,,FP6-IST,IST-2002-2.3.2.3,,Amigo Ambient Intelligence for the networked h...,2004-09-01,2008-02-29,...,The networked home environment leads to many n...,2403487100,1296000000,FP6-2003-IST-2,IP,PHILIPS ELECTRONICS NEDERLAND B.V.,NL,SINGKIOULAR LOTZIK ANONYMI ETAIRIA PLIROFORIAK...,EL;NL;FR;IT;ES;DE;FI,IPS
1,85502,36495,GENRISK-T,,FP6-EURATOM-RADPROT,RAD PROT-2005/6-3.3.1.1-2,,Genetic component of the low dose risk of thyr...,2006-12-01,2010-09-30,...,Cancer of the non-medullary (follicular epithe...,4168377,2765453,EURATOM-2005-6-FIXEDDEADLINE,STREP,HELMHOLTZ ZENTRUM MUENCHEN DEUTSCHES FORSCHUNG...,DE,COMMISSARIAT A L'ENERGIE ATOMIQUE (CEA);UNIVER...,FR;UK;PL;BE;IT;ES;DE,BIO;RAD


In [8]:
dataFP7 = pd.read_csv("EU_raw_data/cordis-fp7projects.csv", sep=";", converters = {'objective': str})
dataFP7.head(2)

Unnamed: 0,rcn,reference,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,110629,611218,ALFRED,,FP7-ICT,ICT-2013.5.1,FP7,ALFRED - Personal Interactive Assistant for In...,2013-10-01,2016-09-30,...,***Personal Interactive Assistant for Independ...,444074100,342357300,FP7-ICT-2013-10,CP,ASCORA GMBH,DE,TALKAMATIC AB;STICHTING NATIONAAL OUDERENFONDS...,SE;NL;DE;ES;FR,INF
1,104117,911409,TIBETMETH,ONG,FP7-PEOPLE,FP7-PEOPLE-2011-IIF,FP7,Microbial Biomarker Records in Tibetan Peats: ...,2015-10-01,2016-09-30,...,It is crucial to understand terrestrial microb...,15000,15000,FP7-PEOPLE-2011-IIF,MC-IIFR,NORTHWEST UNIVERSITY,CN,,,SCI


In [9]:
dataH2020 = pd.read_csv("EU_raw_data/cordis-h2020projects.csv", sep=";", converters = {'objective': str})
dataH2020.head(2)

Unnamed: 0,rcn,reference,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,193982,643052,C-CASCADES,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2014-ETN,H2020,Carbon Cascades from Land to Ocean in the Anth...,2015-01-01,2018-12-31,...,C-CASCADES will produce a new generation of yo...,312573348,312573348,H2020-MSCA-ITN-2014,MSCA-ITN-ETN,UNIVERSITE LIBRE DE BRUXELLES,BE,THE UNIVERSITY OF EXETER;UPPSALA UNIVERSITET;M...,UK;SE;DE;FR;NL,
1,193979,643045,WAKEUPCALL,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2014-EID,H2020,Applied mathematics for risk measures in finan...,2015-01-01,2018-12-31,...,The EID WAKEUPCALL has been set up with the kn...,152261712,152261712,H2020-MSCA-ITN-2014,MSCA-ITN-EID,STICHTING CENTRUM VOOR WISKUNDE EN INFORMATICA,NL,VORTECH BV;ANALISTAS FINANCIEROS INTERNACIONAL...,NL;ES;IT;UK,


As only the FP4 data come in capitalised columns with multiple words, we'll turn them to lowercase and change spaces to underscores. We will also rename some columns, to be in tune with the other framework programs.

In [10]:
dataFP4.columns = map(str.lower, dataFP4.columns)
dataFP4.columns = map(lambda x: x.replace(' ', '_'), dataFP4.columns)
dataFP4.rename(columns = {
        'project_title': 'title',
        'objectives': 'objective',
        'subject': 'subjects' 
    }, inplace=True)
dataFP4.columns

Index([u'rcn', u'title', u'start_date', u'end_date', u'duration', u'status',
       u'contract_number', u'keywords', u'date_of_signature', u'total_cost',
       u'total_funding', u'project_website', u'project_call',
       u'project_acronym', u'general_information', u'achievements',
       u'objective', u'activity_area', u'contract_type', u'subjects',
       u'framework_programme', u'pga', u'coordinator_country',
       u'contractor_country'],
      dtype='object')

Now we'll get a view on the dataframe.

In [11]:
dataFP4['framework_programme'] = 'FP4'
df4 = dataFP4[['rcn', 'title', 'objective', 'subjects', 'framework_programme']]
df4.to_csv("dataset/euFP4.csv", sep = ';', encoding='utf-8')
df4.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,29005,Spot IV-V?g?tation,,Environmental Protection; Forecasting; Meteoro...,FP4
1,30802,Formation and occurrence of nitrous acd in the...,%LTo understand the mechanisms leading to the ...,Environmental Protection; Forecasting; Measure...,FP4
2,31031,Process for Production of Light Olefins by Deh...,,Industrial Manufacture; Materials Technology,FP4
3,30803,High resolution diode laser carbon dioxide env...,%LTo develop a new instrument for measuring at...,Environmental Protection; Measurement Methods;...,FP4
4,31004,Subsurface Radar as a Tool for Non-destructive...,,Industrial Manufacture; Materials Technology; ...,FP4


Same with the FP5 data.

In [12]:
dataFP5.rename(columns = {
        'frameworkProgramme': 'framework_programme'
    }, inplace=True)
dataFP5['framework_programme'] = 'FP5'
df5 = dataFP5[['rcn', 'title', 'objective', 'subjects', 'framework_programme']]
df5.to_csv("dataset/euFP5.csv", sep = ';', encoding='utf-8')
df5.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,64570,Genetic diversity in agriculture: temporal flu...,The overall objective of this project is to de...,ECO;SEA;LIF;ENV;AGR,FP5
1,64192,Sensing and controlling single molecules by no...,This project concerns controlling and sensing ...,BIO;LIF;ENV;MED;WAS;ITT,FP5
2,61977,Transduction mechanisms for non-noxious and no...,,,FP5
3,54932,Portable measurement systems for atmospheric p...,The primary objective of the proposed project ...,SEA;MET;ENV;FOR,FP5
4,56044,Benthic primary production - carbon cycling an...,,,FP5


Then with FP6 data.

In [13]:
dataFP6.rename(columns = {
        'frameworkProgramme': 'framework_programme'
    }, inplace=True)
dataFP6['framework_programme'] = 'FP6'
df6 = dataFP6[['rcn', 'title', 'objective', 'subjects', 'framework_programme']]
df6.to_csv("dataset/euFP6.csv", sep = ';', encoding='utf-8')
df6.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,71920,Amigo Ambient Intelligence for the networked h...,The networked home environment leads to many n...,IPS,FP6
1,85502,Genetic component of the low dose risk of thyr...,Cancer of the non-medullary (follicular epithe...,BIO;RAD,FP6
2,74968,European food information resource network,EuroFIR will form a world-leading collaboratio...,IPS;FOO,FP6
3,74155,Global allergy and asthma european network,Allergic diseases and asthma pose an important...,SEA;LIF;MED;FOO;AGR,FP6
4,74297,Advanced Protection Systems (APROSYS),The IP on Advanced Protective Systems (APROSYS...,,FP6


And with FP7.

In [14]:
dataFP7.rename(columns = {
        'frameworkProgramme': 'framework_programme'
    }, inplace=True)
dataFP7['framework_programme'] = 'FP7'
df7 = dataFP7[['rcn', 'title', 'objective', 'subjects', 'framework_programme']]
df7.to_csv("dataset/euFP7.csv", sep = ';', encoding='utf-8')
df7.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,110629,ALFRED - Personal Interactive Assistant for In...,***Personal Interactive Assistant for Independ...,INF,FP7
1,104117,Microbial Biomarker Records in Tibetan Peats: ...,It is crucial to understand terrestrial microb...,SCI,FP7
2,188177,Post-glacial recolonisation and Holocene anthr...,"At the end of last glaciation, ca. 15 000 cal....",,FP7
3,188066,Molecular Mechanisms Employed by the Newly Ass...,Posttranscriptional gene regulation is an esse...,,FP7
4,187919,Identifying the targets and mechanism of actio...,The Ubiquitin (UB) and SUMO modification pathw...,,FP7


And finally with H2020.

In [15]:
dataH2020.rename(columns = {
        'frameworkProgramme': 'framework_programme'
    }, inplace=True)
dataH2020['framework_programme'] = 'H2020'
df20 = dataH2020[['rcn', 'title', 'objective', 'subjects', 'framework_programme']]
df20.to_csv("dataset/euH2020.csv", sep = ';', encoding='utf-8')
df20.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,193982,Carbon Cascades from Land to Ocean in the Anth...,C-CASCADES will produce a new generation of yo...,,H2020
1,193979,Applied mathematics for risk measures in finan...,The EID WAKEUPCALL has been set up with the kn...,,H2020
2,193971,BigStorage: Storage-based Convergence between ...,'The consortium of this European Training Netw...,,H2020
3,193970,Perception and Action in Complex Environments,The PACE research and training programme sits ...,,H2020
4,193969,Industrial optimal design using adjoint CFD,Adjoint-based methods have become the most int...,,H2020


# Getting all Objectives

We will get all the objectives from all the projects of all the calls and place them in a single column.

In [14]:
all_objectives = pd.concat([ df['objective'] for df in [df4, df5, df6, df7, df20] ])
all_objectives.shape

(76522,)

At this poin, `all_objectives` contains data that cannot be tokenized propertly, starting with `%L`. We will turn everything to lowercase, remove `%l`, and remove all punctutation while we are at it.

In [15]:
import string

RE_PUNCTUATION = '|'.join([re.escape(x) for x in string.punctuation])

all_objectives = all_objectives.str.lower().str.replace('%l', '').str.replace(RE_PUNCTUATION, ' ')
all_objectives.head(10)

0                                                     
1    to understand the mechanisms leading to the fo...
2                                                     
3    to develop a new instrument for measuring atmo...
4                                                     
5    to determine by measurements and analysis the ...
6    degree aims at investigating the modifications...
7                                                     
8    the problem of the agreement between analytica...
9                                                     
Name: objective, dtype: object

Note that gensim assumes that documents are lists of tokens. Therefore we will create a new Series object containing the list of tokens in `all_objectives` rows.

In [16]:
all_objectives_split = all_objectives.str.split()
all_objectives_split.head(10)

0                                                   []
1    [to, understand, the, mechanisms, leading, to,...
2                                                   []
3    [to, develop, a, new, instrument, for, measuri...
4                                                   []
5    [to, determine, by, measurements, and, analysi...
6    [degree, aims, at, investigating, the, modific...
7                                                   []
8    [the, problem, of, the, agreement, between, an...
9                                                   []
Name: objective, dtype: object

In [17]:
additional_stopwords = set(['computer', 'will', 'develop', 'development',
                            'project', 'research', 'new', 'use', 
                            'europe', 'european'])
stopwords = set(STOPWORDS) | additional_stopwords
all_objectives_split = all_objectives_split.apply(lambda tokens: [token for token in tokens if token not in stopwords])

# Construct a Dictionary and a Corpus

The next step is to construct a dictionary and a corpus from `all_objectives` using gensim.

In [18]:
objectives_dictionary = Dictionary(all_objectives_split)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : adding document #10000 to Dictionary(18305 unique tokens: [u'xylem', u'catechols', u'countriesvi', u'institutionalised', u'testbeds']...)
INFO : adding document #20000 to Dictionary(38945 unique tokens: [u'catechols', u'reelay', u'nualart', u'circuitry', u'hanging']...)
INFO : adding document #30000 to Dictionary(57365 unique tokens: [u'nmss', u'catechols', u'solsic', u'cyclophophamide', u'reelay']...)
INFO : adding document #40000 to Dictionary(83951 unique tokens: [u'nmss', u'catechols', u'solsic', u'cyclophophamide', u'reelay']...)
INFO : adding document #50000 to Dictionary(109527 unique tokens: [u'catechols', u'cyclophophamide', u'spidera', u'nualart', u'prohitech']...)
INFO : adding document #60000 to Dictionary(131073 unique tokens: [u'catechols', u'cyclophophamide', u'spidera', u'nualart', u'prohitech']...)
INFO : adding document #70000 to Dictionary(149308 unique tokens: [u'catechols', u'cyclophophamide', u'sp

To create the corpus, we will use a helper class, that will, per gensim documentation, provide the bag-of-words represantation of each document on demand. In particular, it will yield the bag-of-words representation of each successive row of the `all_objectives_split` Series.

In [19]:
class ObjectivesCorpus(object):
    def __init__(self, documents, dictionary):
        self.documents = documents
        self.dictionary = dictionary
    def __iter__(self):
        for document in self.documents:
            yield self.dictionary.doc2bow(document)

So now we can generate our corpus.

In [75]:
objectives_corpus = ObjectivesCorpus(all_objectives_split, objectives_dictionary)

# Run LDA

We are in a position to run LDA on the corpus we created. We will use the multi-core implementation, to reap any possible benefits from parallelisation.

In [76]:
t0 = time()
lda = gensim.models.ldamulticore.LdaMulticore(corpus=objectives_corpus, 
                                              id2word=objectives_dictionary, 
                                              num_topics=10, 
                                              iterations=50,
                                              passes=5)
print("done in %0.3fs." % (time() - t0))

INFO : using symmetric alpha at 0.1
INFO : using symmetric eta at 0.1
INFO : using serial LDA version on this node
INFO : running online LDA training, 10 topics, 5 passes over the supplied corpus of 76522 documents, updating every 14000 documents, evaluating every ~76522 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 7 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #2000/76522, outstanding queue size 1
INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #4000/76522, outstanding queue size 2
INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #6000/76522, outstanding queue size 3
INFO : PROGRESS: pass 0, dispatched chunk #3 = documents up to #8000/76522, outstanding queue size 4
INFO : PROGRESS: pass 0, dispatched chunk #4 = documents up to #10000/76522, outstanding queue size 5
INFO : PROGRESS: pass 0, dispatched chunk #5 = documents up to #12000/76522, outstanding queue size 6
INFO : PRO

done in 290.146s.


In [77]:
lda.print_topics(10)

INFO : topic #0 (0.100): 0.010*"market" + 0.010*"data" + 0.006*"based" + 0.006*"technology" + 0.005*"software" + 0.005*"time" + 0.005*"systems" + 0.005*"design" + 0.005*"business" + 0.004*"cost"
INFO : topic #1 (0.100): 0.016*"energy" + 0.005*"high" + 0.005*"power" + 0.005*"fuel" + 0.005*"climate" + 0.005*"water" + 0.004*"cost" + 0.004*"technology" + 0.004*"solar" + 0.004*"heat"
INFO : topic #2 (0.100): 0.006*"high" + 0.006*"quantum" + 0.005*"materials" + 0.005*"properties" + 0.005*"systems" + 0.005*"based" + 0.004*"optical" + 0.004*"applications" + 0.003*"novel" + 0.003*"field"
INFO : topic #3 (0.100): 0.011*"process" + 0.008*"production" + 0.008*"materials" + 0.007*"water" + 0.007*"waste" + 0.007*"industrial" + 0.007*"products" + 0.006*"technology" + 0.005*"industry" + 0.005*"based"
INFO : topic #4 (0.100): 0.015*"innovation" + 0.010*"smes" + 0.009*"technology" + 0.006*"high" + 0.005*"management" + 0.005*"industry" + 0.005*"sme" + 0.005*"partners" + 0.004*"support" + 0.004*"training"

[(0,
  '0.010*"market" + 0.010*"data" + 0.006*"based" + 0.006*"technology" + 0.005*"software" + 0.005*"time" + 0.005*"systems" + 0.005*"design" + 0.005*"business" + 0.004*"cost"'),
 (1,
  '0.016*"energy" + 0.005*"high" + 0.005*"power" + 0.005*"fuel" + 0.005*"climate" + 0.005*"water" + 0.004*"cost" + 0.004*"technology" + 0.004*"solar" + 0.004*"heat"'),
 (2,
  '0.006*"high" + 0.006*"quantum" + 0.005*"materials" + 0.005*"properties" + 0.005*"systems" + 0.005*"based" + 0.004*"optical" + 0.004*"applications" + 0.003*"novel" + 0.003*"field"'),
 (3,
  '0.011*"process" + 0.008*"production" + 0.008*"materials" + 0.007*"water" + 0.007*"waste" + 0.007*"industrial" + 0.007*"products" + 0.006*"technology" + 0.005*"industry" + 0.005*"based"'),
 (4,
  '0.015*"innovation" + 0.010*"smes" + 0.009*"technology" + 0.006*"high" + 0.005*"management" + 0.005*"industry" + 0.005*"sme" + 0.005*"partners" + 0.004*"support" + 0.004*"training"'),
 (5,
  '0.011*"cell" + 0.009*"cells" + 0.005*"molecular" + 0.005*"dis

# Run LDA with tf-idf

After our initial set of results, we can check whether the td-idf transformation changes the picture.

In [78]:
tf_idf = models.TfidfModel(objectives_corpus)
tf_idf_corpus = tf_idf[objectives_corpus]

INFO : collecting document frequencies
INFO : PROGRESS: processing document #0
INFO : PROGRESS: processing document #10000
INFO : PROGRESS: processing document #20000
INFO : PROGRESS: processing document #30000
INFO : PROGRESS: processing document #40000
INFO : PROGRESS: processing document #50000
INFO : PROGRESS: processing document #60000
INFO : PROGRESS: processing document #70000
INFO : calculating IDF weights for 76522 documents and 162359 features (6457551 matrix non-zeros)


In [80]:
t0 = time()
lda_tf_idf = gensim.models.ldamulticore.LdaMulticore(corpus=objectives_corpus, 
                                                     id2word=objectives_dictionary, 
                                                     num_topics=10, 
                                                     iterations=50,
                                                     passes=5)
print("done in %0.3fs." % (time() - t0))


INFO : using symmetric alpha at 0.1
INFO : using symmetric eta at 0.1
INFO : using serial LDA version on this node
INFO : running online LDA training, 10 topics, 5 passes over the supplied corpus of 76522 documents, updating every 14000 documents, evaluating every ~76522 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 7 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #2000/76522, outstanding queue size 1
INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #4000/76522, outstanding queue size 2
INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #6000/76522, outstanding queue size 3
INFO : PROGRESS: pass 0, dispatched chunk #3 = documents up to #8000/76522, outstanding queue size 4
INFO : PROGRESS: pass 0, dispatched chunk #4 = documents up to #10000/76522, outstanding queue size 5
INFO : PROGRESS: pass 0, dispatched chunk #5 = documents up to #12000/76522, outstanding queue size 6
INFO : PRO

done in 313.145s.


In [81]:
lda_tf_idf.print_topics(10)

INFO : topic #0 (0.100): 0.009*"quantum" + 0.008*"high" + 0.006*"optical" + 0.006*"systems" + 0.005*"based" + 0.005*"devices" + 0.004*"properties" + 0.004*"materials" + 0.004*"applications" + 0.004*"energy"
INFO : topic #1 (0.100): 0.005*"understanding" + 0.005*"cell" + 0.005*"cells" + 0.005*"climate" + 0.004*"study" + 0.004*"role" + 0.003*"processes" + 0.003*"molecular" + 0.003*"formation" + 0.003*"interactions"
INFO : topic #2 (0.100): 0.008*"training" + 0.007*"researchers" + 0.006*"eu" + 0.006*"activities" + 0.005*"innovation" + 0.005*"scientific" + 0.005*"knowledge" + 0.005*"science" + 0.005*"network" + 0.005*"international"
INFO : topic #3 (0.100): 0.007*"cell" + 0.006*"cells" + 0.006*"disease" + 0.005*"cancer" + 0.005*"clinical" + 0.004*"molecular" + 0.004*"novel" + 0.004*"human" + 0.004*"patients" + 0.004*"genetic"
INFO : topic #4 (0.100): 0.007*"social" + 0.007*"policy" + 0.007*"data" + 0.005*"analysis" + 0.005*"public" + 0.004*"information" + 0.004*"political" + 0.004*"economi

[(0,
  '0.009*"quantum" + 0.008*"high" + 0.006*"optical" + 0.006*"systems" + 0.005*"based" + 0.005*"devices" + 0.004*"properties" + 0.004*"materials" + 0.004*"applications" + 0.004*"energy"'),
 (1,
  '0.005*"understanding" + 0.005*"cell" + 0.005*"cells" + 0.005*"climate" + 0.004*"study" + 0.004*"role" + 0.003*"processes" + 0.003*"molecular" + 0.003*"formation" + 0.003*"interactions"'),
 (2,
  '0.008*"training" + 0.007*"researchers" + 0.006*"eu" + 0.006*"activities" + 0.005*"innovation" + 0.005*"scientific" + 0.005*"knowledge" + 0.005*"science" + 0.005*"network" + 0.005*"international"'),
 (3,
  '0.007*"cell" + 0.006*"cells" + 0.006*"disease" + 0.005*"cancer" + 0.005*"clinical" + 0.004*"molecular" + 0.004*"novel" + 0.004*"human" + 0.004*"patients" + 0.004*"genetic"'),
 (4,
  '0.007*"social" + 0.007*"policy" + 0.007*"data" + 0.005*"analysis" + 0.005*"public" + 0.004*"information" + 0.004*"political" + 0.004*"economic" + 0.004*"study" + 0.003*"different"'),
 (5,
  '0.010*"energy" + 0.010*

# LDA with scikit-learn

By way of comparison, we will do an LDA analysis using scikit-learn.

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

We'll ask for 10 topics again.

In [58]:
n_topics = 10

We'll create a simple `CountVectorizer`.

In [63]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(all_objectives)
print("done in %0.3fs." % (time() - t0))

done in 15.602s.


Let us see the size of the bag-of-words representation:

In [66]:
tf.shape

(76522, 65006)

So, we have a vocabulary of 65006 words. We'll fit our bag-of-words representation into an `LatentDirichletAllocation` object. 

In [67]:
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                random_state=0,
                                n_jobs=-1)

t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

done in 1124.946s.


The `components_` attribute of `lda` contains the topics. Each topic contains the weights of each of the words, indexed as in the `tf` object that was passed to to `lda`. We want to display the top `n_top_words` words. To do that, we get an array with the indices that would sort `components_` in ascending order, so that the most importart words are the last ones. Then we use these indices in the feature names of `tf_vectorizer` to get the corresponding words (see the original example in the [scikit-learn documentation](http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-topics-extraction-with-nmf-lda-py).

Note that the `components_` are not normalized. They are the $lambda$ used in online LDA as described in:
  * [Online Learning for Latent Dirichlet Allocation](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf), M. Hoffman, D. Blei, F. Bach, 2010.
  * [Stochastic Variational Inference](http://www.columbia.edu/~jwp2128/Papers/HoffmanBleiWangPaisley2013.pdf), M. Hoffman, D. Blei, C. Wang, J. Paisley, 2013.

We therefore normalise `components_` to make the results more comparable to those provided by gensim. The process is as in https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ldamodel.py#L811-L813.

In [112]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        topic_sum = topic.sum()
        print("Topic #%d:" % topic_idx)
        print(" + ".join(["%0.3f*'%s'" % (topic[i] / topic_sum, feature_names[i])
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()


And so here are the results:

In [115]:
print("Topics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words=10)

Topics in LDA model:
Topic #0:
0.047*'research' + 0.023*'european' + 0.012*'eu' + 0.011*'project' + 0.009*'scientific' + 0.008*'policy' + 0.008*'social' + 0.008*'international' + 0.008*'science' + 0.007*'knowledge'
Topic #1:
0.036*'properties' + 0.035*'innovation' + 0.014*'surface' + 0.012*'micro' + 0.011*'potential' + 0.009*'growth' + 0.008*'spin' + 0.008*'matter' + 0.008*'recently' + 0.007*'low'
Topic #2:
0.011*'study' + 0.011*'processes' + 0.011*'models' + 0.010*'studies' + 0.010*'understanding' + 0.009*'model' + 0.009*'data' + 0.008*'project' + 0.008*'methods' + 0.007*'different'
Topic #3:
0.016*'quantum' + 0.015*'new' + 0.011*'applications' + 0.011*'optical' + 0.010*'physics' + 0.009*'devices' + 0.009*'field' + 0.008*'single' + 0.008*'light' + 0.008*'systems'
Topic #4:
0.034*'energy' + 0.022*'materials' + 0.017*'production' + 0.016*'high' + 0.013*'technology' + 0.010*'power' + 0.010*'project' + 0.010*'plant' + 0.009*'process' + 0.008*'material'
Topic #5:
0.014*'cost' + 0.013*'desi

# LDA with Stemming

This is an adaptation from Jordan's Barber [Latent Dirichlet Allocation (LDA) with Python](https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html).

In [116]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

In [118]:
# Remove common words
en_stop.extend(['will', 'develop', 'understand', 'model', 
                'project', 'research', 'new', 'use', 
                'europe', 'european'])

texts = []

for text in all_objectives:
    
    # tokenize document string
    tokens = tokenizer.tokenize(text)
        
    # remove stop words from tokens
    filtered_tokens = [i for i in tokens if not i in en_stop]
                
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in filtered_tokens]
        
    # add tokens to list
    texts.append(stemmed_tokens)

In [119]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : adding document #10000 to Dictionary(12619 unique tokens: ['detoxif', 'clade', 'summertim', 'wavelength', 'sagnac']...)
INFO : adding document #20000 to Dictionary(27593 unique tokens: ['fell', 'clade', 'summertim', 'wavelength', 'polyethylen']...)
INFO : adding document #30000 to Dictionary(41581 unique tokens: ['fell', 'clade', 'summertim', 'wavelength', 'polyethylen']...)
INFO : adding document #40000 to Dictionary(61877 unique tokens: ['porosifi', 'clade', 'foroutsourc', 'ogen', '20188']...)
INFO : adding document #50000 to Dictionary(78615 unique tokens: ['porosifi', 'clade', 'foroutsourc', 'symphoni', 'ogen']...)
INFO : adding document #60000 to Dictionary(92600 unique tokens: ['clade', 'assmbl', 'nectarin', 'stratgic', 'eriv']...)
INFO : adding document #70000 to Dictionary(103928 unique tokens: ['clade', 'assmbl', 'nectarin', 'stratgic', 'eriv']...)
INFO : built Dictionary(112295 unique tokens: ['clade', 'assmb

In [120]:
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [121]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word = dictionary)

INFO : using symmetric alpha at 0.2
INFO : using symmetric eta at 0.2
INFO : using serial LDA version on this node
INFO : running online LDA training, 5 topics, 1 passes over the supplied corpus of 76522 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
INFO : PROGRESS: pass 0, at document #2000/76522
INFO : merging changes from 2000 documents into a model of 76522 documents
INFO : topic #0 (0.200): 0.010*"servic" + 0.010*"develop" + 0.008*"improv" + 0.007*"object" + 0.007*"inform" + 0.007*"process" + 0.007*"applic" + 0.007*"multimedia" + 0.007*"user" + 0.006*"base"
INFO : topic #1 (0.200): 0.013*"system" + 0.008*"user" + 0.007*"develop" + 0.006*"servic" + 0.006*"object" + 0.006*"manag" + 0.006*"provid" + 0.005*"data" + 0.005*"transport" + 0.005*"network"
INFO : topic #2 (0.200): 0.016*"system" + 0.014*"develop" + 0.009*"softwar" + 0.008*"applic" + 0.006*"process" + 0.006*"inform" + 0

[(0, '0.012*"system" + 0.011*"technolog" + 0.008*"develop" + 0.008*"data" + 0.007*"servic"'), (1, '0.009*"innov" + 0.008*"eu" + 0.007*"develop" + 0.006*"polici" + 0.005*"social"'), (2, '0.009*"research" + 0.008*"train" + 0.007*"scienc" + 0.007*"theori" + 0.007*"studi"'), (3, '0.014*"cell" + 0.007*"studi" + 0.007*"function" + 0.007*"diseas" + 0.006*"protein"'), (4, '0.009*"energi" + 0.008*"materi" + 0.008*"system" + 0.007*"process" + 0.007*"product"')]


In [122]:
ldamodel.print_topics(num_topics=5, num_words=5)

INFO : topic #0 (0.200): 0.012*"system" + 0.011*"technolog" + 0.008*"develop" + 0.008*"data" + 0.007*"servic"
INFO : topic #1 (0.200): 0.009*"innov" + 0.008*"eu" + 0.007*"develop" + 0.006*"polici" + 0.005*"social"
INFO : topic #2 (0.200): 0.009*"research" + 0.008*"train" + 0.007*"scienc" + 0.007*"theori" + 0.007*"studi"
INFO : topic #3 (0.200): 0.014*"cell" + 0.007*"studi" + 0.007*"function" + 0.007*"diseas" + 0.006*"protein"
INFO : topic #4 (0.200): 0.009*"energi" + 0.008*"materi" + 0.008*"system" + 0.007*"process" + 0.007*"product"


[(0,
  '0.012*"system" + 0.011*"technolog" + 0.008*"develop" + 0.008*"data" + 0.007*"servic"'),
 (1,
  '0.009*"innov" + 0.008*"eu" + 0.007*"develop" + 0.006*"polici" + 0.005*"social"'),
 (2,
  '0.009*"research" + 0.008*"train" + 0.007*"scienc" + 0.007*"theori" + 0.007*"studi"'),
 (3,
  '0.014*"cell" + 0.007*"studi" + 0.007*"function" + 0.007*"diseas" + 0.006*"protein"'),
 (4,
  '0.009*"energi" + 0.008*"materi" + 0.008*"system" + 0.007*"process" + 0.007*"product"')]