In [4]:
import pandas as pd
import numpy as np
import re
import gensim
import stop_words

# Reading the data

The data is stored in the following files, one per each framework programme (the links point to the original sources in the [European Union Open Data Portal](https://data.europa.eu/euodp/en/data/):
  * [`dataset/cordisfp4complete.csv`](http://cordis.europa.eu/data/cordisfp4complete.csv)
  * [`dataset/cordis-fp5projects.csv`](http://cordis.europa.eu/data/cordis-fp5projects.csv)
  * [`dataset/cordis-fp6projects.csv`](http://cordis.europa.eu/data/cordis-fp6projects.csv)
  * [`dataset/cordis-fp7projects.csv`](http://cordis.europa.eu/data/cordis-fp7projects.csv)
  * [`dataset/cordis-h2020projects.csv`](http://cordis.europa.eu/data/cordis-h2020projects.csv)
  
We read each one of them in turn. Note that we will be using the objectives column, which has several empty values, so we have to ensure that it is treated as string.

In [5]:
dataFP4 = pd.read_csv("dataset/cordisfp4complete.csv", sep=";", converters = {'Objectives': str})
dataFP4.head(5)

Unnamed: 0,RCN,Project Title,Start Date,End Date,Duration,Status,Contract Number,Keywords,Date of Signature,Total Cost,...,General Information,Achievements,Objectives,Activity Area,Contract Type,Subject,Framework Programme,PGA,Coordinator Country,Contractor Country
0,29005,Spot IV-V?g?tation,01/04/1995,31/07/1997,,Completed,ENV4950001,,,,...,,,,Research and development work for potential fu...,CSC - Cost-sharing contracts,Environmental Protection; Forecasting; Meteoro...,Fourth Framework Programme,FP4-ENV 2C,FR,
1,30802,Formation and occurrence of nitrous acd in the...,01/02/1996,31/07/1998,,Completed,ENV4950055,,,,...,%L Nitrous acid is of particular importance in...,,%LTo understand the mechanisms leading to the ...,Tropospheric physics and chemistry,CSC - Cost-sharing contracts,Environmental Protection; Forecasting; Measure...,Fourth Framework Programme,FP4-ENV 2C,IT,CH; DE; DE; DE; DE; DK; FR; GB; GR
2,31031,Process for Production of Light Olefins by Deh...,01/01/1996,31/12/1998,,Completed,BRPR950116,,,,...,Objectives and content %L%LThe oil refining in...,,,Materials engineering,CSC - Cost-sharing contracts,Industrial Manufacture; Materials Technology,Fourth Framework Programme,FP4-BRITE/EURAM 3,FI,DE; FI; FR; FR
3,30803,High resolution diode laser carbon dioxide env...,01/04/1996,31/03/1999,,Completed,ENV4950033,,,,...,"%L Carbon dioxide is a very well known gas, an...",,%LTo develop a new instrument for measuring at...,Troposphere,CSC - Cost-sharing contracts,Environmental Protection; Measurement Methods;...,Fourth Framework Programme,FP4-ENV 2C,IT,DE; IT
4,31004,Subsurface Radar as a Tool for Non-destructive...,01/01/1996,31/12/1998,,Completed,BRPR950127,,,,...,Objectives and content %L%LThere has been a tr...,,,Reliability and quality of materials and products,CSC - Cost-sharing contracts,Industrial Manufacture; Materials Technology; ...,Fourth Framework Programme,FP4-BRITE/EURAM 3,DE,DE; GB; GB; GB; IT; IT; NO


In [6]:
dataFP5 = pd.read_csv("dataset/cordis-fp5projects.csv", sep=";", converters = {'objective': str})
dataFP5.head(5)

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,64570,QLK5-CT-2001-00934,GEDIFLUX,,FP5-LIFE QUALITY,1.1.1.-5.,FP5-LIFE QUALITY,Genetic diversity in agriculture: temporal flu...,2001-10-01,2004-09-30,...,The overall objective of this project is to de...,3039812.0,1846635.0,,CSC,NIAB,UK,JOHN INNES CENTRE;INSTITUTE OF PLANT GENETICS ...,UK;DE;FR;NL,ECO;SEA;LIF;ENV;AGR
1,64192,QLK3-CT-2001-00278,NANOCELL,,FP5-LIFE QUALITY,1.1.1.-3.,FP5-LIFE QUALITY,Sensing and controlling single molecules by no...,2002-01-01,2004-12-31,...,This project concerns controlling and sensing ...,2633658.0,1853271.0,,CSC,GOETEBORG UNIVERSITY,SE,UNIVERSITY OF GLASGOW;FRAUNHOFER IAF;LGC LIMIT...,UK;DE;SE;CH,BIO;LIF;ENV;MED;WAS;ITT
2,61977,HPMD-CT-2000-00026,,,FP5-HUMAN POTENTIAL,,FP5-HUMAN POTENTIAL,Transduction mechanisms for non-noxious and no...,2000-09-01,2004-08-31,...,,119808.0,119808.0,,BUR,UNIVERSIDAD MIGUEL HERNANDEZ DE ELCHE,ES,,,
3,54932,EVK2-CT-2000-35003,,,FP5-EESD,1.1.4.-2.,FP5-EESD,Portable measurement systems for atmospheric p...,2000-07-03,2001-07-02,...,The primary objective of the proposed project ...,30000.0,22500.0,,EAW,INNOVATIVE SYSTEM -UND INFORMATIONSTECHNOLOGIE,DE,OPTOTEK D.O.O.,SI,SEA;MET;ENV;FOR
4,56044,HPMF-CT-2000-00569,,,FP5-HUMAN POTENTIAL,,FP5-HUMAN POTENTIAL,Benthic primary production - carbon cycling an...,2000-10-01,2002-09-30,...,,144002.0,144002.0,,RGI,UNIVERSITY OF COPENHAGEN,DK,,,


In [7]:
dataFP6 = pd.read_csv("dataset/cordis-fp6projects.csv", sep=";", converters = {'objective': str})
dataFP6.head(5)

Unnamed: 0,rcn,reference,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,71920,4182,AMIGO,,FP6-IST,IST-2002-2.3.2.3,,Amigo Ambient Intelligence for the networked h...,2004-09-01,2008-02-29,...,The networked home environment leads to many n...,2403487100,1296000000,FP6-2003-IST-2,IP,PHILIPS ELECTRONICS NEDERLAND B.V.,NL,SINGKIOULAR LOTZIK ANONYMI ETAIRIA PLIROFORIAK...,EL;NL;FR;IT;ES;DE;FI,IPS
1,85502,36495,GENRISK-T,,FP6-EURATOM-RADPROT,RAD PROT-2005/6-3.3.1.1-2,,Genetic component of the low dose risk of thyr...,2006-12-01,2010-09-30,...,Cancer of the non-medullary (follicular epithe...,4168377,2765453,EURATOM-2005-6-FIXEDDEADLINE,STREP,HELMHOLTZ ZENTRUM MUENCHEN DEUTSCHES FORSCHUNG...,DE,COMMISSARIAT A L'ENERGIE ATOMIQUE (CEA);UNIVER...,FR;UK;PL;BE;IT;ES;DE,BIO;RAD
2,74968,513944,EUROFIR,,FP6-FOOD,FOOD-2003-T2.1,,European food information resource network,2005-01-01,2010-06-30,...,EuroFIR will form a world-leading collaboratio...,13628583,12000000,FP6-2003-FOOD-2-A,NoE,INSTITUTE OF FOOD RESEARCH,UK,"UNIVERSITY OF LEEDS;UNIVERSITY COLLEGE CORK, N...",UK;IE;AT;DE;EL;BE;BG;ES;FI;FR;NO;SE;IL;IT;NL;P...,IPS;FOO
3,74155,506378,GA²LEN,,FP6-FOOD,FOOD-2002-T42,,Global allergy and asthma european network,2004-02-01,2010-01-31,...,Allergic diseases and asthma pose an important...,14400000,14,FP6-2002-FOOD-1,NoE,UNIVERSITEIT GENT,BE,LUDWIG-MAXIMILIANS-UNIVERSITAET MUENCHEN;INSTI...,DE;FR;ES;SE;NL;PL;CH;BE;NO;FI;UK;EL;PT;IT;AT;DK,SEA;LIF;MED;FOO;AGR
4,74297,506503,APROSYS,,FP6-SUSTDEV,,,Advanced Protection Systems (APROSYS),2004-04-01,2009-03-31,...,The IP on Advanced Protective Systems (APROSYS...,29962960,18000000,FP6-2002-TRANSPORT-1,IP,NEDERLANDSE ORGANISATIE VOOR TOEGEPAST NATUURW...,NL,"SIEMENS AG, SIEMENS VDO AUTOMATIVE GROUP;POLIT...",DE;IT;NL;ES;CZ;UK;PT;BE;FR;PL;SE;AT,


In [8]:
dataFP6 = pd.read_csv("dataset/cordis-fp6projects.csv", sep=";", converters = {'objective': str})
dataFP6.head(5)

Unnamed: 0,rcn,reference,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,71920,4182,AMIGO,,FP6-IST,IST-2002-2.3.2.3,,Amigo Ambient Intelligence for the networked h...,2004-09-01,2008-02-29,...,The networked home environment leads to many n...,2403487100,1296000000,FP6-2003-IST-2,IP,PHILIPS ELECTRONICS NEDERLAND B.V.,NL,SINGKIOULAR LOTZIK ANONYMI ETAIRIA PLIROFORIAK...,EL;NL;FR;IT;ES;DE;FI,IPS
1,85502,36495,GENRISK-T,,FP6-EURATOM-RADPROT,RAD PROT-2005/6-3.3.1.1-2,,Genetic component of the low dose risk of thyr...,2006-12-01,2010-09-30,...,Cancer of the non-medullary (follicular epithe...,4168377,2765453,EURATOM-2005-6-FIXEDDEADLINE,STREP,HELMHOLTZ ZENTRUM MUENCHEN DEUTSCHES FORSCHUNG...,DE,COMMISSARIAT A L'ENERGIE ATOMIQUE (CEA);UNIVER...,FR;UK;PL;BE;IT;ES;DE,BIO;RAD
2,74968,513944,EUROFIR,,FP6-FOOD,FOOD-2003-T2.1,,European food information resource network,2005-01-01,2010-06-30,...,EuroFIR will form a world-leading collaboratio...,13628583,12000000,FP6-2003-FOOD-2-A,NoE,INSTITUTE OF FOOD RESEARCH,UK,"UNIVERSITY OF LEEDS;UNIVERSITY COLLEGE CORK, N...",UK;IE;AT;DE;EL;BE;BG;ES;FI;FR;NO;SE;IL;IT;NL;P...,IPS;FOO
3,74155,506378,GA²LEN,,FP6-FOOD,FOOD-2002-T42,,Global allergy and asthma european network,2004-02-01,2010-01-31,...,Allergic diseases and asthma pose an important...,14400000,14,FP6-2002-FOOD-1,NoE,UNIVERSITEIT GENT,BE,LUDWIG-MAXIMILIANS-UNIVERSITAET MUENCHEN;INSTI...,DE;FR;ES;SE;NL;PL;CH;BE;NO;FI;UK;EL;PT;IT;AT;DK,SEA;LIF;MED;FOO;AGR
4,74297,506503,APROSYS,,FP6-SUSTDEV,,,Advanced Protection Systems (APROSYS),2004-04-01,2009-03-31,...,The IP on Advanced Protective Systems (APROSYS...,29962960,18000000,FP6-2002-TRANSPORT-1,IP,NEDERLANDSE ORGANISATIE VOOR TOEGEPAST NATUURW...,NL,"SIEMENS AG, SIEMENS VDO AUTOMATIVE GROUP;POLIT...",DE;IT;NL;ES;CZ;UK;PT;BE;FR;PL;SE;AT,


In [9]:
dataFP7 = pd.read_csv("dataset/cordis-fp7projects.csv", sep=";", converters = {'objective': str})
dataFP7.head(5)

Unnamed: 0,rcn,reference,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,110629,611218,ALFRED,,FP7-ICT,ICT-2013.5.1,FP7,ALFRED - Personal Interactive Assistant for In...,2013-10-01,2016-09-30,...,***Personal Interactive Assistant for Independ...,444074100,342357300,FP7-ICT-2013-10,CP,ASCORA GMBH,DE,TALKAMATIC AB;STICHTING NATIONAAL OUDERENFONDS...,SE;NL;DE;ES;FR,INF
1,104117,911409,TIBETMETH,ONG,FP7-PEOPLE,FP7-PEOPLE-2011-IIF,FP7,Microbial Biomarker Records in Tibetan Peats: ...,2015-10-01,2016-09-30,...,It is crucial to understand terrestrial microb...,15000,15000,FP7-PEOPLE-2011-IIF,MC-IIFR,NORTHWEST UNIVERSITY,CN,,,SCI
2,188177,629604,SMALL_MAM_RECOL,ONG,FP7-PEOPLE,FP7-PEOPLE-2013-IEF,FP7,Post-glacial recolonisation and Holocene anthr...,2014-05-05,2016-05-04,...,"At the end of last glaciation, ca. 15 000 cal....",1940466,1940466,FP7-PEOPLE-2013-IEF,MC-IEF,CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE,FR,,,
3,188066,626947,MOMEFAST,CAN,FP7-PEOPLE,FP7-PEOPLE-2013-IEF,FP7,Molecular Mechanisms Employed by the Newly Ass...,2015-01-01,2016-12-31,...,Posttranscriptional gene regulation is an esse...,1619688,1619688,FP7-PEOPLE-2013-IEF,MC-IEF,EUROPEAN MOLECULAR BIOLOGY LABORATORY,DE,,,
4,187919,625253,RNF4 IN THE DDR,ONG,FP7-PEOPLE,FP7-PEOPLE-2013-IEF,FP7,Identifying the targets and mechanism of actio...,2014-07-01,2017-08-19,...,The Ubiquitin (UB) and SUMO modification pathw...,3092352,3092352,FP7-PEOPLE-2013-IEF,MC-IEF,UNIVERSITY OF DUNDEE,UK,,,


In [10]:
dataH2020 = pd.read_csv("dataset/cordis-h2020projects.csv", sep=";", converters = {'objective': str})
dataH2020.head(5)

Unnamed: 0,rcn,reference,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,193982,643052,C-CASCADES,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2014-ETN,H2020,Carbon Cascades from Land to Ocean in the Anth...,2015-01-01,2018-12-31,...,C-CASCADES will produce a new generation of yo...,312573348,312573348,H2020-MSCA-ITN-2014,MSCA-ITN-ETN,UNIVERSITE LIBRE DE BRUXELLES,BE,THE UNIVERSITY OF EXETER;UPPSALA UNIVERSITET;M...,UK;SE;DE;FR;NL,
1,193979,643045,WAKEUPCALL,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2014-EID,H2020,Applied mathematics for risk measures in finan...,2015-01-01,2018-12-31,...,The EID WAKEUPCALL has been set up with the kn...,152261712,152261712,H2020-MSCA-ITN-2014,MSCA-ITN-EID,STICHTING CENTRUM VOOR WISKUNDE EN INFORMATICA,NL,VORTECH BV;ANALISTAS FINANCIEROS INTERNACIONAL...,NL;ES;IT;UK,
2,193971,642963,BigStorage,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2014-ETN,H2020,BigStorage: Storage-based Convergence between ...,2015-01-01,2018-12-31,...,'The consortium of this European Training Netw...,380340792,380340792,H2020-MSCA-ITN-2014,MSCA-ITN-ETN,UNIVERSIDAD POLITECNICA DE MADRID,ES,JOHANNES GUTENBERG-UNIVERSITAT MAINZ;CA TECHNO...,DE;ES;UK;FR;EL,
3,193970,642961,PACE,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2014-ETN,H2020,Perception and Action in Complex Environments,2015-04-01,2019-03-31,...,The PACE research and training programme sits ...,391167324,391167324,H2020-MSCA-ITN-2014,MSCA-ITN-ETN,CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE,FR,MOTEK MEDICAL B.V.;STICHTING VU-VUMC;NEDERLAND...,NL;IL;UK;IT;ES,
4,193969,642959,IODA,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2014-ETN,H2020,Industrial optimal design using adjoint CFD,2015-01-01,2018-12-31,...,Adjoint-based methods have become the most int...,385490952,385490952,H2020-MSCA-ITN-2014,MSCA-ITN-ETN,QUEEN MARY UNIVERSITY OF LONDON,UK,ESI SOFTWARE GERMANY GMBH;ROLLS-ROYCE DEUTSCHL...,DE;BE;EL;FR;UK;IT,


As only the FP4 data come in capitalised columns with multiple words, we'll turn them to lowercase and change spaces to underscores. We will also rename some columns, to be in tune with the other framework programs.

In [11]:
dataFP4.columns = map(str.lower, dataFP4.columns)
dataFP4.columns = map(lambda x: x.replace(' ', '_'), dataFP4.columns)
dataFP4.rename(columns = {
        'project_title': 'title',
        'objectives': 'objective',
        'subject': 'subjects' 
    }, inplace=True)
dataFP4.columns

Index(['rcn', 'title', 'start_date', 'end_date', 'duration', 'status',
       'contract_number', 'keywords', 'date_of_signature', 'total_cost',
       'total_funding', 'project_website', 'project_call', 'project_acronym',
       'general_information', 'achievements', 'objective', 'activity_area',
       'contract_type', 'subjects', 'framework_programme', 'pga',
       'coordinator_country', 'contractor_country'],
      dtype='object')

Now we'll get a view on the dataframe.

In [12]:
dataFP4['framework_programme'] = 'FP4'
df4 = dataFP4[['rcn', 'title', 'objective', 'subjects', 'framework_programme']]
df4.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,29005,Spot IV-V?g?tation,,Environmental Protection; Forecasting; Meteoro...,FP4
1,30802,Formation and occurrence of nitrous acd in the...,%LTo understand the mechanisms leading to the ...,Environmental Protection; Forecasting; Measure...,FP4
2,31031,Process for Production of Light Olefins by Deh...,,Industrial Manufacture; Materials Technology,FP4
3,30803,High resolution diode laser carbon dioxide env...,%LTo develop a new instrument for measuring at...,Environmental Protection; Measurement Methods;...,FP4
4,31004,Subsurface Radar as a Tool for Non-destructive...,,Industrial Manufacture; Materials Technology; ...,FP4


Same with the FP5 data.

In [13]:
dataFP5.rename(columns = {
        'frameworkProgramme': 'framework_programme'
    }, inplace=True)
dataFP5['framework_programme'] = 'FP5'
df5 = dataFP5[['rcn', 'title', 'objective', 'subjects', 'framework_programme']]
df5.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,64570,Genetic diversity in agriculture: temporal flu...,The overall objective of this project is to de...,ECO;SEA;LIF;ENV;AGR,FP5
1,64192,Sensing and controlling single molecules by no...,This project concerns controlling and sensing ...,BIO;LIF;ENV;MED;WAS;ITT,FP5
2,61977,Transduction mechanisms for non-noxious and no...,,,FP5
3,54932,Portable measurement systems for atmospheric p...,The primary objective of the proposed project ...,SEA;MET;ENV;FOR,FP5
4,56044,Benthic primary production - carbon cycling an...,,,FP5


Then with FP6 data.

In [14]:
dataFP6.rename(columns = {
        'frameworkProgramme': 'framework_programme'
    }, inplace=True)
dataFP6['framework_programme'] = 'FP6'
df6 = dataFP6[['rcn', 'title', 'objective', 'subjects', 'framework_programme']]
df6.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,71920,Amigo Ambient Intelligence for the networked h...,The networked home environment leads to many n...,IPS,FP6
1,85502,Genetic component of the low dose risk of thyr...,Cancer of the non-medullary (follicular epithe...,BIO;RAD,FP6
2,74968,European food information resource network,EuroFIR will form a world-leading collaboratio...,IPS;FOO,FP6
3,74155,Global allergy and asthma european network,Allergic diseases and asthma pose an important...,SEA;LIF;MED;FOO;AGR,FP6
4,74297,Advanced Protection Systems (APROSYS),The IP on Advanced Protective Systems (APROSYS...,,FP6


And with FP7.

In [15]:
dataFP7.rename(columns = {
        'frameworkProgramme': 'framework_programme'
    }, inplace=True)
dataFP7['framework_programme'] = 'FP7'
df7 = dataFP7[['rcn', 'title', 'objective', 'subjects', 'framework_programme']]
df7.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,110629,ALFRED - Personal Interactive Assistant for In...,***Personal Interactive Assistant for Independ...,INF,FP7
1,104117,Microbial Biomarker Records in Tibetan Peats: ...,It is crucial to understand terrestrial microb...,SCI,FP7
2,188177,Post-glacial recolonisation and Holocene anthr...,"At the end of last glaciation, ca. 15 000 cal....",,FP7
3,188066,Molecular Mechanisms Employed by the Newly Ass...,Posttranscriptional gene regulation is an esse...,,FP7
4,187919,Identifying the targets and mechanism of actio...,The Ubiquitin (UB) and SUMO modification pathw...,,FP7


And finally with H2020.

In [16]:
dataH2020.rename(columns = {
        'frameworkProgramme': 'framework_programme'
    }, inplace=True)
dataH2020['framework_programme'] = 'H2020'
df20 = dataH2020[['rcn', 'title', 'objective', 'subjects', 'framework_programme']]
df20.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,193982,Carbon Cascades from Land to Ocean in the Anth...,C-CASCADES will produce a new generation of yo...,,H2020
1,193979,Applied mathematics for risk measures in finan...,The EID WAKEUPCALL has been set up with the kn...,,H2020
2,193971,BigStorage: Storage-based Convergence between ...,'The consortium of this European Training Netw...,,H2020
3,193970,Perception and Action in Complex Environments,The PACE research and training programme sits ...,,H2020
4,193969,Industrial optimal design using adjoint CFD,Adjoint-based methods have become the most int...,,H2020


# Getting all Objectives

We will get all the objectives from all the projects of all the calls and place them in a single column.

In [18]:
all_objectives = pd.concat([ df['objective'] for df in [df4, df5, df6, df7, df20] ])
all_objectives.shape

(76522,)

In [19]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import string

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()


In [20]:
# Remove common words
en_stop.extend(['will', 'develop', 'project', 'research', 'new', 'use', 'european'])

texts_1 = []

for text in all_objectives:
    
    text = text.lower()
    
    # Remove punctuation & numbers
    exclude_punctuation = list(string.punctuation)
    numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
    exclude_punctuation.extend(numbers)
    
    if text != 'nan':
        text = text.replace('%l', '')
        for c in exclude_punctuation:
            text = text.replace(c, " ")
        
        # tokenize document string
        tokens = tokenizer.tokenize(text)
        
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        
        stopped_tokens = [i for i in stopped_tokens if not i in ['develop', 'understand', 'model', 'eu', 'europe']]
        
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        
        # add tokens to list
        texts_1.append(stemmed_tokens)

KeyboardInterrupt: 

In [88]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts_1)

NameError: name 'corpora' is not defined

In [14]:
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts_1]

In [15]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word = dictionary)

print(ldamodel.print_topics(num_topics=5, num_words=5))

[(0, u'0.014*product + 0.012*energi + 0.012*market + 0.009*technolog + 0.008*develop'), (1, u'0.009*innov + 0.007*research + 0.007*develop + 0.006*train + 0.006*activ'), (2, u'0.010*studi + 0.006*theori + 0.006*chang + 0.006*propos + 0.006*model'), (3, u'0.014*system + 0.009*technolog + 0.008*develop + 0.008*applic + 0.007*base'), (4, u'0.015*cell + 0.007*function + 0.006*diseas + 0.006*studi + 0.006*use')]
