In [2]:
import pandas as pd
import numpy as np
import re
import gensim
# import stop_words

from gensim import corpora
from gensim import models
from gensim.corpora.dictionary import Dictionary

from gensim.parsing.preprocessing import STOPWORDS

from time import time

import string

import logging
# Configure the root logger once for the whole notebook: INFO and above,
# rendered as "<LEVEL> : <message>".
logging.basicConfig(
    format='%(levelname)s : %(message)s',
    level=logging.INFO,
)

# Reading the data

The data is stored in the following files, one per framework programme (the links point to the original sources in the [European Union Open Data Portal](https://data.europa.eu/euodp/en/data/)):
  * [`EU_raw_data/cordisfp4complete.csv`](http://cordis.europa.eu/data/cordisfp4complete.csv)
  * [`EU_raw_data/cordis-fp5projects.csv`](http://cordis.europa.eu/data/cordis-fp5projects.csv)
  * [`EU_raw_data/cordis-fp6projects.csv`](http://cordis.europa.eu/data/cordis-fp6projects.csv)
  * [`EU_raw_data/cordis-fp7projects.csv`](http://cordis.europa.eu/data/cordis-fp7projects.csv)
  * [`EU_raw_data/cordis-h2020projects.csv`](http://cordis.europa.eu/data/cordis-h2020projects.csv)
  
We read each of them in turn. Note that we will be using the `objective` column, which has several empty values, so we have to ensure that it is always read as a string (otherwise pandas would load the empty cells as `NaN`).

In [5]:
# Load the FP4 export (semicolon-delimited). The objective text has empty
# cells, so it is forced through `str` to come back as '' instead of NaN.
# The header displayed for this file spells the column `objective`, whereas
# the converter used to be keyed on `Objectives` only and therefore never
# applied. Both spellings are registered so the conversion takes effect
# whichever header the file on disk carries; pandas ignores converter keys
# that match no column.
dataFP4 = pd.read_csv(
    "EU_raw_data/cordisfp4complete.csv",
    sep=";",
    converters={'Objectives': str, 'objective': str},
)
dataFP4.head(2)

Unnamed: 0,rcn,title,Start Date,End Date,Duration,Status,Contract Number,Keywords,Date of Signature,Total Cost,...,General Information,Achievements,objective,Activity Area,Contract Type,Subject,Framework Programme,PGA,Coordinator Country,Contractor Country
0,29005,Spot IV-V?g?tation,01/04/1995,31/07/1997,,Completed,ENV4950001,,,,...,,,,Research and development work for potential fu...,CSC - Cost-sharing contracts,Environmental Protection; Forecasting; Meteoro...,Fourth Framework Programme,FP4-ENV 2C,FR,
1,30802,Formation and occurrence of nitrous acd in the...,01/02/1996,31/07/1998,,Completed,ENV4950055,,,,...,%L Nitrous acid is of particular importance in...,,%LTo understand the mechanisms leading to the ...,Tropospheric physics and chemistry,CSC - Cost-sharing contracts,Environmental Protection; Forecasting; Measure...,Fourth Framework Programme,FP4-ENV 2C,IT,CH; DE; DE; DE; DE; DK; FR; GB; GR


In [6]:
# FP5 projects: semicolon-delimited; `objective` is passed through `str` so
# blank entries are kept as text.
dataFP5 = pd.read_csv(
    "EU_raw_data/cordis-fp5projects.csv",
    sep=";",
    converters={'objective': str},
)
dataFP5.head(2)

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,64570,QLK5-CT-2001-00934,GEDIFLUX,,FP5-LIFE QUALITY,1.1.1.-5.,FP5-LIFE QUALITY,Genetic diversity in agriculture: temporal flu...,2001-10-01,2004-09-30,...,The overall objective of this project is to de...,3039812.0,1846635.0,,CSC,NIAB,UK,JOHN INNES CENTRE;INSTITUTE OF PLANT GENETICS ...,UK;DE;FR;NL,ECO;SEA;LIF;ENV;AGR
1,64192,QLK3-CT-2001-00278,NANOCELL,,FP5-LIFE QUALITY,1.1.1.-3.,FP5-LIFE QUALITY,Sensing and controlling single molecules by no...,2002-01-01,2004-12-31,...,This project concerns controlling and sensing ...,2633658.0,1853271.0,,CSC,GOETEBORG UNIVERSITY,SE,UNIVERSITY OF GLASGOW;FRAUNHOFER IAF;LGC LIMIT...,UK;DE;SE;CH,BIO;LIF;ENV;MED;WAS;ITT


In [7]:
# FP6 projects: semicolon-delimited; `objective` is passed through `str` so
# blank entries are kept as text.
dataFP6 = pd.read_csv(
    "EU_raw_data/cordis-fp6projects.csv",
    sep=";",
    converters={'objective': str},
)
dataFP6.head(2)

Unnamed: 0,rcn,reference,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,71920,4182,AMIGO,,FP6-IST,IST-2002-2.3.2.3,,Amigo Ambient Intelligence for the networked h...,2004-09-01,2008-02-29,...,The networked home environment leads to many n...,2403487100,1296000000,FP6-2003-IST-2,IP,PHILIPS ELECTRONICS NEDERLAND B.V.,NL,SINGKIOULAR LOTZIK ANONYMI ETAIRIA PLIROFORIAK...,EL;NL;FR;IT;ES;DE;FI,IPS
1,85502,36495,GENRISK-T,,FP6-EURATOM-RADPROT,RAD PROT-2005/6-3.3.1.1-2,,Genetic component of the low dose risk of thyr...,2006-12-01,2010-09-30,...,Cancer of the non-medullary (follicular epithe...,4168377,2765453,EURATOM-2005-6-FIXEDDEADLINE,STREP,HELMHOLTZ ZENTRUM MUENCHEN DEUTSCHES FORSCHUNG...,DE,COMMISSARIAT A L'ENERGIE ATOMIQUE (CEA);UNIVER...,FR;UK;PL;BE;IT;ES;DE,BIO;RAD


In [8]:
# FP7 projects: semicolon-delimited; `objective` is passed through `str` so
# blank entries are kept as text.
dataFP7 = pd.read_csv(
    "EU_raw_data/cordis-fp7projects.csv",
    sep=";",
    converters={'objective': str},
)
dataFP7.head(2)

Unnamed: 0,rcn,reference,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,110629,611218,ALFRED,,FP7-ICT,ICT-2013.5.1,FP7,ALFRED - Personal Interactive Assistant for In...,2013-10-01,2016-09-30,...,***Personal Interactive Assistant for Independ...,444074100,342357300,FP7-ICT-2013-10,CP,ASCORA GMBH,DE,TALKAMATIC AB;STICHTING NATIONAAL OUDERENFONDS...,SE;NL;DE;ES;FR,INF
1,104117,911409,TIBETMETH,ONG,FP7-PEOPLE,FP7-PEOPLE-2011-IIF,FP7,Microbial Biomarker Records in Tibetan Peats: ...,2015-10-01,2016-09-30,...,It is crucial to understand terrestrial microb...,15000,15000,FP7-PEOPLE-2011-IIF,MC-IIFR,NORTHWEST UNIVERSITY,CN,,,SCI


In [9]:
# H2020 projects: semicolon-delimited; `objective` is passed through `str` so
# blank entries are kept as text.
dataH2020 = pd.read_csv(
    "EU_raw_data/cordis-h2020projects.csv",
    sep=";",
    converters={'objective': str},
)
dataH2020.head(2)

Unnamed: 0,rcn,reference,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,193982,643052,C-CASCADES,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2014-ETN,H2020,Carbon Cascades from Land to Ocean in the Anth...,2015-01-01,2018-12-31,...,C-CASCADES will produce a new generation of yo...,312573348,312573348,H2020-MSCA-ITN-2014,MSCA-ITN-ETN,UNIVERSITE LIBRE DE BRUXELLES,BE,THE UNIVERSITY OF EXETER;UPPSALA UNIVERSITET;M...,UK;SE;DE;FR;NL,
1,193979,643045,WAKEUPCALL,SIGNED,H2020-EU.1.3.1.,MSCA-ITN-2014-EID,H2020,Applied mathematics for risk measures in finan...,2015-01-01,2018-12-31,...,The EID WAKEUPCALL has been set up with the kn...,152261712,152261712,H2020-MSCA-ITN-2014,MSCA-ITN-EID,STICHTING CENTRUM VOOR WISKUNDE EN INFORMATICA,NL,VORTECH BV;ANALISTAS FINANCIEROS INTERNACIONAL...,NL;ES;IT;UK,


Only the FP4 data come with capitalised, multi-word column names, so we'll convert them to lowercase and replace spaces with underscores. We will also rename some columns to match the other framework programmes.

In [10]:
# Normalise the FP4 headers in one pass: lowercase every label, then swap
# spaces for underscores.
dataFP4.columns = dataFP4.columns.str.lower().str.replace(' ', '_')

# Align the FP4-specific labels with the names used by the later programmes.
# `rename` skips any key that is not present in the frame.
dataFP4.rename(
    columns={'project_title': 'title', 'objectives': 'objective', 'subject': 'subjects'},
    inplace=True,
)
dataFP4.columns

Index([u'rcn', u'title', u'start_date', u'end_date', u'duration', u'status',
       u'contract_number', u'keywords', u'date_of_signature', u'total_cost',
       u'total_funding', u'project_website', u'project_call',
       u'project_acronym', u'general_information', u'achievements',
       u'objective', u'activity_area', u'contract_type', u'subjects',
       u'framework_programme', u'pga', u'coordinator_country',
       u'contractor_country'],
      dtype='object')

Now we'll tag every row with its framework programme, keep only the columns we need, and save the result.

In [11]:
# Replace the verbose programme label with one uniform short tag.
dataFP4['framework_programme'] = 'FP4'

# Keep only the fields shared by every programme, then persist them as a
# semicolon-separated UTF-8 file.
df4 = dataFP4[[
    'rcn',
    'title',
    'objective',
    'subjects',
    'framework_programme',
]]
df4.to_csv("dataset/euFP4.csv", sep=';', encoding='utf-8')
df4.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,29005,Spot IV-V?g?tation,,Environmental Protection; Forecasting; Meteoro...,FP4
1,30802,Formation and occurrence of nitrous acd in the...,%LTo understand the mechanisms leading to the ...,Environmental Protection; Forecasting; Measure...,FP4
2,31031,Process for Production of Light Olefins by Deh...,,Industrial Manufacture; Materials Technology,FP4
3,30803,High resolution diode laser carbon dioxide env...,%LTo develop a new instrument for measuring at...,Environmental Protection; Measurement Methods;...,FP4
4,31004,Subsurface Radar as a Tool for Non-destructive...,,Industrial Manufacture; Materials Technology; ...,FP4


Same with the FP5 data.

In [12]:
# Use the same snake_case programme column name as the FP4 frame.
dataFP5.rename(columns={'frameworkProgramme': 'framework_programme'}, inplace=True)
# Overwrite the column with one uniform short tag.
dataFP5['framework_programme'] = 'FP5'

# Keep only the fields shared by every programme, then persist them as a
# semicolon-separated UTF-8 file.
df5 = dataFP5[[
    'rcn',
    'title',
    'objective',
    'subjects',
    'framework_programme',
]]
df5.to_csv("dataset/euFP5.csv", sep=';', encoding='utf-8')
df5.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,64570,Genetic diversity in agriculture: temporal flu...,The overall objective of this project is to de...,ECO;SEA;LIF;ENV;AGR,FP5
1,64192,Sensing and controlling single molecules by no...,This project concerns controlling and sensing ...,BIO;LIF;ENV;MED;WAS;ITT,FP5
2,61977,Transduction mechanisms for non-noxious and no...,,,FP5
3,54932,Portable measurement systems for atmospheric p...,The primary objective of the proposed project ...,SEA;MET;ENV;FOR,FP5
4,56044,Benthic primary production - carbon cycling an...,,,FP5


Then with FP6 data.

In [13]:
# Use the same snake_case programme column name as the FP4 frame.
dataFP6.rename(columns={'frameworkProgramme': 'framework_programme'}, inplace=True)
# Overwrite the column with one uniform short tag.
dataFP6['framework_programme'] = 'FP6'

# Keep only the fields shared by every programme, then persist them as a
# semicolon-separated UTF-8 file.
df6 = dataFP6[[
    'rcn',
    'title',
    'objective',
    'subjects',
    'framework_programme',
]]
df6.to_csv("dataset/euFP6.csv", sep=';', encoding='utf-8')
df6.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,71920,Amigo Ambient Intelligence for the networked h...,The networked home environment leads to many n...,IPS,FP6
1,85502,Genetic component of the low dose risk of thyr...,Cancer of the non-medullary (follicular epithe...,BIO;RAD,FP6
2,74968,European food information resource network,EuroFIR will form a world-leading collaboratio...,IPS;FOO,FP6
3,74155,Global allergy and asthma european network,Allergic diseases and asthma pose an important...,SEA;LIF;MED;FOO;AGR,FP6
4,74297,Advanced Protection Systems (APROSYS),The IP on Advanced Protective Systems (APROSYS...,,FP6


And with FP7.

In [14]:
# Use the same snake_case programme column name as the FP4 frame.
dataFP7.rename(columns={'frameworkProgramme': 'framework_programme'}, inplace=True)
# Overwrite the column with one uniform short tag.
dataFP7['framework_programme'] = 'FP7'

# Keep only the fields shared by every programme, then persist them as a
# semicolon-separated UTF-8 file.
df7 = dataFP7[[
    'rcn',
    'title',
    'objective',
    'subjects',
    'framework_programme',
]]
df7.to_csv("dataset/euFP7.csv", sep=';', encoding='utf-8')
df7.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,110629,ALFRED - Personal Interactive Assistant for In...,***Personal Interactive Assistant for Independ...,INF,FP7
1,104117,Microbial Biomarker Records in Tibetan Peats: ...,It is crucial to understand terrestrial microb...,SCI,FP7
2,188177,Post-glacial recolonisation and Holocene anthr...,"At the end of last glaciation, ca. 15 000 cal....",,FP7
3,188066,Molecular Mechanisms Employed by the Newly Ass...,Posttranscriptional gene regulation is an esse...,,FP7
4,187919,Identifying the targets and mechanism of actio...,The Ubiquitin (UB) and SUMO modification pathw...,,FP7


And finally with H2020.

In [15]:
# Use the same snake_case programme column name as the FP4 frame.
dataH2020.rename(columns={'frameworkProgramme': 'framework_programme'}, inplace=True)
# Overwrite the column with one uniform short tag.
dataH2020['framework_programme'] = 'H2020'

# Keep only the fields shared by every programme, then persist them as a
# semicolon-separated UTF-8 file.
df20 = dataH2020[[
    'rcn',
    'title',
    'objective',
    'subjects',
    'framework_programme',
]]
df20.to_csv("dataset/euH2020.csv", sep=';', encoding='utf-8')
df20.head()

Unnamed: 0,rcn,title,objective,subjects,framework_programme
0,193982,Carbon Cascades from Land to Ocean in the Anth...,C-CASCADES will produce a new generation of yo...,,H2020
1,193979,Applied mathematics for risk measures in finan...,The EID WAKEUPCALL has been set up with the kn...,,H2020
2,193971,BigStorage: Storage-based Convergence between ...,'The consortium of this European Training Netw...,,H2020
3,193970,Perception and Action in Complex Environments,The PACE research and training programme sits ...,,H2020
4,193969,Industrial optimal design using adjoint CFD,Adjoint-based methods have become the most int...,,H2020
