In [1]:
import pickle
import requests
import pandas as pd
import numpy as np

In [2]:
def request_url(url, verify=False, timeout=30):
    """Fetch a URL and return the body of the response as text.

    A browser-like User-Agent header is sent because the HTTP client's default
    user agent is usually blocked.

    Args:
        url (str): It should contain a valid URL.
        verify (bool or str): Forwarded to ``requests.get``. Defaults to False,
            which keeps the original behaviour of skipping TLS certificate
            verification.
        timeout (float): Seconds to wait for the server before giving up.
            Defaults to 30.

    Return:
        str: The decoded body of the response (``response.text``). The HTTP
        status code is not checked, so error pages are returned as text too.

    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'
    }

    # NOTE(review): verify=False disables certificate checking; pass verify=True
    # whenever the target host has a valid certificate.
    response = requests.get(url, headers=browser_headers, verify=verify, timeout=timeout)
    return response.text

# Read data

Reading three data sources:

1. Gateway to Research
2. InnovateUK
3. H2020

The goal is to preprocess these datasets, keep only the instances with abstracts and reduce the feature space to funding, names, dates and descriptions.

## GtR data

In [3]:
# Load the Gateway to Research (GtR) projects/organisations table.
gtr_data = pd.read_csv('../data/raw/gtr_projects_orgs.csv')

# Columns to discard: topic columns generated by a previous analysis (prefixed
# with 'tc'), leftover index columns from earlier CSV round-trips, and 'is_wales'.
unused_columns = [c for c in gtr_data.columns if c.startswith('tc')]
unused_columns += ['Unnamed: 0', 'Unnamed: 0.1', 'is_wales']

# Built as a single non-mutating chain so no column is assigned on a filtered
# slice and re-running the cell always starts from the freshly read file.
gtr_data = (
    gtr_data
    # Keep unique projects for modelling
    .drop_duplicates('project_id')
    # Keep only the projects that have an abstract
    .loc[lambda frame: frame['abstract_texts'].notnull()]
    .drop(unused_columns, axis=1)
    # Create a dataset ID
    .assign(dataset_id='gtr')
    # Index by project so the titles can be joined on it later
    .set_index('project_id')
)

In [4]:
# (rows, columns) left after de-duplicating projects and dropping rows without an abstract
gtr_data.shape

(42572, 23)

The GtR data did not include project titles, so we collected them through the GtR API.

**WARNING**: this takes a long time to run.

In [5]:
# One-off scrape, kept commented out: for every project in gtr_data's index it
# requests the page and extracts the text of the first <ns2:title> element.
# NOTE(review): uncommenting this needs `import re`, which the imports cell at the
# top does not include; the counter `c` is incremented but never read.
# d = {}
# c = 1
# for id_ in gtr_data.index:
#     r = request_url(id_)
#     try:
#         d[id_] = re.findall('<ns2:title>(.*?)</ns2:title>', r)[0]
#     except Exception as e:
#         print(id_, e)
#     c += 1

In [6]:
# import pickle
# with open('../data/raw/gtr_titles.pickle', 'wb') as h:
#     pickle.dump(d, h)

In [7]:
# Load the cached project-id -> title mapping written by the commented-out cells
# above, so the slow scrape does not have to be repeated.
# NOTE(review): pickle.load can execute arbitrary code; only load this file if it
# was produced locally or comes from a trusted source.
with open('../data/raw/gtr_titles.pickle', 'rb') as h:
    d = pickle.load(h)

In [8]:
# Attach the scraped titles to the GtR projects. Both frames are joined on their
# index with pandas' default (inner) merge, so projects without a scraped title
# are dropped.
gtr_titles = pd.DataFrame.from_dict(d, orient='index')  # single column labelled 0
gtr_data = gtr_data.merge(gtr_titles, left_index=True, right_index=True).reset_index()

# Harmonise the column names with the ones used by the InnovateUK spreadsheet.
# NOTE(review): 'level_0' presumably covers the case where reset_index labels the
# restored index column that way - confirm it is still needed.
gtr_column_names = {
    0: 'Project Title',
    'value_pounds': 'Grant Offered (£)',
    'abstract_texts': 'Public Description',
    'start_year': 'Project Start Date',
    'level_0': 'project_id',
    'name': 'Participant Name',
}

# index=str also converts the row labels to strings, as in the original cell.
gtr_data = gtr_data.rename(index=str, columns=gtr_column_names)

## InnovateUK data

In [15]:
# InnovateUK: read the funded-projects spreadsheet, keep one row per project
# number, tag the rows with their source and expose the project number under
# the shared 'project_id' name (row labels are converted to strings).
innovateuk = (
    pd.read_excel('../data/raw/Innovate_UK_funded_projects_from_2004_to_1_March_2018.xlsx')
    .drop_duplicates('Project Number')
    .assign(dataset_id='innovateuk')
    .rename(index=str, columns={'Project Number': 'project_id'})
)

## H2020 data

In [16]:
# Map the H2020 export's column names onto the shared naming scheme.
# NOTE(review): 'totalCost' is relabelled as a (£) amount without any conversion -
# confirm the currency of the source column.
h2020_column_names = {
    'participants': 'Participant Name',
    'totalCost': 'Grant Offered (£)',
    'startDate': 'Project Start Date',
    'title': 'Project Title',
    'objective': 'Public Description',
    'id': 'project_id',
}

# The only columns carried forward into the merged dataset.
h2020_columns = ['Participant Name', 'Grant Offered (£)', 'Project Start Date',
                 'Project Title', 'Public Description', 'project_id', 'dataset_id']

# The file is semicolon-delimited.
# NOTE(review): no decimal separator is configured; verify numeric columns parse
# as numbers rather than strings.
h2020 = (
    pd.read_csv('../data/raw/cordis-h2020projects.csv', sep=';')
    .rename(index=str, columns=h2020_column_names)
    .assign(dataset_id='H2020')
)
h2020 = h2020[h2020_columns]

## Merging the datasets

In [17]:
# Columns taken from each source; a location column that only one source
# provides (LAD13NM for GtR, Postcode for InnovateUK) is left missing for the
# rows of the other sources.
gtr_columns = ['Participant Name', 'Grant Offered (£)', 'Public Description', 'Project Title',
               'Project Start Date', 'LAD13NM', 'dataset_id', 'project_id']
innovateuk_columns = ['Project Title', 'Public Description', 'Project Start Date', 'Participant Name',
                      'Postcode', 'Grant Offered (£)', 'dataset_id', 'project_id']

# Stack the three sources row-wise; h2020 was already reduced to the shared columns.
df = pd.concat([gtr_data[gtr_columns], innovateuk[innovateuk_columns], h2020], axis=0)

In [18]:
# (rows, columns) of the combined GtR + InnovateUK + H2020 table
df.shape

(74861, 9)

In [19]:
# Number of missing values in each column of the combined table
df.isnull().sum()

Grant Offered (£)       959
LAD13NM               32302
Participant Name       9482
Postcode              57414
Project Start Date      949
Project Title             0
Public Description        0
dataset_id                0
project_id                0
dtype: int64

In [20]:
# Replace the row labels inherited from the three sources with a fresh 0..n-1 index.
df = df.reset_index(drop=True)

In [21]:
# Persist the combined table for the later stages of the project.
# NOTE(review): the row index is written as well (no index=False), so a plain
# pd.read_csv of this file yields an extra 'Unnamed: 0' column - confirm whether
# downstream code relies on that before changing it.
df.to_csv('../data/interim/research_grants.csv')