In [140]:
import os
import pickle
import sys

import pandas as pd
import requests

sys.path.append('../src')
from utils import *

In [141]:
# Base endpoint of the Google Maps Geocoding API (JSON output). The query
# parameters (address, key) are supplied separately by the code that calls it.
GOOGLE_MAPS_API_URL = 'https://maps.googleapis.com/maps/api/geocode/json?'

def request_url(url, timeout=30, verify=False):
    """Fetch a URL and return the body of the response as text.

    A browser-like User-Agent header is sent because the library's default
    user agent is usually blocked.

    Args:
        url (str): It should contain a valid URL.
        timeout (float): Seconds to wait for the server before giving up.
        verify (bool): Whether to verify the server's TLS certificate. The
            default (False) keeps the historical behaviour of this function.

    Return:
        str: The decoded body of the response (``response.text``), not the
            response object itself.

    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'}

    # NOTE(review): with verify=False the server certificate is not checked, so
    # the connection can be intercepted. Pass verify=True unless the target
    # host is known to have a broken certificate.
    response = requests.get(url, headers=headers, verify=verify, timeout=timeout)
    return response.text

def geolocate(data, api_key=None, timeout=30):
    """Geocode a set of free-text queries with the Google Maps Geocoding API.

    One HTTP request is made per entry of ``data`` and only the first result
    returned by the API is used.

    Args:
        data (dict): Maps an identifier to the text to geocode (a postcode or
            an address).
        api_key (str, optional): Google Maps API key. When omitted, it is read
            from the ``GOOGLE_MAPS_API_KEY`` environment variable, so that the
            credential never has to be stored in the notebook.
        timeout (float): Seconds to wait for each request before giving up.

    Return:
        tuple: ``(locations, failed_calls)``. ``locations`` maps each
            identifier to a dict with the keys 'lat', 'lng' and 'address' plus,
            when present in the result, 'postal_town',
            'administrative_area_level_2', 'administrative_area_level_1',
            'country' and 'route'. ``failed_calls`` is the list of query texts
            that could not be geocoded.

    Raises:
        ValueError: If no API key is given and the environment variable is
            not set.

    """
    if api_key is None:
        api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
    if not api_key:
        raise ValueError(
            'No Google Maps API key: pass api_key or set the '
            'GOOGLE_MAPS_API_KEY environment variable.'
        )

    # Address components copied into the output, matched against the first
    # type listed for each component.
    wanted_components = (
        'postal_town',
        'administrative_area_level_2',
        'administrative_area_level_1',
        'country',
        'route',
    )

    locations = {}
    failed_calls = []

    for id_, postcode in data.items():
        # define the parameters of the search
        params = {
            'address': '{}'.format(postcode),
            'key': api_key,
        }
        try:
            # Do the request and get the response data
            req = requests.get(GOOGLE_MAPS_API_URL, params=params, timeout=timeout)
            res = req.json()

            # Use the first result; an empty result list raises IndexError and
            # the query is recorded as failed.
            result = res['results'][0]

            # store these attributes
            geodata = {
                'lat': result['geometry']['location']['lat'],
                'lng': result['geometry']['location']['lng'],
                'address': result['formatted_address'],
            }

            for output in result['address_components']:
                component_type = output['types'][0]
                if component_type in wanted_components:
                    geodata[component_type] = output['long_name']
        except Exception:
            # Deliberately best-effort: a network error, a malformed response
            # or a query without results must not abort the whole batch.
            failed_calls.append(postcode)
            continue

        locations[id_] = geodata

    return locations, failed_calls

In [142]:
# Search results to geolocate. The cells below split this table by its
# `dataset_id` column ('H2020', 'innovateuk', 'gtr').
sr = pd.read_csv('../data/interim/df_search_results.csv')

Each dataset is geolocated in a different way, depending on the input it provides (addresses, postcodes, etc.), so each one requires a bit of wrangling first.

## H2020 data

In [143]:
# 'Participant Name' may hold several names separated by ';'. Entries that are
# not strings (e.g. missing values) are skipped before splitting.
participants = flatten_lists(
    [
        joined_names.split(';')
        for joined_names in sr['Participant Name']
        if isinstance(joined_names, str)
    ]
)

In [144]:
# Cordis data with all the participants. Rows without a street, city or country
# are dropped because all three are needed to build the geocoding query.
cordis_org = (
    pd.read_csv('../data/raw/cordis-h2020organizations.csv', sep=';')
    .dropna(subset=['street', 'city', 'country'])
)

# keep subset with participants. `assign` builds a new frame, so the string
# version of projectID is not written into a filtered slice
# (which triggers SettingWithCopyWarning).
cordis_org = (
    cordis_org[cordis_org['name'].isin(participants)]
    .assign(project_id=lambda orgs: orgs['projectID'].map(str))
    .reset_index(drop=True)
)

In [145]:
# Sanity check: (rows, columns) left after filtering the Cordis organisations.
cordis_org.shape

(9186, 24)

In [146]:
# queries for h2020: one "<country> <street> <city>" string per organisation,
# keyed by the organisation `id` (a repeated id keeps its last query).
h2020_queries = {
    org_id: ' '.join([country, street, city])
    for org_id, country, street, city in zip(
        cordis_org['id'],
        cordis_org['country'],
        cordis_org['street'],
        cordis_org['city'],
    )
}

In [147]:
# Geocode every H2020 query (one HTTP request per entry, so this cell is slow).
# `failed_calls` collects the query strings that could not be geocoded.
locations, failed_calls = geolocate(h2020_queries)

In [68]:
# One row per geocoded organisation. The dict keys become an 'id' column and
# the remaining row labels are turned into strings.
location_data = (
    pd.DataFrame.from_dict(locations, orient='index')
    .reset_index()
    .rename(index=str, columns={'index': 'id'})
)

In [69]:
# merge organisations from cordis with location data. This is an inner join on
# `id`, so organisations without a geocoding result are dropped.
geolocated_cordis = pd.merge(cordis_org, location_data, on='id')

## Geolocate InnovateUK

In [70]:
# Keep only the InnovateUK rows and renumber them from 0.
sr_innovateuk = sr[sr.dataset_id == 'innovateuk'].reset_index(drop=True)

In [72]:
# queries for InnovateUK: participant name -> postcode. Rows with a missing
# postcode are skipped because there is nothing to geocode for them.
innovateuk_queries = {
    participant: str(postcode)
    for participant, postcode in zip(
        sr_innovateuk['Participant Name'], sr_innovateuk['Postcode']
    )
    if pd.notnull(postcode)
}

In [74]:
# Geocode the InnovateUK postcodes (one HTTP request per participant).
# `failed_calls_iuk` collects the postcodes that could not be geocoded.
location_data_iuk, failed_calls_iuk = geolocate(innovateuk_queries)

In [75]:
# Turn the {participant name: geodata} dict into a table with the names in an
# 'id' column. NOTE: the name `location_data_iuk` is rebound from a dict to a
# DataFrame here, so this cell cannot be re-run on its own.
location_data_iuk = (
    pd.DataFrame.from_dict(location_data_iuk, orient='index')
    .reset_index()
    .rename(index=str, columns={'index': 'id'})
)

In [76]:
# Attach the coordinates to the InnovateUK rows by participant name. This is an
# inner join, so participants without a geocoding result are dropped.
geolocated_innovateuk = sr_innovateuk.merge(location_data_iuk, left_on='Participant Name', right_on='id')

## Geolocate GtR

In [120]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('../data/raw/org_lat_lng_dict.p', 'rb') as h:
    # presumably a dict keyed by organisation name (the next cell turns its
    # keys into a 'name' column) -- confirm against the file's producer
    gtr_lat_lon = pickle.load(h)

In [121]:
# Reshape the pickled mapping into a table whose 'name' column holds the keys.
gtr_lat_lon = (
    pd.DataFrame.from_dict(gtr_lat_lon, orient='index')
    .reset_index()
    .rename(index=str, columns={'index': 'name'})
)

# keep only GtR data and attach the coordinates by participant name
sr_gtr = (
    sr[sr.dataset_id == 'gtr']
    .reset_index(drop=True)
    .merge(gtr_lat_lon, left_on='Participant Name', right_on='name')
)

## Bring them together

In [125]:
# Final wrangling for H2020: join the search results with the geolocated
# organisations of the same project, then keep the output columns only.
sr_h2020 = (
    sr[sr.dataset_id == 'H2020']
    .merge(geolocated_cordis, on='project_id')
    .loc[:, [
        'Participant Name',
        'Grant Offered (£)',
        'Project Start Date',
        'Project Title',
        'Public Description',
        'project_id',
        'dataset_id',
        'lat',
        'lng',
        'paragraph vectors',
    ]]
)

In [128]:
# Stack the three geolocated datasets row-wise. Columns missing from a dataset
# are filled with NaN in its rows.
df_geo_results = pd.concat(
    [
        sr_gtr,
        geolocated_innovateuk.loc[:, [
            'lng',
            'lat',
            'Public Description',
            'Project Start Date',
            'paragraph vectors',
            'Participant Name',
            'Grant Offered (£)',
            'dataset_id',
            'project_id',
        ]],
        sr_h2020,
    ],
    axis=0,
)

In [129]:
# Sanity check: (rows, columns) of the combined geolocated table.
df_geo_results.shape

(856, 16)

In [131]:
# Persist the combined table. The row index is written as the first column; it
# is not unique because the frames were concatenated with their own indexes.
df_geo_results.to_csv('../data/processed/df_geo_results.csv')