# Funded Data Extraction for UKRI

In [1]:
import os
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import requests
from tqdm import tqdm

In [2]:
class MySQLPipline(object):
    """Thin wrapper around one pymysql connection and one shared cursor.

    The connection is opened in ``__init__`` with autocommit switched on, so
    every statement is committed as soon as it executes. Call ``close_Conn``
    when finished to release the cursor and the connection.
    """

    def __init__(self, host=None, port=None, user=None, passwd=None, db=None, charset='utf8'):
        """Open the connection and create the cursor used by every method.

        All arguments are optional. A setting that is not passed is read from
        the environment (``MYSQL_HOST``, ``MYSQL_PORT``, ``MYSQL_USER``,
        ``MYSQL_PASSWORD``, ``MYSQL_DB``). Host, port, user and database then
        fall back to 'localhost', 3306, 'root' and 'funding'. The password has
        no literal fallback so that no credential is stored in the notebook;
        export ``MYSQL_PASSWORD`` before starting the kernel.
        """
        env = os.environ
        self.conn = pymysql.connect(
            host=host if host is not None else env.get('MYSQL_HOST', 'localhost'),
            port=int(port if port is not None else env.get('MYSQL_PORT', 3306)),
            user=user if user is not None else env.get('MYSQL_USER', 'root'),
            passwd=passwd if passwd is not None else env.get('MYSQL_PASSWORD', ''),
            db=db if db is not None else env.get('MYSQL_DB', 'funding'),
            charset=charset)
        self.conn.autocommit(True)
        self.cursor = self.conn.cursor()

    def process_Query(self, sql, colnames):
        """Run ``sql`` and return all fetched rows as a DataFrame.

        Parameters
        ----------
        sql : str
            Statement handed unchanged to ``cursor.execute``.
        colnames : list of str
            Column labels for the result, in SELECT-list order.

        Returns
        -------
        pandas.DataFrame
            One row per fetched record. A query that matches nothing yields an
            empty frame that still carries ``colnames``.
        """
        self.cursor.execute(sql)
        rows = self.cursor.fetchall()
        # Passing the labels to the constructor (rather than assigning
        # ``.columns`` afterwards) keeps an empty result set from raising a
        # length-mismatch ValueError.
        return pd.DataFrame(list(rows), columns=colnames)

    def INSERT_Query(self, item):
        """Insert one project record into ``UKRI_Funded_ALL_Raw``.

        ``item`` is a mapping in which the twelve project fields hold
        one-element sequences (only element 0 is stored), ``abstract`` is
        stored as is, and ``resSub`` / ``resTopic`` are stored as their
        ``str()`` representation. Returns ``item`` unchanged.
        """
        sql = 'INSERT INTO UKRI_Funded_ALL_Raw (TITLE, ProRef, OrgName, sDate, eDate, institution, department, projType, PIFirstName, PISurname, Amount, url, Abstract, resSub, resTopic) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'
        # Keys listed in the same order as the first twelve table columns.
        scalar_keys = ['title', 'proRef', 'orgName', 'sDate', 'eDate', 'institution',
                       'department', 'projType', 'PIFirstName', 'PISurname', 'Amount', 'url']
        iteminfo = [item[key][0] for key in scalar_keys]
        # NOTE(review): str() turns a missing value (None) into the text
        # 'None' rather than SQL NULL; kept as is because existing rows were
        # written that way.
        iteminfo += [item["abstract"], str(item["resSub"]), str(item['resTopic'])]
        self.cursor.execute(sql, iteminfo)
        return item

    def close_Conn(self):
        """Close the cursor first, then the connection."""
        self.cursor.close()
        self.conn.close()

In [3]:
import string
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` removed.

    The two-argument ``translate`` call is tried first (Python 2 byte
    strings). Python 3's ``str.translate`` accepts a single table argument and
    raises ``TypeError`` for that call, in which case a deletion table is
    built with ``maketrans`` instead. Only ``TypeError`` is handled, so other
    errors (and Ctrl-C) are no longer swallowed.
    """
    try:  # python 2.x
        return text.translate(None, string.punctuation)
    except TypeError:  # python 3.x
        translator = text.maketrans('', '', string.punctuation)
        return text.translate(translator)

# Data Scraping

In [4]:
# Patterns for the parts of the GtR response that are stored.
_GTR_ABSTRACT_RE = re.compile("<gtr:abstractText>(.*?)</gtr:abstractText>", re.S)
_GTR_SUBJECT_RE = re.compile("<gtr:researchSubject>.*?<gtr:text>(.*?)</gtr:text>.*?</gtr:researchSubject>", re.S)
_GTR_TOPIC_RE = re.compile("<gtr:researchTopic>.*?<gtr:text>(.*?)</gtr:text>.*?</gtr:researchTopic>", re.S)
_GTR_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'


def _fetchProjectPage(ref, timeout, max_retries, retry_delay):
    """Download the GtR page for ``ref``, retrying after network errors."""
    attempt = 0
    while True:
        attempt += 1
        try:
            return requests.get(
                "https://gtr.ukri.org/projects?ref={}".format(ref),
                headers={'user-agent': _GTR_USER_AGENT},
                timeout=timeout)
        except requests.exceptions.RequestException:
            # Only transport-level failures are retried; programming errors
            # and KeyboardInterrupt propagate instead of looping forever.
            if max_retries is not None and attempt > max_retries:
                raise
            time.sleep(retry_delay)


def getAbstract(row, timeout=30, max_retries=None, retry_delay=1.0):
    """Fetch abstract, subjects and topics for one project and store the record.

    Parameters
    ----------
    row : pandas.DataFrame
        Converted with ``to_dict('list')``; element 0 of its ``proRef`` column
        is the project reference that is looked up, so a one-row frame is
        expected.
    timeout : float
        Seconds passed to ``requests.get`` so a stalled connection cannot
        hang the run.
    max_retries : int or None
        Number of retries after the first failed attempt. ``None`` (default)
        keeps retrying until the request succeeds.
    retry_delay : float
        Seconds to sleep between attempts.

    Returns
    -------
    dict
        The row as a dict of lists, extended with ``abstract`` (first match or
        None), ``resSub`` (list of matches or None) and ``resTopic`` (None,
        the string "Unclassified" when that is the first match, otherwise the
        list of matches). The same dict is inserted through ``MySQLPipline``.

    Raises
    ------
    requests.exceptions.RequestException
        Only when ``max_retries`` is set and every attempt failed.
    """
    dic = row.to_dict('list')
    ref = dic["proRef"][0]

    response = _fetchProjectPage(ref, timeout, max_retries, retry_delay)
    # NOTE(review): the HTTP status is not inspected, so an error page simply
    # produces no matches and the three extra fields end up as None.
    page = response.text

    abstracts = _GTR_ABSTRACT_RE.findall(page)
    dic["abstract"] = abstracts[0] if abstracts else None

    subjects = _GTR_SUBJECT_RE.findall(page)
    dic["resSub"] = subjects if subjects else None

    topics = _GTR_TOPIC_RE.findall(page)
    if not topics:
        resTopic = None
    elif topics[0] == "Unclassified":
        # A leading "Unclassified" collapses the whole list to that string.
        resTopic = topics[0]
    else:
        resTopic = topics
    dic["resTopic"] = resTopic

    pipeline = MySQLPipline()
    try:
        pipeline.INSERT_Query(dic)
    finally:
        # Release the connection even when the insert fails.
        pipeline.close_Conn()

    return dic

In [5]:
# Load every funded project with a positive award from FundedDataUKRI.
# The labels passed as `colnames` rename the SELECT list position by position.
UKRI = MySQLPipline()
sql = """
SELECT
    Title,
    ProjectReference,
    FundingOrgName,
    StartDate,
    EndDate,
    LeadROName,
    Department,
    ProjectCategory,
    PIFirstName,
    PISurname,
    AwardPounds,
    GTRProjectUrl
FROM
    FundedDataUKRI 
WHERE
    AwardPounds > 0;
"""
try:
    UKRIdata = UKRI.process_Query(sql, colnames = ["title", "proRef", "orgName", "sDate", "eDate", "institution", "department",
                                                     "projType", "PIFirstName", "PISurname", "Amount", "url"])
finally:
    # Close the connection even if the query raises.
    UKRI.close_Conn()

In [6]:
# Preview the query result; the rendered table below abbreviates the middle rows.
UKRIdata

Unnamed: 0,title,proRef,orgName,sDate,eDate,institution,department,projType,PIFirstName,PISurname,Amount,url
0,Construction Site Monitoring,750980,Innovate UK,01/10/2013,30/04/2014,Suave Uav Enterprises Ltd,,Unknown,,,5000.0,https://gtr.ukri.org:443/projects?ref=750980
1,Proton-driven plasma wakefield acceleration---...,ST/K002244/1,STFC,01/10/2012,30/09/2015,University College London,Physics and Astronomy,Research Grant,Matthew,Wing,26776.0,https://gtr.ukri.org:443/projects?ref=ST/K0022...
2,Mechanisms mediating axon outgrowth in the Dro...,BB/K002031/1,BBSRC,07/01/2013,06/10/2016,King's College London,Developmental Neurobiology,Research Grant,Guy Justin Clive,Tear,419504.0,https://gtr.ukri.org:443/projects?ref=BB/K0020...
3,"Ultrahigh resolution NMR: citius, altius, fortius",EP/N033949/1,EPSRC,01/08/2016,31/07/2019,University of Manchester,Chemistry,Research Grant,Gareth,Morris,520012.0,https://gtr.ukri.org:443/projects?ref=EP/N0339...
4,Project COMVIDIA: COrrelated Movement and Vide...,131825,Innovate UK,01/04/2015,31/03/2016,Iproov Limited,,Feasibility Studies,Andrew,Bud,108834.0,https://gtr.ukri.org:443/projects?ref=131825
...,...,...,...,...,...,...,...,...,...,...,...,...
90921,Bbsrc next generation bioprocessing studentshi...,BB/F016077/1,BBSRC,01/10/2008,30/09/2012,University College London,Biochemical Engineering,Training Grant,,,241620.0,https://gtr.ukri.org:443/projects?ref=BB/F0160...
90922,UK Involvement in LSST: Phase A,ST/N00258X/1,STFC,01/10/2016,31/03/2019,University College London,Physics and Astronomy,Research Grant,Hiranya,Peiris,147388.0,https://gtr.ukri.org:443/projects?ref=ST/N0025...
90923,Low-mass Support Structures for Silicon Detectors,ST/G008280/1,STFC,01/02/2009,31/03/2011,University of Liverpool,Physics,Research Grant,Timothy,Greenshaw,24141.0,https://gtr.ukri.org:443/projects?ref=ST/G0082...
90924,University of Ulster and Rapid International L...,510831,Innovate UK,01/04/2017,31/03/2019,University of Ulster,,Knowledge Transfer Partnership,,,80236.0,https://gtr.ukri.org:443/projects?ref=510831


In [7]:
# Fetch, parse and store every project, handing getAbstract one single-row
# frame at a time. `.iloc[[i]]` takes the i-th row by position, which avoids
# building a full-length boolean mask on every iteration.
for i in tqdm(range(len(UKRIdata))):
    getAbstract(UKRIdata.iloc[[i]])

100%|██████████| 90926/90926 [2:19:48<00:00, 10.84it/s]  
