# Funded Data Extraction for ERC

In [1]:
import pymysql
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import requests
import re
from tqdm import tqdm

In [2]:
class MySQLPipline(object):
    """Thin wrapper around a pymysql connection and a single cursor.

    The connection is opened in ``__init__`` with autocommit switched on and
    must be released with ``close_Conn``. The object can also be used as a
    context manager (``with MySQLPipline() as db: ...``), in which case the
    connection is closed automatically when the block exits.
    """

    def __init__(self, host='localhost', port=3306, user='root', passwd=None,
                 db='funding', charset='utf8'):
        """Open the connection and create the cursor.

        Every argument is optional; calling ``MySQLPipline()`` with no
        arguments connects with the same settings as before.

        Args:
            host: MySQL server host name.
            port: MySQL server port.
            user: MySQL user name.
            passwd: Password. When ``None`` the ``MYSQL_PASSWORD`` environment
                variable is used, falling back to the legacy built-in value.
            db: Name of the database to select.
            charset: Connection character set.
        """
        if passwd is None:
            # FIXME(review): the literal fallback is kept only so existing
            # no-argument callers keep working. Credentials should not live in
            # source control: set MYSQL_PASSWORD, rotate this password and
            # then delete the fallback.
            passwd = os.environ.get('MYSQL_PASSWORD', 'Kyle9975')
        self.conn = pymysql.connect(
            host=host, port=port, user=user, passwd=passwd, db=db, charset=charset)
        self.conn.autocommit(True)
        self.cursor = self.conn.cursor()

    def __enter__(self):
        """Return the pipeline itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the cursor and connection; exceptions are not suppressed."""
        self.close_Conn()
        return False

    def process_Query(self, sql, colnames):
        """Run a query and return all fetched rows as a DataFrame.

        Args:
            sql: SQL statement passed to the cursor unchanged.
            colnames: Column labels for the resulting DataFrame, one per
                selected column.

        Returns:
            pandas.DataFrame holding the fetched rows, labelled with
            ``colnames``. A query that matches no rows yields an empty frame
            that still carries those column labels.
        """
        self.cursor.execute(sql)
        rows = self.cursor.fetchall()
        # Give the labels to the constructor instead of assigning them
        # afterwards: an empty result would otherwise build a zero-column
        # frame and the later ``columns = colnames`` assignment would raise a
        # length-mismatch ValueError.
        return pd.DataFrame(list(rows), columns=colnames)

    def INSERT_Query(self, item):
        """Insert one record into ``UKRI_Funded_ALL_Raw`` and return ``item``.

        Values are passed to the driver as query parameters rather than being
        formatted into the SQL text.

        Args:
            item: Mapping with the keys read below. Most fields are indexed
                with ``[0]``; ``abstract`` is used whole and ``resSub`` /
                ``resTopic`` are converted with ``str``.
                NOTE(review): confirm that ``abstract`` is intentionally not
                indexed like the other fields.

        Returns:
            The same ``item`` that was passed in.
        """
        sql = 'INSERT INTO UKRI_Funded_ALL_Raw (TITLE, ProRef, OrgName, sDate, eDate, institution, department, projType, PIFirstName, PISurname, Amount, url, Abstract, resSub, resTopic) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'
        iteminfo = [item['title'][0], item['proRef'][0], item['orgName'][0], item['sDate'][0], item['eDate'][0], item['institution'][0], item['department'][0], item['projType'][0], item["PIFirstName"][0], item["PISurname"][0], item["Amount"][0], item["url"][0], item["abstract"], str(item["resSub"]), str(item['resTopic'])]
        self.cursor.execute(sql, iteminfo)
        return item

    def close_Conn(self):
        """Close the cursor first, then the underlying connection."""
        self.cursor.close()
        self.conn.close()

In [3]:
import string
def remove_punctuation(text):
    """Strip every ``string.punctuation`` character from ``text``.

    Args:
        text: The string to clean. Non-string values (for example ``None``
            or a NaN standing in for a missing abstract) are returned
            unchanged so that missing data can be filtered out later instead
            of crashing here.

    Returns:
        ``text`` without punctuation characters, or the original value when
        it is not a string.
    """
    if not isinstance(text, str):
        return text
    try:  # python 2.x: str.translate(table, deletechars)
        return text.translate(None, string.punctuation)
    except TypeError:  # python 3.x: translate() takes a single mapping table
        # Catch only TypeError (the wrong-arity call above) so unrelated
        # errors are no longer swallowed by a bare ``except``.
        return text.translate(str.maketrans('', '', string.punctuation))

# Data Loading and Cleaning

In [4]:
# Load the ERC project records from the FundedDataERC table into a DataFrame.
ERC = MySQLPipline()
sql = """
SELECT
	ProjectTitle,
	`Year`,
	ProjectAcronym,
	ProjectBudget,
	PI,
	Topic,
	Country,
	`Grant type`,
	Abstract 
FROM
	FundedDataERC
"""
try:
    # The labels are given in the same order as the columns in the SELECT.
    ERCdata = ERC.process_Query(sql, colnames = ["projTitle", "year", "projAcronym", "projBudget", 
                                          "PI", "Topic", "Country", "grantType", "Abstract"])
finally:
    # Release the cursor and connection even when the query raises.
    ERC.close_Conn()

In [5]:
# Display the loaded frame as the cell output (the middle rows are elided in the rendering).
ERCdata

Unnamed: 0,projTitle,year,projAcronym,projBudget,PI,Topic,Country,grantType,Abstract
0,Bioinspired Composites Strategies for Saving E...,Oct-19,BioCom4SavEn,1694375.0,STACHEWICZ Urszula,PE8 - Products & processes engineering,PL,Starting grants,Saving energy together with energy harvesting ...
1,Pragmatics of Multiwinner Voting: Algorithms a...,Feb-20,PRAGMA,1386290.0,FALISZEWSKI Piotr,PE6 - Computer science & informatics,PL,Consolidator grants,This proposal is in the area of computational ...
2,Silicon-Integrated Graphene Photodetectors for...,Apr-18,GRAPH-IC,149750.0,LEMME Max Christian,Proof of Concept,DE,Proof of concept,The goal of the Graph-IC proof of concept prop...
3,Aneurysmal Arterial Mechanics: Into the Structure,Mar-14,AArteMIS,1499783.0,Badel Pierre Joseph,PE8 - Products & processes engineering,FR,Starting grants,"The rupture of an Aortic Aneurysm (AA), which ..."
4,Localization in biomechanics and mechanobiolog...,May-14,BIOLOCHANICS,1999396.0,Avril Stphane Henri Anatole,PE8 - Products & processes engineering,FR,Consolidator grants,"Rupture of Aortic Aneurysms (AA), which kills ..."
...,...,...,...,...,...,...,...,...,...
12389,Functional Blood Analysis for Clinical Applica...,Apr-17,BASIC,150000.0,Guck Jochen,Proof of Concept,DE,Proof of concept,We want to prepare the commercialisation of a ...
12390,"Defect Engineering, Advanced Modelling and Cha...",Oct-18,OptEIon,1980735.0,TRESS Wolfgang,PE4 - Physical & analytical chemical science,CH,Starting grants,Defects in semiconductor materials commonly de...
12391,Social and ethical aspects of digital identiti...,Apr-07,DigIDeas,1833000.0,"Ploeg, Van Der Irma","SH2 - Institutions, values, environment & space",NL,Starting grants,Digital identity management concerns the contr...
12392,Voice Emotion detection by Appraisal Inference,Feb-15,VocEmoApI,149937.5,Scherer Klaus Rainer,Proof of Concept,DE,Proof of concept,The automated sensing of human emotions has ga...


In [7]:
# Keep only the columns used by the text cleaning. `.copy()` makes `dat` an
# independent frame, so the column assignments made to it later never act on
# (or warn about) a slice of `ERCdata`.
dat = ERCdata[["projTitle", "Topic", "Abstract"]].copy()

In [10]:
# Turn off pandas' chained-assignment warning for the column assignments on `dat`.
pd.options.mode.chained_assignment = None
# na_action="ignore" leaves missing abstracts as they are instead of handing
# them to remove_punctuation, which expects a string; rows with a missing
# abstract are dropped in a later cell.
dat["Abstract"] = dat["Abstract"].map(remove_punctuation, na_action="ignore")

In [13]:
# Keep only the rows whose abstract is present.
dat = dat[dat["Abstract"].notna()]

In [17]:
def noDigits_check(df, text_colnumber):
    """Report whether a text column is free of whitespace-delimited numbers.

    Args:
        df: DataFrame to inspect.
        text_colnumber: Positional index of the column holding the text.

    Returns:
        True when no cell in that column contains a run of digits with
        whitespace on both sides (also True for a frame with no rows),
        False otherwise.
    """
    standalone_number = re.compile(r'\s+[0-9]+\s+')
    # Every row is evaluated before the flags are combined.
    row_is_clean = [
        standalone_number.search(df.iat[row, text_colnumber]) is None
        for row in range(df.shape[0])
    ]
    return all(row_is_clean)

In [22]:
# Repeat the clean-up until no whitespace-delimited number is left. A single
# pass is not always enough: re.sub replaces non-overlapping matches and the
# pattern consumes the whitespace on both sides of a number, so in "a 1 2 b"
# the match " 1 " uses up the space that " 2 " would need, and the "2" is
# only caught on the next pass.
while True:
    # Replace each run of digits that has whitespace on both sides with one space.
    dat['Abstract'] = dat.Abstract.apply(lambda s: re.sub(r'\s+[0-9]+\s+', " ", s))
    # Trim both ends, then collapse any run of whitespace into a single space.
    dat['Abstract'] = dat.Abstract.apply(lambda s: re.sub(r'\s{2,}', " ", s.strip()))
    # Column position 2 is the text column to check (Abstract, given the
    # three-column selection made when `dat` was created).
    no_digits = noDigits_check(dat, 2)
    print(no_digits)
    if no_digits:
        break

True


In [29]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
dat.to_csv("../Data/ERC_Cleaned.csv", index=False)