## Clean Technologies - Probabilistic

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [4]:
categories = pd.read_excel('../data/categories.xls')
categories = categories.fillna(method='ffill')
categories.columns = ['sector','technology_group','technology','description','technology_term','source','priority']
categories.head()

Unnamed: 0,sector,technology_group,technology,description,technology_term,source,priority
0,Energy,Solar energy sources,solar PV,Also “solar photovoltaic”; technology of using...,"photovoltaic, PV, “solar cell”",https://www.ctc-n.org/technologies/solar-pv,1
1,Energy,Solar energy sources,concentrated solar power (CSP),Technology of producing electricity by concent...,"“concentrated solar power”, CSP, “concentrated...",https://setis.ec.europa.eu/technologies/concen...,1
2,Energy,Solar energy sources,solar heating,Technology of capturing the sun's radiation an...,“solar heating”,https://www.ctc-n.org/technologies/solar-heating,0
3,Energy,Solar energy sources,solar dryer,"Technology of drying substances, especially fo...",“solar dryer”,https://www.ctc-n.org/technologies/solar-dryer,0
4,Energy,Solar energy sources,solar water pump,Technology of powering electrical water pumps ...,“solar water pump”,https://www.ctc-n.org/technologies/solar-water...,0


In [5]:
matrix = []
for term in categories['technology_term']:
    row = [x.strip() for x in term.split(',')]
    row = [i.replace('“', '').replace('”', '') for i in row]
    matrix.append(row)
categories['technology_term'] = matrix

---

In [6]:
#https://data.europa.eu/euodp/en/data/dataset/cordisH2020projects

In [7]:
cordish2020 = pd.read_excel('../data/cordis-h2020projects.xlsx')
#cordish2020 = pd.read_csv('../data/cordis-h2020projects.csv', sep=";")

In [8]:
cordish2020['totalCost'] = [float(str(i).replace(',', '.')) for i in cordish2020['totalCost']]
cordish2020['ecMaxContribution'] = [float(str(i).replace(',', '.')) for i in cordish2020['ecMaxContribution']]
cordish2020['startDate'] = cordish2020['startDate'].map(pd.Timestamp)
cordish2020['endDate'] = cordish2020['endDate'].map(pd.Timestamp)

In [9]:
cordish2020.columns

Index(['rcn', 'id', 'acronym', 'status', 'programme', 'topics',
       'frameworkProgramme', 'title', 'startDate', 'endDate', 'projectUrl',
       'objective', 'totalCost', 'ecMaxContribution', 'call', 'fundingScheme',
       'coordinator', 'coordinatorCountry', 'participants',
       'participantCountries', 'subjects'],
      dtype='object')

In [10]:
# Total of proyects
len(cordish2020)

24554

In [11]:
# unique list of technology term
technology_term = [y for x in categories['technology_term'] for y in x if y != '']
len(technology_term)

223

In [12]:
all_technology_terms = [i for i in set([*list(categories['technology_group']),*list(technology_term)])]

In [13]:
len(all_technology_terms)

235

In [14]:
all_technology_terms[:10]

['parabolic trough',
 'carbon capture and storage',
 'biorefinery design',
 'conservation tillage',
 'meat alternative',
 'solar water pump',
 'demand side management',
 'Improved durability',
 'renewable energy',
 'Storage']

---
### Syntactic probabilistic classification using spacy and fuzzywuzzy

In [15]:
import spacy
import numpy as np
from spacy import displacy
from collections import Counter
import en_core_web_sm
from fuzzywuzzy import process, fuzz
import re
nlp = en_core_web_sm.load()
import textdistance as tx
import unicodedata
from textdistance.algorithms import vector_based

In [16]:
def clean(string):
    return re.sub(r'[-\s]+', '-',
                str(
                    re.sub(r'[^\w\s-]', '',
                        unicodedata.normalize('NFKD', string)
                    .strip()
                   )))

Objective example

---

In [22]:
#cordish2020.loc[13]['objective']
cordish2020.loc[9631]['objective'][]

'Offshore wind has long been identified as one of the most promising energy forms to improve the penetration of renewables in the European energy mix. Since most of offshore wind resources is available over deep waters at a considerable distance from the shore, it is inevitable that the campaign of the offshore wind exploitation would move from shallow waters to deep waters. As the conventional bottom-fixed offshore wind turbine is no longer economically viable over deep waters (>50m), the floating offshore wind turbine (FOWT) seems to be an appealing alternative to harvest the ampler deep-water wind. FOWTs are, however, threaten by the hostile deep offshore environment, which would induce unacceptable tilt motions and drastic vibrations of the floating system. The undesirable loadings on the blades, tower, floating foundations and other components, results in mechanical failures and electrical faults of FOWTs, both of which could lead to operation interruptions and cause disastrous ec

In [30]:
text = cordish2020['objective'][9631]
doc = nlp(text)
docs = list(map(str, doc.noun_chunks))
docs[:20]

['Offshore wind',
 'the most promising energy forms',
 'the penetration',
 'renewables',
 'the European energy mix',
 'offshore wind resources',
 'deep waters',
 'a considerable distance',
 'the shore',
 'it',
 'the campaign',
 'the offshore wind exploitation',
 'shallow waters',
 'deep waters',
 'the conventional bottom-fixed offshore wind turbine',
 'deep waters',
 'the floating offshore wind turbine',
 'FOWT',
 'an appealing alternative',
 'the ampler deep-water wind']

In [33]:
#find_best_matching_tech(technologies, docs)

In [34]:
#technologies

---

**Get all the synonyms from the short descriptions**

In [27]:
technologies = all_technology_terms

In [61]:
len(technologies)

235

In [26]:
def find_best_matching_tech(techs, doc):
    best_matches = []
    best1 = [("", 0)]
    best2 = [("", 0)]
    val2=""
    best_tech = []
    for val in techs:
        best_curs = process.extract(val, doc, limit=1, scorer=fuzz.ratio)
        terms = ""
        for cat in best_curs:
            terms = terms + cat[0] + "," 
        terms = terms[:-1]
        if len(best_curs)==0:
            avg = 0
        else:
            avg = sum(i for _, i in best_curs)/float(len(best_curs))
        best_cur = [(terms, avg)]
        best_matches.extend(best_cur)
        if best_cur[0][1] > best1[0][1]:
            best1 = best_cur
            best_tech = [val]
        elif best_cur[0][1] == best1[0][1]:
            best1.extend(best_cur)
            best_tech.append(val)
        else:
            if best_cur[0][1] > best2[0][1] and len(best1)<3:
                best2 = best_cur
                val2 = val
    if len(best1)<3:            
        best1.extend(best2)
        best_tech.append(val2)
    return best_matches, best_tech, best1

In [63]:
#process.extract(each tech term, each noun chunk in each objective)

In [64]:
cordish2020['category'] = pd.Series(np.random.randn(len(cordish2020)), index=cordish2020.index)
for idx in range(len(cordish2020['objective'])):
    text = cordish2020['objective'][idx]
    doc = nlp(text)
   # doc = " ".join([token.lemma_ for token in doc])
    #doc = nlp(doc)
    docs = list(map(str, doc.noun_chunks))
    #docs = [clean(str(txt)) for txt in docs]
    _, best_tech, best_match = find_best_matching_tech(technologies, docs)
    #print(best_tech, best_match)
    if round(best_match[0][1]) > 77:
        cordish2020['category'][idx] = " ".join(best_tech)
    else:
        cordish2020['category'][idx] = 'None'

In [65]:
#category is the column to store the best matches with technology

In [66]:
cordish2020['category'].value_counts()['None']

17024

In [67]:
cordish2020matches2 = cordish2020[cordish2020['category'] != "None" ]

In [68]:
print(len(cordish2020matches2)/len(cordish2020))

0.3066710108332655


In [69]:
cordish2020matches2.id[:20]

0     837750
5     835398
6     838845
9     827561
10    823782
12    825435
16    835541
17    833088
22    835051
27    815279
29    810812
32    827826
33    812602
37    822897
40    828666
41    827565
43    811592
45    847641
48    817240
49    826588
Name: id, dtype: int64

In [70]:
cordish2020matches2.head(20)

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,participantCountries,subjects,matches_group,count_matches_group,count_unique_matches_group,matches_technology,count_matches_technology,count_unique_matches_technology,count_matches,category
0,222681,837750,FARMYNG,SIGNED,H2020-EU.2.1.4.;H2020-EU.3.2.6.,BBI.2018.SO3.F2,H2020,FlAgship demonstration of industrial scale pro...,2019-06-01,2022-06-30,...,FR;NO;BE;ES;PL;CH;DE;NL,,,0,0,,0,0,0,agrosylviculture agro-sylviculture
5,221673,835398,3D-FOGROD,SIGNED,H2020-EU.1.3.2.,MSCA-IF-2018,H2020,Understanding forest growth dynamics using nov...,2019-10-01,2021-09-30,...,,,,0,0,[forest management],1,1,1,reforestation distributed production
6,222088,838845,SPIR,SIGNED,H2020-EU.1.3.2.,MSCA-IF-2018,H2020,Spasers in the infrared range,2020-03-01,2022-02-28,...,,,,0,0,,0,0,0,nanomaterial reforestation
9,217264,827561,UFine,CLOSED,H2020-EU.3.;H2020-EU.2.3.;H2020-EU.2.1.,EIC-SMEInst-2018-2020,H2020,An innovative ultra-fine bubble engineered noz...,2018-09-01,2018-12-31,...,,,,0,0,,0,0,0,smelt reduction electric battery
10,220938,823782,SSHOC,SIGNED,H2020-EU.1.4.1.1.,INFRAEOSC-04-2018,H2020,Social Sciences & Humanities Open Cloud,2019-01-01,2022-04-30,...,UK;NL;FR;EL;AT;IT;DE,,,0,0,,0,0,0,System innovation Energy use innovation
12,221878,825435,DECOMPACT,SIGNED,H2020-EU.1.1.,ERC-2018-PoC,H2020,Development of Collagenase Polymeric nanocapsu...,2019-05-01,2020-10-31,...,,,,0,0,,0,0,0,soil treatment pump system
16,221633,835541,MOVES,SIGNED,H2020-EU.1.3.2.,MSCA-IF-2018,H2020,MOnitoring VEgetation status and functioning a...,2019-10-01,2021-09-30,...,,,,0,0,[forest management],1,1,1,forest management Waste management
17,222602,833088,InfraStress,SIGNED,H2020-EU.3.7.4.;H2020-EU.3.7.2.,SU-INFRA01-2018-2019-2020,H2020,Improving resilience of sensitive industrial p...,2019-06-01,2021-05-31,...,SI;CY;PT;EL;DE;IE;IT;FR;IL;PL;BE,,,0,0,,0,0,0,SPS Wind
22,221963,835051,NucSat,SIGNED,H2020-EU.1.3.2.,MSCA-IF-2018,H2020,Satellites and nuclear information. Production...,2019-07-01,2021-06-30,...,,,,0,0,,0,0,0,soil management grid management waste management
27,218529,815279,5G-VINNI,SIGNED,H2020-EU.2.1.1.,ICT-17-2018,H2020,5G Verticals INNovation Infrastructure,2018-07-01,2021-06-30,...,LU;DE;NO;EL;PT;IE;ES;DK;FI;UK;IT,,,0,0,,0,0,0,soil management grid management waste management


In [None]:
text = cordish2020['objective'][13]
doc = nlp(text)
#doc = " ".join([token.lemma_ for token in doc])
#doc = nlp(doc)
docs = list(map(str, doc.noun_chunks))
_, best_tech, best_match = find_best_matching_tech(technologies, docs)
print(best_tech,best_match)

In [72]:
cordish2020matches2[['id','acronym','title','objective','startDate','endDate','ecMaxContribution','matches_group','matches_technology','count_matches','category']].to_csv('../data/tech2.csv')