# CleanTechTag 
## (Clean Technologies Tagging) - Probabilistic Model
---

### 1. Gathering the mitigation technology catalogue

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
categories = pd.read_excel('../data/mitigation-technology-catalogue.xls')
categories = categories.fillna(method='ffill')
categories.columns = ['sector','technology_group','technology','description','technology_term','source','priority']
categories.head()

Unnamed: 0,sector,technology_group,technology,description,technology_term,source,priority
0,Energy,Solar energy sources,solar PV,Also “solar photovoltaic”; technology of using...,"photovoltaic, PV, “solar cell”",https://www.ctc-n.org/technologies/solar-pv,1
1,Energy,Solar energy sources,concentrated solar power (CSP),Technology of producing electricity by concent...,"“concentrated solar power”, CSP, “concentrated...",https://setis.ec.europa.eu/technologies/concen...,1
2,Energy,Solar energy sources,solar heating,Technology of capturing the sun's radiation an...,“solar heating”,https://www.ctc-n.org/technologies/solar-heating,1
3,Energy,Solar energy sources,solar dryer,"Technology of drying substances, especially fo...",“solar dryer”,https://www.ctc-n.org/technologies/solar-dryer,0
4,Energy,Solar energy sources,solar water pump,Technology of powering electrical water pumps ...,“solar water pump”,https://www.ctc-n.org/technologies/solar-water...,0


In [3]:
matrix = []
for term in categories['technology_term']:
    row = [x.strip() for x in term.split(',')]
    row = [i.replace('“', '').replace('”', '') for i in row]
    matrix.append(row)
categories['technology_term'] = matrix

---

### 2. Reading up cordis H2020 projects data source
https://data.europa.eu/euodp/en/data/dataset/cordisH2020projects

*To replicate the results use the copy made*

In [None]:
cordish2020 = pd.read_excel('../data/cordis-h2020projects.xlsx')

*To get the latest data go to the source endpoint*

In [4]:
#cordish2020 = pd.read_csv('https://cordis.europa.eu/data/cordis-h2020projects.csv', sep=';',error_bad_lines=False)

In [5]:
cordish2020['totalCost'] = [float(str(i).replace(',', '.')) for i in cordish2020['totalCost']]
cordish2020['ecMaxContribution'] = [float(str(i).replace(',', '.')) for i in cordish2020['ecMaxContribution']]
cordish2020['objective'] = [str(i) for i in cordish2020['objective']]
cordish2020['startDate'] = cordish2020['startDate'].map(pd.Timestamp)
cordish2020['endDate'] = cordish2020['endDate'].map(pd.Timestamp)

In [6]:
cordish2020.columns

Index(['rcn', 'id', 'acronym', 'status', 'programme', 'topics',
       'frameworkProgramme', 'title', 'startDate', 'endDate', 'projectUrl',
       'objective', 'totalCost', 'ecMaxContribution', 'call', 'fundingScheme',
       'coordinator', 'coordinatorCountry', 'participants',
       'participantCountries', 'subjects'],
      dtype='object')

In [7]:
# Total of proyects
len(cordish2020)

27370

In [8]:
# unique list of technology term
technology_term = [y for x in categories['technology_term'] for y in x if y != '']
len(technology_term)

237

#### Technology terms example

In [9]:
all_technology_terms = [i for i in set([*list(categories['technology_group']),*list(technology_term)])]
all_technology_terms[:10]

['PV',
 'traffic management',
 'management of livestock',
 'recycling',
 'straw management',
 'conservation tillage',
 'alternate wetting and drying',
 'treatment of soil',
 'blast furnace slag',
 'Cleaner product']

---

### 3. Syntactic probabilistic text labeling
#### Using [SpaCy](http://spacy.io/) and [FuzzyWuzzy](https://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ ) string matching algorithm.

In [10]:
import spacy
import numpy as np
from spacy import displacy
from collections import Counter
import en_core_web_sm
from fuzzywuzzy import process, fuzz
import re
import textdistance as tx
import unicodedata
from textdistance.algorithms import vector_based
nlp = en_core_web_sm.load()

In [11]:
def clean(string):
    return re.sub(r'[-\s]+', '-',
                str(
                    re.sub(r'[^\w\s-]', '',
                        unicodedata.normalize('NFKD', string)
                    .strip()
                   )))

#### Get all the synonyms from the short descriptions

In [12]:
technologies = all_technology_terms
len(technologies)

249

In [13]:
def find_best_matching_tech(techs, doc):
    best_matches = []
    best1 = [("", 0)]
    best2 = [("", 0)]
    val2=""
    best_tech = []
    for val in techs:
        best_curs = process.extract(val, doc, limit=1, scorer=fuzz.ratio)
        terms = ""
        for cat in best_curs:
            terms = terms + cat[0] + "," 
        terms = terms[:-1]
        if len(best_curs)==0:
            avg = 0
        else:
            avg = sum(i for _, i in best_curs)/float(len(best_curs))
        best_cur = [(terms, avg)]
        best_matches.extend(best_cur)
        if best_cur[0][1] > best1[0][1]:
            best1 = best_cur
            best_tech = [val]
        elif best_cur[0][1] == best1[0][1]:
            best1.extend(best_cur)
            best_tech.append(val)
        else:
            if best_cur[0][1] > best2[0][1] and len(best1)<3:
                best2 = best_cur
                val2 = val
    if len(best1)<3:            
        best1.extend(best2)
        best_tech.append(val2)
    return best_matches, best_tech, best1

In [14]:
#process.extract(each tech term, each noun chunk in each objective)

In [15]:
cordish2020['category'] = pd.Series(np.random.randn(len(cordish2020)), index=cordish2020.index)
for idx in range(len(cordish2020['objective'])):
    text = cordish2020['objective'][idx]
    doc = nlp(text)
   # doc = " ".join([token.lemma_ for token in doc])
    #doc = nlp(doc)
    docs = list(map(str, doc.noun_chunks))
    #docs = [clean(str(txt)) for txt in docs]
    _, best_tech, best_match = find_best_matching_tech(technologies, docs)
    #print(best_tech, best_match)
    if round(best_match[0][1]) > 77:
        cordish2020['category'][idx] = " ".join(best_tech)
    else:
        cordish2020['category'][idx] = 'None'

In [16]:
#category is the column to store the best matches with technology

---

### 4. Matches analysis

In [18]:
#cordish2020['category'].value_counts()['None']
cordish2020matches2 = cordish2020[cordish2020['category'] != "None" ]

#### Percentage of Matches

In [19]:
print(100*len(cordish2020matches2)/len(cordish2020))

0.31114358786993057


In [20]:
cordish2020matches2.id[:20]

2     801338
6     875629
9     766466
18    866510
19    863664
25    854796
27    871869
30    871518
32    870245
34    885214
35    886231
38    882623
39    887865
40    889249
46    871403
47    851374
50    888926
53    888396
54    889000
56    889258
Name: id, dtype: int64

In [23]:
cordish2020matches2.head(10)

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects,category
2,216332,801338,VES4US,SIGNED,H2020-EU.1.2.1.,FETOPEN-01-2016-2017,H2020,Extracellular vesicles from a natural source f...,2018-09-01,2021-08-31,...,2946303.75,2946303.75,H2020-FETOPEN-1-2016-2017,RIA,CONSIGLIO NAZIONALE DELLE RICERCHE,IT,"ZABALA INNOVATION CONSULTING, S.A.;INSTITUTE O...",ES;IE;CH;DE;SI,,LNG RAPS
6,225983,875629,NAIMA,SIGNED,H2020-EU.2.1.3.;H2020-EU.2.1.2.,LC-BAT-2-2019,H2020,NA ION MATERIALS AS ESSENTIAL COMPONENTS TO MA...,2019-12-01,2022-11-30,...,7999897.03,7999897.03,H2020-LC-BAT-2019,RIA,TIAMAT,FR,BIOKOL SVERIGE AB;VLAAMSE INSTELLING VOOR TECH...,SE;BE;BG;ES;FR;NL;DE;SI,,Other renewable energy sources tidal energy
9,211662,766466,INDEX,SIGNED,H2020-EU.1.2.1.,FETOPEN-01-2016-2017,H2020,Integrated nanoparticle isolation and detectio...,2017-10-01,2020-09-30,...,2983525.0,2982275.0,H2020-FETOPEN-1-2016-2017,RIA,CONSIGLIO NAZIONALE DELLE RICERCHE,IT,FLUIGENT SA;THE TRUSTEES OF BOSTON UNIVERSITY;...,FR;US;DK;EE,,CHP smelt reduction
18,227213,866510,TRANSLATIONAL,SIGNED,H2020-EU.1.1.,ERC-2019-COG,H2020,A new translational strategy for tailored trea...,2020-10-01,2025-09-30,...,1957230.0,1957230.0,ERC-2019-COG,ERC-COG,GOETEBORGS UNIVERSITET,SE,,,,soil management grid management Waste management
19,227212,863664,ExpoBiome,SIGNED,H2020-EU.1.1.,ERC-2019-COG,H2020,Deciphering the impact of exposures from the g...,2020-05-01,2025-04-30,...,1998620.0,1998620.0,ERC-2019-COG,ERC-COG,UNIVERSITE DU LUXEMBOURG,LU,,,,CHP CAES
25,227210,854796,SURE,SIGNED,H2020-EU.1.1.,ERC-2019-SyG,H2020,3-D Super resolution Ultrasound Real time imag...,2020-03-01,2026-02-28,...,9980899.0,9980899.0,ERC-2019-SyG,ERC-SyG,DANMARKS TEKNISKE UNIVERSITET,DK,REGION HOVEDSTADEN;KOBENHAVNS UNIVERSITET,DK,,soil treatment CAES
27,227218,871869,B-HUB FOR EUROPE,SIGNED,H2020-EU.2.1.1.,ICT-33-2019,H2020,Blockchain HUB FOR EUROPEan startups accelerat...,2020-01-01,2021-12-31,...,1898375.0,1642995.13,H2020-ICT-2019-2,IA,INNOVA SRL,IT,ASOCIATA SPHERIK;SYSTEM@TIC PARIS REGION;VSI S...,RO;FR;LT;DE,,CAES virtual collaboration platform
30,226886,871518,COLLABS,SIGNED,H2020-EU.2.1.1.,ICT-08-2019,H2020,A COmprehensive cyber-intelligence framework f...,2020-01-01,2022-12-31,...,5999643.75,5999643.75,H2020-ICT-2019-2,RIA,COMMISSARIAT A L ENERGIE ATOMIQUE ET AUX ENERG...,FR,RENAULT SAS;CHAROKOPEIO PANEPISTIMIO;INFORMATI...,FR;EL;RS;DE;NL;IT;CH,,robotics manufacturing System innovation
32,226246,870245,GEOCEP,SIGNED,H2020-EU.1.3.3.,MSCA-RISE-2019,H2020,Global Excellence in Modeling Climate and Ener...,2020-10-01,2024-09-30,...,3036000.0,2254000.0,H2020-MSCA-RISE-2019,MSCA-RISE,UNIVERZITA KARLOVA,CZ,LONDON SCHOOL OF ECONOMICS AND POLITICAL SCIEN...,UK;CH;IT;FR,,renewable energy efficient energy transmission
34,226354,885214,XpeFundus,SIGNED,H2020-EU.3.;H2020-EU.2.3.;H2020-EU.2.1.,EIC-SMEInst-2018-2020,H2020,"A smart hyperspectral system for early, non-in...",2020-01-01,2020-04-30,...,71429.0,50000.0,H2020-SMEInst-2018-2020-1,SME-1,"XPECTRALTEK, LDA",PT,,,,agrosylviculture soil treatment


In [25]:
cordish2020matches2.columns

Index(['rcn', 'id', 'acronym', 'status', 'programme', 'topics',
       'frameworkProgramme', 'title', 'startDate', 'endDate', 'projectUrl',
       'objective', 'totalCost', 'ecMaxContribution', 'call', 'fundingScheme',
       'coordinator', 'coordinatorCountry', 'participants',
       'participantCountries', 'subjects', 'category'],
      dtype='object')

In [27]:
cordish2020matches2[['id','acronym','title',
                     'objective','startDate',
                     'endDate','ecMaxContribution','category']]\
    .to_csv('../data/cleantechtag_probabilistic_results.csv')

#### Noun chunks split from project's objective example

In [28]:
cordish2020.loc[14]['objective']

'LASERLAB-EUROPE is the European consortium of major national laser research infrastructures, covering advanced laser science and applications in most domains of research and technology, with particular emphasis on areas with high industrial and social impact, such as bio- and nanophotonics, material analyses, biology and medicine. \nRecently the field of advanced lasers has experienced remarkable advances and breakthroughs in laser technologies and novel applications. Laser technology is a key innovation driver for highly varied applications and products in many areas of modern society, thereby substantially contributing to economic growth. Through its strategic approach, LASERLAB-EUROPE aims to strengthen Europe’s leading position and competitiveness in this key area. It facilitates the coordination of laser research activities within the European Research Area by integrating major facilities in most European member states with a long-term perspective and providing concerted and effi

In [32]:
text = cordish2020.loc[18]['objective']
doc = nlp(text)
docs = list(map(str, doc.noun_chunks))
docs[:20]

['Type',
 '2 diabetes',
 'T2D',
 'an escalating health problem',
 'enormous proportions',
 'Current treatment strategies',
 'disease progression',
 'the devastating complications',
 'Clinical guidelines',
 'the need',
 'personalized treatment',
 'trial-and-error fashion',
 'We',
 'T2D patients',
 'four clusters',
 'different characteristics',
 'a major step',
 'the high variability',
 'the pathophysiology',
 'us']

In [33]:
_, best_tech, best_match = find_best_matching_tech(technologies, docs)
print(best_tech,best_match)

['soil management', 'grid management', 'Waste management'] [('management', 80.0), ('management', 80.0), ('management', 77.0)]
